
Commit f98b050

Dump discovered HTML and JSON out to files

Parent: bcbb88b

File tree: 1 file changed (+73, -17 lines)

update_tests.py

Lines changed: 73 additions & 17 deletions
@@ -1,7 +1,10 @@
 #!/usr/bin/env python
 import logging
 import yaml
+import json
 import sys
+import os
+import datetime
 from navigation import SolusSession

 def str_unless_none(obj):
@@ -32,15 +35,40 @@ def get_filter(obj):
     else:
         return tuple(str(obj))

+def buildpath(path, new_obj):
+    return "{}_{}".format(path, new_obj.replace(" ", "-"))
+
+def mkdir(path):
+    try:
+        os.makedirs(path, exist_ok=True)
+    except TypeError:
+        # Python 2 - no 'exist_ok'
+        try:
+            os.makedirs(path)
+        except OSError as e:
+            # Worry about this code breaking if it becomes a problem
+            pass
+
+def json_dumper(obj):
+    """Deal with dumping datetimes to JSON"""
+    if isinstance(obj, (datetime.datetime, datetime.time, datetime.date)):
+        return obj.isoformat()
+    else:
+        raise TypeError('Object of type %s with value of %s is not JSON serializable' % (type(obj), repr(obj)))
+
 class TestUpdater(object):
     """Dump HTML and the scraped data"""

-    def __init__(self, config_file, user, passwd):
+    def __init__(self, config_file, output_dir, user, passwd):
         """Initialize the session to grab the data with"""

         # Load the config
         self.load_config(config_file)

+        # Create the folder for output
+        self.output_dir = output_dir
+        mkdir(output_dir)
+
         # Initialize the session
         try:
             session = SolusSession(user, passwd, souplib="lxml", testing_mode=True)
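The json_dumper helper added above is intended as the default= hook for json.dump, so datetime values in the scraped data come out as ISO 8601 strings instead of raising a TypeError. A minimal standalone sketch of that behaviour (the sample record is made up for illustration and is not data from the scraper):

    import datetime
    import json


    def json_dumper(obj):
        """Serialize dates and times as ISO 8601 strings (same idea as the helper above)."""
        if isinstance(obj, (datetime.datetime, datetime.time, datetime.date)):
            return obj.isoformat()
        raise TypeError("Object of type %s is not JSON serializable" % type(obj))


    # Hypothetical scraped record containing a datetime
    record = {"title": "Example Course", "scraped_at": datetime.datetime(2014, 7, 1, 12, 30)}

    # json.dumps calls json_dumper for anything it can't serialize on its own
    print(json.dumps(record, default=json_dumper, sort_keys=True, indent=2, separators=(',', ': ')))
    # {
    #   "scraped_at": "2014-07-01T12:30:00",
    #   "title": "Example Course"
    # }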
@@ -60,6 +88,16 @@ def load_config(self, config_file):
             logging.critical("Couldn't load config file '{}'".format(config_file))
             raise

+    def data_dump(self, path, data=None):
+        """Dump the current HTML and provided data into files"""
+        basename = os.path.join(self.output_dir, path)
+
+        with open(basename + ".html", 'wb') as f:
+            f.write(self.session.parser.get_raw_html().encode("utf-8"))
+        if data is not None:
+            with open(basename + ".json", 'w') as f:
+                json.dump(data, f, default=json_dumper, sort_keys=True, indent=2, separators=(',', ': '))
+
     def start(self):
         """Starts updating the local data"""
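To get a feel for what data_dump leaves on disk, here is a rough self-contained sketch of a single dump, assuming output_dir is "tests/out" and a hypothetical path of "A" (the raw HTML and the parsed subject list are stand-ins for what self.session.parser would return):

    import json
    import os

    output_dir = "tests/out"
    path = "A"                                   # hypothetical: the alphanum "A" page, which lists subjects
    basename = os.path.join(output_dir, path)

    os.makedirs(output_dir, exist_ok=True)       # Python 3 spelling of the mkdir() helper

    # Stand-ins for self.session.parser.get_raw_html() and the parsed subject list
    raw_html = u"<html><body>SOLUS page snapshot</body></html>"
    parsed_subjects = [{"abbreviation": "ANAT", "title": "Anatomy"}]

    with open(basename + ".html", "wb") as f:    # -> tests/out/A.html
        f.write(raw_html.encode("utf-8"))
    with open(basename + ".json", "w") as f:     # -> tests/out/A.json
        json.dump(parsed_subjects, f, sort_keys=True, indent=2, separators=(',', ': '))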

@@ -69,27 +107,30 @@ def start(self):
     def scrape_alphanums(self):
         """Scrape alphanums"""

-        all_alphanums = list(self.session.parser.all_alphanums(filter_=get_filter(self.config)))
+        all_alphanums = list(self.session.parser.all_alphanums())

         for alphanum, subjects in iterkeyvalue(self.config):

            if alphanum not in all_alphanums:
                 logging.warning("Couldn't find alphanum {} specified in config file".format(alphanum))
                 continue
+
             self.session.select_alphanum(alphanum)

             logging.info("Alphanum: {}".format(alphanum))

-            self.scrape_subjects(subjects)
+            self.scrape_subjects(subjects, alphanum)

-    def scrape_subjects(self, subjects):
+    def scrape_subjects(self, subjects, path):
         """Scrape subjects"""

         # Get a list of all subjects to iterate over
-        parsed_subjects = self.session.parser.all_subjects(filter_=get_filter(subjects))
+        parsed_subjects = self.session.parser.all_subjects()
         # Index by abbreviation
         all_subjects = {x["abbreviation"]: x for x in parsed_subjects}

+        self.data_dump(path, parsed_subjects)
+
         # Iterate over all subjects
         for subject, courses in iterkeyvalue(subjects):
@@ -102,17 +143,19 @@ def scrape_subjects(self, subjects):
             logging.info(u"--Subject: {abbreviation} - {title}".format(**curr_subject))

             self.session.dropdown_subject(curr_subject["_unique"])
-            self.scrape_courses(courses)
+            self.scrape_courses(courses, buildpath(path, subject))
             self.session.rollup_subject(curr_subject["_unique"])

-    def scrape_courses(self, courses):
+    def scrape_courses(self, courses, path):
         """Scrape courses"""

         # Get a list of all courses to iterate over
-        parsed_courses = self.session.parser.all_courses(filter_=get_filter(courses))
+        parsed_courses = self.session.parser.all_courses()
         # Index by code
         all_courses = {x["code"]: x for x in parsed_courses}

+        self.data_dump(path, parsed_courses)
+
         # Iterate over all courses
         for course, terms in iterkeyvalue(courses):
@@ -122,43 +165,55 @@ def scrape_courses(self, courses):
                 logging.warning("Couldn't find course {} specified in config file".format(course))
                 continue

-            self.session.open_course(curr_course["_unique"])
+            course_path = buildpath(path, course)

+            self.session.open_course(curr_course["_unique"])
             course_attrs = self.session.parser.course_attrs()
+            self.data_dump(course_path, course_attrs)
+
             logging.info(u"----Course: {number} - {title}".format(**course_attrs['basic']))
             logging.debug(u"COURSE DATA DUMP: {0}".format(course_attrs['extra']))

             self.session.show_sections()
-            self.scrape_terms(terms)
+
+            self.scrape_terms(terms, buildpath(course_path, "sections"))
             self.session.return_from_course()

-    def scrape_terms(self, terms):
+    def scrape_terms(self, terms, path):
         """Scrape terms"""

         # Get all terms on the page and iterate over them
-        parsed_terms = self.session.parser.all_terms(filter_=get_filter(terms))
+        parsed_terms = self.session.parser.all_terms()
         all_terms = {"{year} {season}".format(**x): x for x in parsed_terms}

+        self.data_dump(path, parsed_terms)
+
         for term, sections in iterkeyvalue(terms):
             curr_term = all_terms.get(term)
             if curr_term is None:
                 if term is not None:
                     logging.warning("Couldn't find term {} specified in config file".format(term))
                 continue

+            term_path = buildpath(path, term)
+
             logging.info(u"------Term: {year} - {season}".format(**curr_term))
             self.session.switch_to_term(curr_term["_unique"])
+            self.data_dump(term_path)

             self.session.view_all_sections()
-            self.scrape_sections(sections)

-    def scrape_sections(self, sections):
+            self.scrape_sections(sections, buildpath(term_path, "all"))
+
+    def scrape_sections(self, sections, path):
         """Scrape sections"""

         # Grab all the basic data
-        parsed_sections = self.session.parser.all_section_data(filter_=get_filter(sections))
+        parsed_sections = self.session.parser.all_section_data()
         all_sections = {x["basic"]["solus_id"]: x for x in parsed_sections}

+        self.data_dump(path, parsed_sections)
+
         # Don't really need the `iterkeyvalue` but it makes the config
         # parsing a litte more lax so whatever
         for section, _ in iterkeyvalue(sections):
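With the path argument threaded through scrape_subjects, scrape_courses, scrape_terms, and scrape_sections, every dump gets a flat file name that encodes where in the tree it came from. A sketch of how the names compose, using made-up values (the real alphanums, subjects, courses, terms, and sections come from tests/testconfig.yaml):

    def buildpath(path, new_obj):
        # Same helper as above: append a component with "_", turning spaces into "-"
        return "{}_{}".format(path, new_obj.replace(" ", "-"))


    # Hypothetical config values
    alphanum, subject, course, term, section = "A", "ANAT", "100", "2014 Fall", "001"

    path = alphanum                             # "A": subject list dumped here
    path = buildpath(path, subject)             # "A_ANAT": course list dump
    course_path = buildpath(path, course)       # "A_ANAT_100": course attrs dump
    path = buildpath(course_path, "sections")   # "A_ANAT_100_sections": term list dump
    term_path = buildpath(path, term)           # "A_ANAT_100_sections_2014-Fall": term page HTML only
    path = buildpath(term_path, "all")          # "..._2014-Fall_all": section list dump
    print(buildpath(path, section))             # "A_ANAT_100_sections_2014-Fall_all_001": section details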
@@ -172,8 +227,9 @@ def scrape_sections(self, sections):
             logging.info(u"--------Section: {class_num}-{type} ({solus_id}) -- {status}".format(**curr_section["basic"]))

             self.session.visit_section_page(curr_section["_unique"])
-
             new_data = self.session.parser.section_deep_attrs()
+            self.data_dump(buildpath(path, section), new_data)
+
             logging.info(u"----------Section details: session:{session} loc:{location} campus:{campus}".format(**new_data["details"]))

             self.session.return_from_section()
@@ -201,4 +257,4 @@ def _init_logging():
     except ImportError:
         logging.critical("No credientials found. Create a config.py file with USER and PASS constants")

-    TestUpdater("tests/testconfig.yaml", USER, PASS).start()
+    TestUpdater("tests/testconfig.yaml", "tests/out", USER, PASS).start()
