@@ -1,7 +1,10 @@
 #!/usr/bin/env python
 import logging
 import yaml
+import json
 import sys
+import os
+import datetime
 from navigation import SolusSession
 
 def str_unless_none(obj):
@@ -32,15 +35,40 @@ def get_filter(obj):
     else:
         return tuple(str(obj))
 
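+# Build "parent_child" output names; spaces become dashes to keep paths clean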
+def buildpath(path, new_obj):
+    return "{}_{}".format(path, new_obj.replace(" ", "-"))
+
+def mkdir(path):
+    try:
+        os.makedirs(path, exist_ok=True)
+    except TypeError:
+        # Python 2 - no 'exist_ok'
+        try:
+            os.makedirs(path)
+        except OSError:
+            # Worry about this code breaking if it becomes a problem
+            pass
+
+def json_dumper(obj):
+    """Deal with dumping datetimes to JSON"""
+    if isinstance(obj, (datetime.datetime, datetime.time, datetime.date)):
+        return obj.isoformat()
+    else:
+        raise TypeError('Object of type %s with value of %s is not JSON serializable' % (type(obj), repr(obj)))
+
 class TestUpdater(object):
     """Dump HTML and the scraped data"""
 
-    def __init__(self, config_file, user, passwd):
+    def __init__(self, config_file, output_dir, user, passwd):
         """Initialize the session to grab the data with"""
 
         # Load the config
         self.load_config(config_file)
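+        # The config appears to nest alphanum -> subject -> course -> term
+        # -> section, matching the scrape_* methods below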
 
+        # Create the folder for output
+        self.output_dir = output_dir
+        mkdir(output_dir)
+
         # Initialize the session
         try:
             session = SolusSession(user, passwd, souplib="lxml", testing_mode=True)
@@ -60,6 +88,16 @@ def load_config(self, config_file):
             logging.critical("Couldn't load config file '{}'".format(config_file))
             raise
 
+    def data_dump(self, path, data=None):
+        """Dump the current HTML and provided data into files"""
+        basename = os.path.join(self.output_dir, path)
+
+        with open(basename + ".html", 'wb') as f:
+            f.write(self.session.parser.get_raw_html().encode("utf-8"))
+        if data is not None:
+            with open(basename + ".json", 'w') as f:
+                json.dump(data, f, default=json_dumper, sort_keys=True, indent=2, separators=(',', ': '))
+
     def start(self):
         """Starts updating the local data"""
 
@@ -69,27 +107,30 @@ def start(self):
     def scrape_alphanums(self):
         """Scrape alphanums"""
 
-        all_alphanums = list(self.session.parser.all_alphanums(filter_=get_filter(self.config)))
+        all_alphanums = list(self.session.parser.all_alphanums())
 
         for alphanum, subjects in iterkeyvalue(self.config):
 
             if alphanum not in all_alphanums:
                 logging.warning("Couldn't find alphanum {} specified in config file".format(alphanum))
                 continue
+
             self.session.select_alphanum(alphanum)
 
             logging.info("Alphanum: {}".format(alphanum))
 
-            self.scrape_subjects(subjects)
+            self.scrape_subjects(subjects, alphanum)
 
-    def scrape_subjects(self, subjects):
+    def scrape_subjects(self, subjects, path):
         """Scrape subjects"""
 
         # Get a list of all subjects to iterate over
-        parsed_subjects = self.session.parser.all_subjects(filter_=get_filter(subjects))
+        parsed_subjects = self.session.parser.all_subjects()
         # Index by abbreviation
         all_subjects = {x["abbreviation"]: x for x in parsed_subjects}
 
+        self.data_dump(path, parsed_subjects)
+
         # Iterate over all subjects
         for subject, courses in iterkeyvalue(subjects):
 
@@ -102,17 +143,19 @@ def scrape_subjects(self, subjects):
             logging.info(u"--Subject: {abbreviation} - {title}".format(**curr_subject))
 
             self.session.dropdown_subject(curr_subject["_unique"])
-            self.scrape_courses(courses)
+            self.scrape_courses(courses, buildpath(path, subject))
             self.session.rollup_subject(curr_subject["_unique"])
 
-    def scrape_courses(self, courses):
+    def scrape_courses(self, courses, path):
         """Scrape courses"""
 
         # Get a list of all courses to iterate over
-        parsed_courses = self.session.parser.all_courses(filter_=get_filter(courses))
+        parsed_courses = self.session.parser.all_courses()
         # Index by code
         all_courses = {x["code"]: x for x in parsed_courses}
 
+        self.data_dump(path, parsed_courses)
+
         # Iterate over all courses
         for course, terms in iterkeyvalue(courses):
 
@@ -122,43 +165,55 @@ def scrape_courses(self, courses):
                 logging.warning("Couldn't find course {} specified in config file".format(course))
                 continue
 
-            self.session.open_course(curr_course["_unique"])
+            course_path = buildpath(path, course)
 
+            self.session.open_course(curr_course["_unique"])
             course_attrs = self.session.parser.course_attrs()
+            self.data_dump(course_path, course_attrs)
+
             logging.info(u"----Course: {number} - {title}".format(**course_attrs['basic']))
             logging.debug(u"COURSE DATA DUMP: {0}".format(course_attrs['extra']))
 
             self.session.show_sections()
-            self.scrape_terms(terms)
+
+            self.scrape_terms(terms, buildpath(course_path, "sections"))
             self.session.return_from_course()
 
-    def scrape_terms(self, terms):
+    def scrape_terms(self, terms, path):
         """Scrape terms"""
 
         # Get all terms on the page and iterate over them
-        parsed_terms = self.session.parser.all_terms(filter_=get_filter(terms))
+        parsed_terms = self.session.parser.all_terms()
         all_terms = {"{year} {season}".format(**x): x for x in parsed_terms}
 
+        self.data_dump(path, parsed_terms)
+
         for term, sections in iterkeyvalue(terms):
             curr_term = all_terms.get(term)
             if curr_term is None:
                 if term is not None:
                     logging.warning("Couldn't find term {} specified in config file".format(term))
                 continue
 
+            term_path = buildpath(path, term)
+
             logging.info(u"------Term: {year} - {season}".format(**curr_term))
             self.session.switch_to_term(curr_term["_unique"])
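+            # No parsed data for the bare term page, so dump only the HTML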
+            self.data_dump(term_path)
 
             self.session.view_all_sections()
-            self.scrape_sections(sections)
 
-    def scrape_sections(self, sections):
+            self.scrape_sections(sections, buildpath(term_path, "all"))
+
+    def scrape_sections(self, sections, path):
         """Scrape sections"""
 
         # Grab all the basic data
-        parsed_sections = self.session.parser.all_section_data(filter_=get_filter(sections))
+        parsed_sections = self.session.parser.all_section_data()
         all_sections = {x["basic"]["solus_id"]: x for x in parsed_sections}
 
+        self.data_dump(path, parsed_sections)
+
         # Don't really need the `iterkeyvalue` but it makes the config
         # parsing a little more lax so whatever
         for section, _ in iterkeyvalue(sections):
@@ -172,8 +227,9 @@ def scrape_sections(self, sections):
             logging.info(u"--------Section: {class_num}-{type} ({solus_id}) -- {status}".format(**curr_section["basic"]))
 
             self.session.visit_section_page(curr_section["_unique"])
-
             new_data = self.session.parser.section_deep_attrs()
+            self.data_dump(buildpath(path, section), new_data)
+
             logging.info(u"----------Section details: session:{session} loc:{location} campus:{campus}".format(**new_data["details"]))
 
             self.session.return_from_section()
@@ -201,4 +257,4 @@ def _init_logging():
     except ImportError:
         logging.critical("No credentials found. Create a config.py file with USER and PASS constants")
 
-    TestUpdater("tests/testconfig.yaml", USER, PASS).start()
+    TestUpdater("tests/testconfig.yaml", "tests/out", USER, PASS).start()