
Commit f98b050

Dump discovered HTML and JSON out to files

Parent: bcbb88b

File tree: 1 file changed (+73, -17 lines)

update_tests.py

Lines changed: 73 additions & 17 deletions
@@ -1,7 +1,10 @@
 #!/usr/bin/env python
 import logging
 import yaml
+import json
 import sys
+import os
+import datetime
 from navigation import SolusSession

 def str_unless_none(obj):
@@ -32,15 +35,40 @@ def get_filter(obj):
     else:
         return tuple(str(obj))

+def buildpath(path, new_obj):
+    return "{}_{}".format(path, new_obj.replace(" ", "-"))
+
+def mkdir(path):
+    try:
+        os.makedirs(path, exist_ok=True)
+    except TypeError:
+        # Python 2 - no 'exist_ok'
+        try:
+            os.makedirs(path)
+        except OSError as e:
+            # Worry about this code breaking if it becomes a problem
+            pass
+
+def json_dumper(obj):
+    """Deal with dumping datetimes to JSON"""
+    if isinstance(obj, (datetime.datetime, datetime.time, datetime.date)):
+        return obj.isoformat()
+    else:
+        raise TypeError('Object of type %s with value of %s is not JSON serializable' % (type(obj), repr(obj)))
+
 class TestUpdater(object):
     """Dump HTML and the scraped data"""

-    def __init__(self, config_file, user, passwd):
+    def __init__(self, config_file, output_dir, user, passwd):
         """Initialize the session to grab the data with"""

         # Load the config
         self.load_config(config_file)

+        # Create the folder for output
+        self.output_dir = output_dir
+        mkdir(output_dir)
+
         # Initialize the session
         try:
             session = SolusSession(user, passwd, souplib="lxml", testing_mode=True)
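The json_dumper helper added above is intended as the default= hook for json.dump, so datetime values in the scraped data come out as ISO 8601 strings instead of raising a TypeError. A minimal standalone sketch of that behaviour (the sample record is made up for illustration and is not data from the scraper):

    import datetime
    import json


    def json_dumper(obj):
        """Serialize dates and times as ISO 8601 strings (same idea as the helper above)."""
        if isinstance(obj, (datetime.datetime, datetime.time, datetime.date)):
            return obj.isoformat()
        raise TypeError("Object of type %s is not JSON serializable" % type(obj))


    # Hypothetical scraped record containing a datetime
    record = {"title": "Example Course", "scraped_at": datetime.datetime(2014, 7, 1, 12, 30)}

    # json.dumps calls json_dumper for anything it can't serialize on its own
    print(json.dumps(record, default=json_dumper, sort_keys=True, indent=2, separators=(',', ': ')))
    # {
    #   "scraped_at": "2014-07-01T12:30:00",
    #   "title": "Example Course"
    # }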
@@ -60,6 +88,16 @@ def load_config(self, config_file):
             logging.critical("Couldn't load config file '{}'".format(config_file))
             raise

+    def data_dump(self, path, data=None):
+        """Dump the current HTML and provided data into files"""
+        basename = os.path.join(self.output_dir, path)
+
+        with open(basename + ".html", 'wb') as f:
+            f.write(self.session.parser.get_raw_html().encode("utf-8"))
+        if data is not None:
+            with open(basename + ".json", 'w') as f:
+                json.dump(data, f, default=json_dumper, sort_keys=True, indent=2, separators=(',', ': '))
+
     def start(self):
         """Starts updating the local data"""
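To get a feel for what data_dump leaves on disk, here is a rough self-contained sketch of a single dump, assuming output_dir is "tests/out" and a hypothetical path of "A" (the raw HTML and the parsed subject list are stand-ins for what self.session.parser would return):

    import json
    import os

    output_dir = "tests/out"
    path = "A"                                   # hypothetical: the alphanum "A" page, which lists subjects
    basename = os.path.join(output_dir, path)

    os.makedirs(output_dir, exist_ok=True)       # Python 3 spelling of the mkdir() helper

    # Stand-ins for self.session.parser.get_raw_html() and the parsed subject list
    raw_html = u"<html><body>SOLUS page snapshot</body></html>"
    parsed_subjects = [{"abbreviation": "ANAT", "title": "Anatomy"}]

    with open(basename + ".html", "wb") as f:    # -> tests/out/A.html
        f.write(raw_html.encode("utf-8"))
    with open(basename + ".json", "w") as f:     # -> tests/out/A.json
        json.dump(parsed_subjects, f, sort_keys=True, indent=2, separators=(',', ': '))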

@@ -69,27 +107,30 @@ def start(self):
     def scrape_alphanums(self):
         """Scrape alphanums"""

-        all_alphanums = list(self.session.parser.all_alphanums(filter_=get_filter(self.config)))
+        all_alphanums = list(self.session.parser.all_alphanums())

         for alphanum, subjects in iterkeyvalue(self.config):

            if alphanum not in all_alphanums:
                 logging.warning("Couldn't find alphanum {} specified in config file".format(alphanum))
                 continue
+
             self.session.select_alphanum(alphanum)

             logging.info("Alphanum: {}".format(alphanum))

-            self.scrape_subjects(subjects)
+            self.scrape_subjects(subjects, alphanum)

-    def scrape_subjects(self, subjects):
+    def scrape_subjects(self, subjects, path):
         """Scrape subjects"""

         # Get a list of all subjects to iterate over
-        parsed_subjects = self.session.parser.all_subjects(filter_=get_filter(subjects))
+        parsed_subjects = self.session.parser.all_subjects()
         # Index by abbreviation
         all_subjects = {x["abbreviation"]: x for x in parsed_subjects}

+        self.data_dump(path, parsed_subjects)
+
         # Iterate over all subjects
         for subject, courses in iterkeyvalue(subjects):
@@ -102,17 +143,19 @@ def scrape_subjects(self, subjects):
             logging.info(u"--Subject: {abbreviation} - {title}".format(**curr_subject))

             self.session.dropdown_subject(curr_subject["_unique"])
-            self.scrape_courses(courses)
+            self.scrape_courses(courses, buildpath(path, subject))
             self.session.rollup_subject(curr_subject["_unique"])

-    def scrape_courses(self, courses):
+    def scrape_courses(self, courses, path):
         """Scrape courses"""

         # Get a list of all courses to iterate over
-        parsed_courses = self.session.parser.all_courses(filter_=get_filter(courses))
+        parsed_courses = self.session.parser.all_courses()
         # Index by code
         all_courses = {x["code"]: x for x in parsed_courses}

+        self.data_dump(path, parsed_courses)
+
         # Iterate over all courses
         for course, terms in iterkeyvalue(courses):
@@ -122,43 +165,55 @@ def scrape_courses(self, courses):
                 logging.warning("Couldn't find course {} specified in config file".format(course))
                 continue

-            self.session.open_course(curr_course["_unique"])
+            course_path = buildpath(path, course)

+            self.session.open_course(curr_course["_unique"])
             course_attrs = self.session.parser.course_attrs()
+            self.data_dump(course_path, course_attrs)
+
             logging.info(u"----Course: {number} - {title}".format(**course_attrs['basic']))
             logging.debug(u"COURSE DATA DUMP: {0}".format(course_attrs['extra']))

             self.session.show_sections()
-            self.scrape_terms(terms)
+
+            self.scrape_terms(terms, buildpath(course_path, "sections"))
             self.session.return_from_course()

-    def scrape_terms(self, terms):
+    def scrape_terms(self, terms, path):
         """Scrape terms"""

         # Get all terms on the page and iterate over them
-        parsed_terms = self.session.parser.all_terms(filter_=get_filter(terms))
+        parsed_terms = self.session.parser.all_terms()
         all_terms = {"{year} {season}".format(**x): x for x in parsed_terms}

+        self.data_dump(path, parsed_terms)
+
         for term, sections in iterkeyvalue(terms):
             curr_term = all_terms.get(term)
             if curr_term is None:
                 if term is not None:
                     logging.warning("Couldn't find term {} specified in config file".format(term))
                 continue

+            term_path = buildpath(path, term)
+
             logging.info(u"------Term: {year} - {season}".format(**curr_term))
             self.session.switch_to_term(curr_term["_unique"])
+            self.data_dump(term_path)

             self.session.view_all_sections()
-            self.scrape_sections(sections)

-    def scrape_sections(self, sections):
+            self.scrape_sections(sections, buildpath(term_path, "all"))
+
+    def scrape_sections(self, sections, path):
         """Scrape sections"""

         # Grab all the basic data
-        parsed_sections = self.session.parser.all_section_data(filter_=get_filter(sections))
+        parsed_sections = self.session.parser.all_section_data()
         all_sections = {x["basic"]["solus_id"]: x for x in parsed_sections}

+        self.data_dump(path, parsed_sections)
+
         # Don't really need the `iterkeyvalue` but it makes the config
         # parsing a litte more lax so whatever
         for section, _ in iterkeyvalue(sections):
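With the path argument threaded through scrape_subjects, scrape_courses, scrape_terms, and scrape_sections, every dump gets a flat file name that encodes where in the tree it came from. A sketch of how the names compose, using made-up values (the real alphanums, subjects, courses, terms, and sections come from tests/testconfig.yaml):

    def buildpath(path, new_obj):
        # Same helper as above: append a component with "_", turning spaces into "-"
        return "{}_{}".format(path, new_obj.replace(" ", "-"))


    # Hypothetical config values
    alphanum, subject, course, term, section = "A", "ANAT", "100", "2014 Fall", "001"

    path = alphanum                             # "A": subject list dumped here
    path = buildpath(path, subject)             # "A_ANAT": course list dump
    course_path = buildpath(path, course)       # "A_ANAT_100": course attrs dump
    path = buildpath(course_path, "sections")   # "A_ANAT_100_sections": term list dump
    term_path = buildpath(path, term)           # "A_ANAT_100_sections_2014-Fall": term page HTML only
    path = buildpath(term_path, "all")          # "..._2014-Fall_all": section list dump
    print(buildpath(path, section))             # "A_ANAT_100_sections_2014-Fall_all_001": section details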
@@ -172,8 +227,9 @@ def scrape_sections(self, sections):
             logging.info(u"--------Section: {class_num}-{type} ({solus_id}) -- {status}".format(**curr_section["basic"]))

             self.session.visit_section_page(curr_section["_unique"])
-
             new_data = self.session.parser.section_deep_attrs()
+            self.data_dump(buildpath(path, section), new_data)
+
             logging.info(u"----------Section details: session:{session} loc:{location} campus:{campus}".format(**new_data["details"]))

             self.session.return_from_section()
@@ -201,4 +257,4 @@ def _init_logging():
     except ImportError:
         logging.critical("No credientials found. Create a config.py file with USER and PASS constants")

-    TestUpdater("tests/testconfig.yaml", USER, PASS).start()
+    TestUpdater("tests/testconfig.yaml", "tests/out", USER, PASS).start()
