-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
189 lines (145 loc) · 7.23 KB
/
scraper.py
File metadata and controls
189 lines (145 loc) · 7.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import math
import re
from datetime import datetime, timedelta
from time import sleep
from progress.bar import Bar
import bs4
import requests
from bs4 import BeautifulSoup
from lesson import Lesson
# UMinho students' portal page that renders the course schedules.
SCHEDULE_URL = "https://alunos.uminho.pt/PT/estudantes/Paginas/InfoUteisHorarios.aspx"


def STATE(course: str, course_id: str) -> str:
    """Build the Telerik RadComboBox client-state JSON blob for *course*.

    The ASP.NET page expects this exact JSON string in the combobox's
    hidden ``ClientState`` input when a course is selected.
    (Was an assigned lambda; a ``def`` is the PEP 8 E731 idiom and gives
    the function a proper name in tracebacks.)
    """
    return (
        f'{{"logEntries":[],"value":"{course_id}","text":"{course}","enabled":true,'
        f'"checkedIndices":[],"checkedItemsTextOverflows":false}}'
    )


# Rendered height in pixels of one 30-minute row of the schedule grid.
TIME_SLOT_SIZE_PX = 60
# Powered by hopes and dreams
class Scraper:
    """Scraper for the UMinho public schedule page.

    The page is an ASP.NET WebForms application, so every request must
    replay the hidden form fields (``__VIEWSTATE`` etc.) together with the
    Telerik RadComboBox / RadDatePicker "client state" JSON blobs.
    The whole scrape runs inside ``__init__``: one POST per week in the
    configured range, each response parsed into ``Lesson`` objects
    accumulated in ``self.lessons``.
    """

    # Simple scalar defaults may live at class level; the lesson/class
    # lists are assigned per instance in __init__ (as class-level mutable
    # defaults they were shared by every instance — a state-leak bug).
    course_name = None
    year = ""
    form_id = None
    lessons: list[Lesson]
    classes: list[str]

    def __init__(self, config: dict):
        """Scrape every week between ``config["week"]["start"]`` and ``["end"]``.

        ``config`` keys used:
          * ``week.start`` / ``week.end`` — ISO ``YYYY-MM-DD`` dates bounding the range
          * ``course_name`` — course text exactly as shown on the page
          * ``year`` — curricular year (coerced to ``str``)
          * ``classes`` (optional) — class names to keep; empty/absent keeps all
          * ``timeout`` — seconds to sleep between weekly requests

        Raises ``SystemExit`` when the course page cannot be loaded.
        """
        weeks = self.get_weeks_between(config["week"]["start"], config["week"]["end"])
        # Per-instance state: as class attributes these lists were shared
        # between instances, leaking lessons across scrapers.
        self.lessons = []
        self.classes = []
        if isinstance(config.get("classes"), list):
            self.classes = config["classes"]
        bar = Bar('Scraping schedule', max=len(weeks))
        bar.start()
        self.course_name = config["course_name"]
        self.year = str(config["year"])
        res = requests.get(SCHEDULE_URL, timeout=30)
        soup = BeautifulSoup(res.text, features="lxml")
        self.form_id = self.get_form_id(soup)
        # First POST selects the course so the server renders the
        # year/week controls for it.
        res = requests.post(SCHEDULE_URL, headers={
            "Content-Type": "application/x-www-form-urlencoded"
        }, data={
            **self.parse_hidden_inputs(soup),
            f"{self.form_id}dataCurso": self.course_name,
            f"{self.get_client_state_input_name(soup)}": STATE(
                self.course_name, self.get_course_id(soup, self.course_name))
        }, timeout=30)
        if "Mostrar horário expandido" not in res.text:
            print("Couldn't get course data. Stopping...")
            raise SystemExit(-1)
        soup = BeautifulSoup(res.text, features="lxml")
        for i, week in enumerate(weeks):
            # RadDatePicker client-state blob. NOTE(review): the min/max
            # dates are hard-coded to the 2025/26 academic year — consider
            # deriving them from config["week"].
            state = (
                f'{{"enabled":true,"emptyMessage":"","validationText":"{week}-00-00-00",'
                f'"valueAsString":"{week}-00-00-00","minDateStr":"2025-09-15-00-00-00",'
                f'"maxDateStr":"2026-06-21-00-00-00","lastSetTextBoxValue":"20-10-2025"}}'
            )
            res = requests.post(SCHEDULE_URL, headers={
                "Content-Type": "application/x-www-form-urlencoded",
                # Note: Must fake user agent, or else the website will not
                # render the schedule correctly
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Gecko/20100101 Firefox/143.0"
            }, data={
                **self.parse_hidden_inputs(soup),
                f"{self.form_id}dataCurso": self.course_name,
                f"{self.form_id}dataAnoCurricular": self.year,
                # This field requires the date in YYYY-MM-DD
                f"{self.form_id}dataWeekSelect": week,
                # But this field requires the date in DD-MM-YYYY for some reason
                f"{self.form_id}dataWeekSelect$dateInput": "-".join(reversed(week.split("-"))),
                f"{self.form_id}chkMostraExpandido": "on",
                f"{self.form_id.replace('$', '_')}dataWeekSelect_dateInput_ClientState": state
            }, timeout=30)
            self.parse_schedule(res.text)
            bar.next()
            # Be polite to the server; no sleep needed after the last week.
            if i != len(weeks) - 1:
                sleep(config["timeout"])
        bar.finish()

    def parse_schedule(self, raw_schedule_data: str):
        """Parse one week's rendered schedule HTML into ``self.lessons``.

        The schedule is a grid of 30-minute rows (time slots) by day
        columns; every non-empty cell becomes one Lesson (subject to the
        optional class-name filter).
        """
        soup = BeautifulSoup(raw_schedule_data, features="lxml")
        table = soup.select_one(".rsContent > table:nth-child(1)")
        # Each day header link's href carries the day's date after a one-char
        # prefix; parse_lesson parses it as YYYY-MM-DD.
        days = [th.find("a").attrs["href"][1:] for th in table.select(".rsHorizontalHeaderTable th")]
        earliest_hour = self.parse_earliest_hour(table)
        n_time_slots = self.get_number_of_time_slots(table)
        for day_index, day in enumerate(days):
            time = earliest_hour
            for time_slot in range(1, n_time_slots + 1):
                schedule_slots = table.select(
                    f".rsContentTable > tr:nth-child({time_slot}) > td:nth-child({day_index + 1}) > .rsWrap > div")
                for schedule_slot in schedule_slots:
                    # if slot contains anything at all
                    if schedule_slot.text.strip():
                        # if class names specified in config, only keep classes specified
                        if len(self.classes) == 0 or any(class_name in schedule_slot.text for class_name in self.classes):
                            self.lessons.append(self.parse_lesson(schedule_slot, time, day))
                time += timedelta(minutes=30)

    @staticmethod
    def parse_hidden_inputs(soup: BeautifulSoup) -> dict:
        """Collect all hidden form inputs (name -> value) so POSTs can replay
        the ASP.NET view state."""
        return {
            _input.get("name"): _input.get("value")
            for _input in soup.select("input[type='hidden']")
        }

    @staticmethod
    def get_course_id(soup: BeautifulSoup, course_name: str) -> str | None:
        """Map *course_name* to its numeric id.

        Course names come from the rendered ``li.rcbItem`` elements; their
        ids come from the embedded ``itemData`` JSON in the page source.
        Returns None when the JSON is missing or the two lists disagree
        in length (pairing by position would then be unreliable).
        """
        names = [li.get_text(strip=True) for li in soup.select("li.rcbItem")]
        text = soup.decode()
        match = re.search(r'"itemData"\s*:\s*\[(.*?)]', text, flags=re.S)
        if not match:
            return None
        ids = re.findall(r'["\']?value["\']?\s*:\s*["\'](\d+)["\']', match.group(1))
        if len(ids) != len(names):
            return None
        course_map = dict(zip(names, ids))
        return course_map.get(course_name)

    @staticmethod
    def get_client_state_input_name(soup: BeautifulSoup):
        """Return the name of the combobox's hidden ClientState input."""
        return soup.select_one(".RadComboBox > input").get("name")

    @staticmethod
    def get_form_id(soup: BeautifulSoup):
        """Derive the form-field prefix from the combobox input's name by
        stripping its 9-character control suffix (presumably "dataCurso" —
        the suffix used when posting; verify against the live page)."""
        return soup.select_one(".rcbInput").get("name")[:-9]

    @staticmethod
    def parse_earliest_hour(table: BeautifulSoup) -> datetime:
        """Read the first row label of the vertical header as HH:MM (the
        grid's starting time; date parts default to 1900-01-01 and are
        replaced later in parse_lesson)."""
        return datetime.strptime(table.select_one(
            ".rsVerticalHeaderTable > tr:nth-child(1) > th:nth-child(1) > div:nth-child(1)").text.strip(), "%H:%M")

    @staticmethod
    def get_number_of_time_slots(table: BeautifulSoup) -> int:
        """Count the 30-minute rows in the schedule body."""
        return len(table.select_one(".rsContentTable").select("table > tr"))

    @staticmethod
    def parse_lesson(slot: bs4.Tag, start: datetime, date: str) -> Lesson:
        """Build a Lesson from one occupied schedule cell.

        The cell's CSS height encodes the duration: every
        TIME_SLOT_SIZE_PX pixels is one 30-minute slot.
        """
        new_lesson = Lesson()
        new_lesson_date = datetime.strptime(date, "%Y-%m-%d").date()
        new_lesson.start = start.replace(year=new_lesson_date.year, month=new_lesson_date.month,
                                         day=new_lesson_date.day)
        # `or ""` guards against elements without a style attribute
        # (slot.get("style") returns None, which re.search rejects).
        match = re.search(r'height:\s*([\d.]+)(px|%)?', slot.get("style") or "")
        if match:
            # The pattern matches decimals (e.g. "89.5"); int() would raise
            # ValueError on those, so parse as float before rounding up.
            height_value = float(match.group(1))
            time = math.ceil(height_value / TIME_SLOT_SIZE_PX)
        else:
            time = 1  # no usable height -> assume a single 30-minute slot
        new_lesson.end = new_lesson.start + timedelta(minutes=time * 30)
        metadata = slot.select_one(".rsAptOut > .rsAptMid > .rsAptIn > .rsAptContent")
        # Observed cell layout: contents[0] is the course name, the first
        # <span> holds the location in [brackets], contents[3] the shift.
        new_lesson.name = metadata.contents[0].get_text(strip=True)
        new_lesson.location = metadata.find('span').get_text(strip=True).strip('[]')
        new_lesson.shift = metadata.contents[3].get_text(strip=True)
        return new_lesson

    @staticmethod
    def get_weeks_between(start_date: str, end_date: str) -> list[str]:
        """Return the Monday ("YYYY-MM-DD") of every week touching the range.

        The week containing *start_date* is included even when
        *start_date* itself is not a Monday.
        """
        start = datetime.strptime(start_date, "%Y-%m-%d").date()
        end = datetime.strptime(end_date, "%Y-%m-%d").date()
        days_since_monday = start.weekday()
        first_monday = start - timedelta(days=days_since_monday)
        mondays = []
        current = first_monday
        while current <= end:
            mondays.append(current.strftime("%Y-%m-%d"))
            current += timedelta(weeks=1)
        return mondays