-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
189 lines (145 loc) · 7.23 KB
/
scraper.py
File metadata and controls
189 lines (145 loc) · 7.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import math
import re
from datetime import datetime, timedelta
from time import sleep
from progress.bar import Bar
import bs4
import requests
from bs4 import BeautifulSoup
from lesson import Lesson
# UMinho students' portal page that renders the course schedules.
SCHEDULE_URL = "https://alunos.uminho.pt/PT/estudantes/Paginas/InfoUteisHorarios.aspx"


def STATE(course: str, course_id: str) -> str:
    """Build the Telerik RadComboBox client-state JSON blob for *course*.

    The ASP.NET page expects this exact JSON string in the combobox's
    hidden ``ClientState`` input when a course is selected.
    (Was an assigned lambda; a ``def`` is the PEP 8 E731 idiom and gives
    the function a proper name in tracebacks.)
    """
    return (
        f'{{"logEntries":[],"value":"{course_id}","text":"{course}","enabled":true,'
        f'"checkedIndices":[],"checkedItemsTextOverflows":false}}'
    )


# Rendered height in pixels of one 30-minute row of the schedule grid.
TIME_SLOT_SIZE_PX = 60
# Powered by hopes and dreams
class Scraper:
    """Scraper for the UMinho public schedule page.

    The page is an ASP.NET WebForms application, so every request must
    replay the hidden form fields (``__VIEWSTATE`` etc.) together with the
    Telerik RadComboBox / RadDatePicker "client state" JSON blobs.
    The whole scrape runs inside ``__init__``: one POST per week in the
    configured range, each response parsed into ``Lesson`` objects
    accumulated in ``self.lessons``.
    """

    # Simple scalar defaults may live at class level; the lesson/class
    # lists are assigned per instance in __init__ (as class-level mutable
    # defaults they were shared by every instance — a state-leak bug).
    course_name = None
    year = ""
    form_id = None
    lessons: list[Lesson]
    classes: list[str]

    def __init__(self, config: dict):
        """Scrape every week between ``config["week"]["start"]`` and ``["end"]``.

        ``config`` keys used:
          * ``week.start`` / ``week.end`` — ISO ``YYYY-MM-DD`` dates bounding the range
          * ``course_name`` — course text exactly as shown on the page
          * ``year`` — curricular year (coerced to ``str``)
          * ``classes`` (optional) — class names to keep; empty/absent keeps all
          * ``timeout`` — seconds to sleep between weekly requests

        Raises ``SystemExit`` when the course page cannot be loaded.
        """
        weeks = self.get_weeks_between(config["week"]["start"], config["week"]["end"])
        # Per-instance state: as class attributes these lists were shared
        # between instances, leaking lessons across scrapers.
        self.lessons = []
        self.classes = []
        if isinstance(config.get("classes"), list):
            self.classes = config["classes"]
        bar = Bar('Scraping schedule', max=len(weeks))
        bar.start()
        self.course_name = config["course_name"]
        self.year = str(config["year"])
        res = requests.get(SCHEDULE_URL, timeout=30)
        soup = BeautifulSoup(res.text, features="lxml")
        self.form_id = self.get_form_id(soup)
        # First POST selects the course so the server renders the
        # year/week controls for it.
        res = requests.post(SCHEDULE_URL, headers={
            "Content-Type": "application/x-www-form-urlencoded"
        }, data={
            **self.parse_hidden_inputs(soup),
            f"{self.form_id}dataCurso": self.course_name,
            f"{self.get_client_state_input_name(soup)}": STATE(
                self.course_name, self.get_course_id(soup, self.course_name))
        }, timeout=30)
        if "Mostrar horário expandido" not in res.text:
            print("Couldn't get course data. Stopping...")
            raise SystemExit(-1)
        soup = BeautifulSoup(res.text, features="lxml")
        for i, week in enumerate(weeks):
            # RadDatePicker client-state blob. NOTE(review): the min/max
            # dates are hard-coded to the 2025/26 academic year — consider
            # deriving them from config["week"].
            state = (
                f'{{"enabled":true,"emptyMessage":"","validationText":"{week}-00-00-00",'
                f'"valueAsString":"{week}-00-00-00","minDateStr":"2025-09-15-00-00-00",'
                f'"maxDateStr":"2026-06-21-00-00-00","lastSetTextBoxValue":"20-10-2025"}}'
            )
            res = requests.post(SCHEDULE_URL, headers={
                "Content-Type": "application/x-www-form-urlencoded",
                # Note: Must fake user agent, or else the website will not
                # render the schedule correctly
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Gecko/20100101 Firefox/143.0"
            }, data={
                **self.parse_hidden_inputs(soup),
                f"{self.form_id}dataCurso": self.course_name,
                f"{self.form_id}dataAnoCurricular": self.year,
                # This field requires the date in YYYY-MM-DD
                f"{self.form_id}dataWeekSelect": week,
                # But this field requires the date in DD-MM-YYYY for some reason
                f"{self.form_id}dataWeekSelect$dateInput": "-".join(reversed(week.split("-"))),
                f"{self.form_id}chkMostraExpandido": "on",
                f"{self.form_id.replace('$', '_')}dataWeekSelect_dateInput_ClientState": state
            }, timeout=30)
            self.parse_schedule(res.text)
            bar.next()
            # Be polite to the server; no sleep needed after the last week.
            if i != len(weeks) - 1:
                sleep(config["timeout"])
        bar.finish()

    def parse_schedule(self, raw_schedule_data: str):
        """Parse one week's rendered schedule HTML into ``self.lessons``.

        The schedule is a grid of 30-minute rows (time slots) by day
        columns; every non-empty cell becomes one Lesson (subject to the
        optional class-name filter).
        """
        soup = BeautifulSoup(raw_schedule_data, features="lxml")
        table = soup.select_one(".rsContent > table:nth-child(1)")
        # Each day header link's href carries the day's date after a one-char
        # prefix; parse_lesson parses it as YYYY-MM-DD.
        days = [th.find("a").attrs["href"][1:] for th in table.select(".rsHorizontalHeaderTable th")]
        earliest_hour = self.parse_earliest_hour(table)
        n_time_slots = self.get_number_of_time_slots(table)
        for day_index, day in enumerate(days):
            time = earliest_hour
            for time_slot in range(1, n_time_slots + 1):
                schedule_slots = table.select(
                    f".rsContentTable > tr:nth-child({time_slot}) > td:nth-child({day_index + 1}) > .rsWrap > div")
                for schedule_slot in schedule_slots:
                    # if slot contains anything at all
                    if schedule_slot.text.strip():
                        # if class names specified in config, only keep classes specified
                        if len(self.classes) == 0 or any(class_name in schedule_slot.text for class_name in self.classes):
                            self.lessons.append(self.parse_lesson(schedule_slot, time, day))
                time += timedelta(minutes=30)

    @staticmethod
    def parse_hidden_inputs(soup: BeautifulSoup) -> dict:
        """Collect all hidden form inputs (name -> value) so POSTs can replay
        the ASP.NET view state."""
        return {
            _input.get("name"): _input.get("value")
            for _input in soup.select("input[type='hidden']")
        }

    @staticmethod
    def get_course_id(soup: BeautifulSoup, course_name: str) -> str | None:
        """Map *course_name* to its numeric id.

        Course names come from the rendered ``li.rcbItem`` elements; their
        ids come from the embedded ``itemData`` JSON in the page source.
        Returns None when the JSON is missing or the two lists disagree
        in length (pairing by position would then be unreliable).
        """
        names = [li.get_text(strip=True) for li in soup.select("li.rcbItem")]
        text = soup.decode()
        match = re.search(r'"itemData"\s*:\s*\[(.*?)]', text, flags=re.S)
        if not match:
            return None
        ids = re.findall(r'["\']?value["\']?\s*:\s*["\'](\d+)["\']', match.group(1))
        if len(ids) != len(names):
            return None
        course_map = dict(zip(names, ids))
        return course_map.get(course_name)

    @staticmethod
    def get_client_state_input_name(soup: BeautifulSoup):
        """Return the name of the combobox's hidden ClientState input."""
        return soup.select_one(".RadComboBox > input").get("name")

    @staticmethod
    def get_form_id(soup: BeautifulSoup):
        """Derive the form-field prefix from the combobox input's name by
        stripping its 9-character control suffix (presumably "dataCurso" —
        the suffix used when posting; verify against the live page)."""
        return soup.select_one(".rcbInput").get("name")[:-9]

    @staticmethod
    def parse_earliest_hour(table: BeautifulSoup) -> datetime:
        """Read the first row label of the vertical header as HH:MM (the
        grid's starting time; date parts default to 1900-01-01 and are
        replaced later in parse_lesson)."""
        return datetime.strptime(table.select_one(
            ".rsVerticalHeaderTable > tr:nth-child(1) > th:nth-child(1) > div:nth-child(1)").text.strip(), "%H:%M")

    @staticmethod
    def get_number_of_time_slots(table: BeautifulSoup) -> int:
        """Count the 30-minute rows in the schedule body."""
        return len(table.select_one(".rsContentTable").select("table > tr"))

    @staticmethod
    def parse_lesson(slot: bs4.Tag, start: datetime, date: str) -> Lesson:
        """Build a Lesson from one occupied schedule cell.

        The cell's CSS height encodes the duration: every
        TIME_SLOT_SIZE_PX pixels is one 30-minute slot.
        """
        new_lesson = Lesson()
        new_lesson_date = datetime.strptime(date, "%Y-%m-%d").date()
        new_lesson.start = start.replace(year=new_lesson_date.year, month=new_lesson_date.month,
                                         day=new_lesson_date.day)
        # `or ""` guards against elements without a style attribute
        # (slot.get("style") returns None, which re.search rejects).
        match = re.search(r'height:\s*([\d.]+)(px|%)?', slot.get("style") or "")
        if match:
            # The pattern matches decimals (e.g. "89.5"); int() would raise
            # ValueError on those, so parse as float before rounding up.
            height_value = float(match.group(1))
            time = math.ceil(height_value / TIME_SLOT_SIZE_PX)
        else:
            time = 1  # no usable height -> assume a single 30-minute slot
        new_lesson.end = new_lesson.start + timedelta(minutes=time * 30)
        metadata = slot.select_one(".rsAptOut > .rsAptMid > .rsAptIn > .rsAptContent")
        # Observed cell layout: contents[0] is the course name, the first
        # <span> holds the location in [brackets], contents[3] the shift.
        new_lesson.name = metadata.contents[0].get_text(strip=True)
        new_lesson.location = metadata.find('span').get_text(strip=True).strip('[]')
        new_lesson.shift = metadata.contents[3].get_text(strip=True)
        return new_lesson

    @staticmethod
    def get_weeks_between(start_date: str, end_date: str) -> list[str]:
        """Return the Monday ("YYYY-MM-DD") of every week touching the range.

        The week containing *start_date* is included even when
        *start_date* itself is not a Monday.
        """
        start = datetime.strptime(start_date, "%Y-%m-%d").date()
        end = datetime.strptime(end_date, "%Y-%m-%d").date()
        days_since_monday = start.weekday()
        first_monday = start - timedelta(days=days_since_monday)
        mondays = []
        current = first_monday
        while current <= end:
            mondays.append(current.strftime("%Y-%m-%d"))
            current += timedelta(weeks=1)
        return mondays