"""Parse pairs of ``datadisplaytable`` HTML tables into ``Class`` objects."""

import datetime
import itertools
from operator import sub
import re

import bs4
from bs4 import BeautifulSoup as BS


def dateparse(datetime_str):
    """Parse a date (``"%b %d, %Y"``) or a time (``"%I:%M %p"``) string.

    The date format is tried first; the time format is the fallback.

    Raises:
        ValueError: if ``datetime_str`` matches neither format.
    """
    date_format = "%b %d, %Y"
    time_format = "%I:%M %p"
    try:
        return datetime.datetime.strptime(datetime_str, date_format)
    except ValueError:
        return datetime.datetime.strptime(datetime_str, time_format)


# Day letters indexed with Sunday at 0; weekend slots have no letter.
days = [None, "M", "T", "W", "R", "F", None]
# Row labels that keep all of their words (joined by "_") when turned into a
# keyword name, instead of being reduced to their last word.
simp_exceptions = ["Grade Mode"]


def datetime2date_time(dtime: datetime.datetime, mode):
    """Return the date or the time part of ``dtime``.

    Args:
        dtime: the datetime to split.
        mode: ``"date"`` for a ``datetime.date``, ``"time"`` for a
            ``datetime.time`` (hour, minute and second only).

    Raises:
        ValueError: if ``mode`` is neither ``"date"`` nor ``"time"``.
    """
    if mode == "date":
        return datetime.date(dtime.year, dtime.month, dtime.day)
    if mode == "time":
        return datetime.time(dtime.hour, dtime.minute, dtime.second)
    # Fail loudly rather than silently handing back None.
    raise ValueError("mode must be 'date' or 'time', got {!r}".format(mode))


def seconds_from_midnight(t: datetime.time):
    """Return the number of whole seconds between midnight and ``t``."""
    return t.hour * 60 ** 2 + t.minute * 60 + t.second


def parse_horz_row(headers, row: bs4.element.Tag):
    """Parse one row of a meeting-times table.

    Args:
        headers: column names, in the same order as the row's ``td`` cells.
            The columns ``"time"``, ``"date range"``, ``"days"`` and
            ``"where"`` are read.
        row: the ``tr`` element holding the cells.

    Returns:
        A dict with the keys ``time_range`` (a pair of ``datetime.time`` or
        ``None`` when the time is ``"TBA"``), ``days`` (sorted indexes into
        the module-level ``days`` list), ``date_range`` (a pair of
        ``datetime.date``) and ``location``.

    Raises:
        KeyError: if a required column is missing; when ``"time"`` is the
            missing one the row is printed first.
    """
    time_data = dict(zip(headers, (col.text for col in row.find_all("td"))))
    try:
        time_text = time_data["time"]
    except KeyError:
        # Show the offending row before propagating the error.
        print(row)
        raise

    ret = {}
    if time_text == "TBA":
        ret["time_range"] = None
    else:
        first, last = map(dateparse, time_text.split(" - "))
        ret["time_range"] = (
            datetime2date_time(first, "time"),
            datetime2date_time(last, "time"),
        )

    start, end = map(dateparse, time_data["date range"].split(" - "))

    # Drop every character that is not one of the known day letters.
    day_letters = re.sub(
        "[^{}]".format("".join(filter(bool, days))), "", time_data["days"]
    )
    ret["days"] = sorted(days.index(letter) for letter in day_letters)

    if ret["days"]:
        # weekday() counts from Monday == 0; ``days`` counts from Sunday == 0.
        class_start = (start.weekday() + 1) % 7
        # Move forward to the first meeting day on or after the range start.
        # Taking the offsets modulo 7 keeps the result from landing before
        # the start of the range when it begins mid-week.
        offset = min((day - class_start) % 7 for day in ret["days"])
        start += datetime.timedelta(days=offset)

    ret["date_range"] = (
        datetime2date_time(start, "date"),
        datetime2date_time(end, "date"),
    )
    ret["location"] = time_data["where"]
    return ret


class Class:
    """A scheduled class, optionally linked to a second ``Class`` for its lab."""

    def __init__(
        self,
        title,
        abrv,
        session,
        term,
        crn,
        instructor,
        grade_mode,
        credits,
        level,
        campus,
        time_range,
        date_range,
        days,
        location,
        lab=None,
    ):
        """Store the given values as attributes of the same names."""
        # name
        self.title = title
        self.abrv = abrv
        # time
        self.date_range = date_range
        self.days = days
        self.time_range = time_range
        # location
        self.location = location
        self.campus = campus
        # other
        self.session = session
        self.term = term
        self.crn = crn
        self.instructor = instructor
        self.grade_mode = grade_mode
        self.credits = credits
        self.level = level
        self.lab = lab

    @classmethod
    def scrape(cls, info: bs4.element.Tag, times: bs4.element.Tag):
        """Build a ``Class`` from an info table and its meeting-times table.

        Args:
            info: table whose caption reads ``"<title> - <abrv> - <session>"``
                and whose rows each hold a ``th`` label and a ``td`` value.
            times: table whose first row holds ``th`` column headers and whose
                following rows are parsed with ``parse_horz_row``.

        Returns:
            The scraped ``Class``.  When the times table has more than one
            data row, the second row becomes a separate ``Class`` stored in
            ``lab``.
        """
        # info
        title, abrv, session = info.find("caption").text.split(" - ")
        session = int(session)

        params = {}
        for row in info.find_all("tr"):
            name = row.find("th").text.rstrip(":")
            if name == "Status":
                # The status row has no matching constructor argument.
                continue
            data = re.sub(r"^ +|[\n\r\t]", "", row.find("td").text)

            if name in simp_exceptions:
                name = name.lower().replace(" ", "_")
            else:
                # Only the last word of the label names the argument.
                name = name.lower().split(" ")[-1]

            if name != "instructor":
                data = data.lower()
                try:
                    # "3.000" -> 3; values that are not numeric stay strings.
                    data = int(re.sub(r"\.\d+", "", data))
                except ValueError:
                    pass
            params[name] = data

        # time
        header_row, *data = times.find_all("tr")
        headers = [header.text.lower() for header in header_row.find_all("th")]
        if len(data) > 1:
            data, lab = (parse_horz_row(headers, row) for row in data[:2])
            lab.update(params)
            lab = Class(title + " - Lab", abrv, session, **lab)
        else:
            lab = None
            data = parse_horz_row(headers, data[0])
        params.update(data)
        return Class(title, abrv, session, lab=lab, **params)

    def __repr__(self):
        return "{} on {}".format(self.title, "".join(days[i] for i in self.days))

    @property
    def length(self):
        """Duration of one meeting as a ``datetime.timedelta``.

        Raises:
            TypeError: if ``time_range`` is ``None``.
        """
        return datetime.timedelta(
            seconds=sub(
                seconds_from_midnight(self.time_range[1]),
                seconds_from_midnight(self.time_range[0]),
            )
        )


def get_classes(page):
    """Lazily yield a ``Class`` for each pair of ``datadisplaytable`` tables.

    Args:
        page: a ``BeautifulSoup`` document, or markup to be parsed with
            ``lxml``.

    Returns:
        An iterator of ``Class`` objects.  Tables are consumed two at a time
        (info table, then times table); an unpaired trailing table is ignored.
    """
    if not isinstance(page, BS):
        page = BS(page, "lxml")
    tables = page.find_all("table", attrs={"class": "datadisplaytable"})
    # zip() stops at the shorter slice, so an odd table count cannot raise
    # IndexError part-way through iteration.
    groups = zip(tables[::2], tables[1::2])
    return itertools.starmap(Class.scrape, groups)


if __name__ == "__main__":
    with open("schedule.html") as file:
        page = BS(file.read(), "lxml")
    classes = list(get_classes(page))
    for _class in classes:
        print(repr(_class), _class.date_range)