diff --git a/.gitignore b/.gitignore index 1b3360b..591cbc3 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ __pycache__ /api_info *.html + +.dir-locals.el \ No newline at end of file diff --git a/scraper.py b/scraper.py index 1e8f447..b5215d6 100644 --- a/scraper.py +++ b/scraper.py @@ -1,61 +1,79 @@ -from bs4 import BeautifulSoup as BS import datetime -import re +import itertools from operator import sub +import re + +import bs4 +from bs4 import BeautifulSoup as BS + def dateparse(datetime_str): - date = '%b %d, %Y' - time = '%I:%M %p' + date = "%b %d, %Y" + time = "%I:%M %p" try: - return datetime.datetime.strptime(datetime_str,date) + return datetime.datetime.strptime(datetime_str, date) except ValueError: - return datetime.datetime.strptime(datetime_str,time) -days = [None,'M','T','W','R','F',None] -simp_exceptions = ['Grade Mode'] + return datetime.datetime.strptime(datetime_str, time) + + +days = [None, "M", "T", "W", "R", "F", None] +simp_exceptions = ["Grade Mode"] + + +def datetime2date_time(dtime: datetime.datetime, mode): + if mode == "date": + return datetime.date(dtime.year, dtime.month, dtime.day) + elif mode == "time": + return datetime.time(dtime.hour, dtime.minute, dtime.second) + -def datetime2date_time(dtime,mode): - if mode == 'date': - return datetime.date(dtime.year,dtime.month,dtime.day) - elif mode == 'time': - return datetime.time(dtime.hour,dtime.minute,dtime.second) +def seconds_from_midnight(t: datetime.time): + return t.hour * 60 ** 2 + t.minute * 60 + t.second -def seconds_from_midnight(t): - return t.hour*60**2+ t.minute*60+t.second -def parse_horz_row(headers,row): - data = (col.text for col in row.find_all('td')) +def parse_horz_row(headers, row: bs4.element.Tag): + data = (col.text for col in row.find_all("td")) ret = {} - time_data = dict(zip(headers,data)) + time_data = dict(zip(headers, data)) try: - time_data['time'] + time_data["time"] except KeyError as e: print(row) raise e - if time_data['time'] == 'TBA': - ret['time_range'] = None + if time_data["time"] == "TBA": + ret["time_range"] = None else: - s,e = map(dateparse,time_data['time'].split(' - ')) - ret['time_range'] = ( - datetime2date_time(s,'time'), - datetime2date_time(e,'time'), - ) - s,e = map(dateparse,time_data['date range'].split(' - ')) - time_data['days'] = re.sub('[^{}]'.format(''.join(filter(bool,days))),'',time_data['days']) - ret['days'] = sorted((days.index(time_data['days'][i]) for i in range(len(time_data['days'])))) - if len(ret['days']) > 0: - class_start = (s.weekday()+1)%7 - start = ret['days'][0] + s, e = map(dateparse, time_data["time"].split(" - ")) + ret["time_range"] = ( + datetime2date_time(s, "time"), + datetime2date_time(e, "time"), + ) + s, e = map(dateparse, time_data["date range"].split(" - ")) + time_data["days"] = re.sub( + "[^{}]".format("".join(filter(bool, days))), "", time_data["days"] + ) + ret["days"] = sorted( + (days.index(time_data["days"][i]) for i in range(len(time_data["days"]))) + ) + if len(ret["days"]) > 0: + class_start = (s.weekday() + 1) % 7 + start = ret["days"][0] s += datetime.timedelta(days=(start - class_start)) - ret['date_range'] = ( - datetime2date_time(s,'date'), - datetime2date_time(e,'date'), - ) - ret['location'] = time_data['where'] + ret["date_range"] = ( + datetime2date_time(s, "date"), + datetime2date_time(e, "date"), + ) + ret["location"] = time_data["where"] return ret + class Class: - def __init__(self, title, abrv, session, + def __init__( + self, + title, + abrv, + session, term, crn, instructor, @@ -67,19 +85,20 @@ class Class: date_range, days, location, - lab=None): + lab=None, + ): - #name + # name self.title = title self.abrv = abrv - #time + # time self.date_range = date_range self.days = days self.time_range = time_range - #location + # location self.location = location self.campus = campus - #other + # other self.session = session self.term = term self.crn = crn @@ -88,70 +107,74 @@ class Class: self.credits = credits self.level = level self.lab = lab - # data is a list of two html tables + @classmethod - def scrape(cls,data): - info,times = data + def scrape(cls, info: bs4.element.Tag, times: bs4.element.Tag): # info - title,abrv,session = info.find('caption').text.split(' - ') + title, abrv, session = info.find("caption").text.split(" - ") session = int(session) - rows = info.find_all('tr') + rows = info.find_all("tr") params = {} for row in rows: - name = row.find('th').text.rstrip(':') - data = re.sub(r'^ +|[\n\r\t]','',row.find('td').text) + name = row.find("th").text.rstrip(":") + data = re.sub(r"^ +|[\n\r\t]", "", row.find("td").text) - if name == 'Status': - type,date = data.split(' on ') - type = type.replace('*','') + if name == "Status": + type, date = data.split(" on ") + type = type.replace("*", "") registration_date = dateparse(date) else: if name in simp_exceptions: - name = name.lower().replace(' ','_') + name = name.lower().replace(" ", "_") else: - name = name.lower().split(' ')[-1] - if name != 'instructor': + name = name.lower().split(" ")[-1] + if name != "instructor": data = data.lower() try: - data = int(re.sub(r'\.\d+','',data)) + data = int(re.sub(r"\.\d+", "", data)) except: pass params[name] = data # time - headers,*data = times.find_all('tr') - headers = list(header.text.lower() for header in headers.find_all('th')) + headers, *data = times.find_all("tr") + headers = list(header.text.lower() for header in headers.find_all("th")) if len(data) > 1: - data,lab = map(lambda row: parse_horz_row(headers,row),data[:2]) + data, lab = map(lambda row: parse_horz_row(headers, row), data[:2]) lab.update(params) - lab = Class(title + " - Lab",abrv,session,**lab) + lab = Class(title + " - Lab", abrv, session, **lab) else: lab = None - data = parse_horz_row(headers,data[0]) + data = parse_horz_row(headers, data[0]) params.update(data) - return Class(title,abrv,session,lab=lab,**params) + return Class(title, abrv, session, lab=lab, **params) + def __repr__(self): - return '{} on {}'.format(self.title,''.join(days[i] for i in self.days)) + return "{} on {}".format(self.title, "".join(days[i] for i in self.days)) + @property def length(self): - return datetime.timedelta(seconds = sub( - seconds_from_midnight(self.time_range[1]), - seconds_from_midnight(self.time_range[0]), - )) + return datetime.timedelta( + seconds=sub( + seconds_from_midnight(self.time_range[1]), + seconds_from_midnight(self.time_range[0]), + ) + ) def get_classes(page): - if not isinstance(page,BS): - page = BS(page,'lxml') - tables = page.find_all('table',attrs= {'class':'datadisplaytable'}) - groups = ((tables[i],tables[i+1]) for i in range(0,len(tables),2)) - return map(Class.scrape,groups) + if not isinstance(page, BS): + page = BS(page, "lxml") + tables = page.find_all("table", attrs={"class": "datadisplaytable"}) + groups = ((tables[i], tables[i + 1]) for i in range(0, len(tables), 2)) + return itertools.starmap(Class.scrape, groups) + if __name__ == "__main__": - with open('schedule.html') as file: - page = BS(file.read(),'lxml') + with open("schedule.html") as file: + page = BS(file.read(), "lxml") classes = list(get_classes(page)) for _class in classes: - print(repr(_class),_class.date_range) \ No newline at end of file + print(repr(_class), _class.date_range)