You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

180 lines
5.0 KiB

import datetime
import itertools
from operator import sub
import re
import bs4
from bs4 import BeautifulSoup as BS
def dateparse(datetime_str):
    """Parse a schedule date or clock-time string into a ``datetime``.

    The string is first read as a calendar date such as ``"Jan 15, 2019"``
    (``%b %d, %Y``).  If that fails it is read as a 12-hour clock time such
    as ``"10:30 AM"`` (``%I:%M %p``); in that case ``strptime`` fills in its
    default date of 1900-01-01.

    Raises:
        ValueError: if the string matches neither format.
    """
    date_fmt = "%b %d, %Y"
    clock_fmt = "%I:%M %p"
    strptime = datetime.datetime.strptime
    try:
        parsed = strptime(datetime_str, date_fmt)
    except ValueError:
        # Not a calendar date; fall back to the clock-time format.
        parsed = strptime(datetime_str, clock_fmt)
    return parsed
# One-letter weekday codes, indexed with Sunday at 0 and Saturday at 6 so that
# an index lines up with ``(datetime.weekday() + 1) % 7`` as used in
# parse_horz_row.  Sunday and Saturday have no code, hence the ``None``
# placeholders at both ends; "R" sits at index 4 (Thursday).
days = [None, "M", "T", "W", "R", "F", None]
# Row labels whose full text is turned into a snake_case key by Class.scrape
# ("Grade Mode" -> "grade_mode"); every other label is reduced to its last word.
simp_exceptions = ["Grade Mode"]
def datetime2date_time(dtime: datetime.datetime, mode):
    """Extract either the date part or the time part of *dtime*.

    Args:
        dtime: the value to split.
        mode: ``"date"`` for a ``datetime.date`` built from year/month/day,
            or ``"time"`` for a ``datetime.time`` built from
            hour/minute/second (microseconds are not carried over).

    Returns:
        The requested ``date`` or ``time``; ``None`` for any other *mode*.
    """
    if mode == "time":
        return datetime.time(dtime.hour, dtime.minute, dtime.second)
    if mode == "date":
        return datetime.date(dtime.year, dtime.month, dtime.day)
    # Unknown mode: same implicit result as falling off the end.
    return None
def seconds_from_midnight(t: datetime.time):
    """Return the number of whole seconds between 00:00:00 and *t*.

    Only the hour, minute and second fields are used.
    """
    minutes_elapsed = t.hour * 60 + t.minute
    return minutes_elapsed * 60 + t.second
def parse_horz_row(headers, row: bs4.element.Tag):
    """Convert one row of a meeting-times table into ``Class`` keyword data.

    The text of each ``td`` cell in *row* is paired positionally with
    *headers*.  The columns read are ``"time"``, ``"date range"``,
    ``"days"`` and ``"where"``.

    Returns:
        A dict with the keys
        ``"time_range"``  -- ``(start, end)`` as ``datetime.time`` objects,
                             or ``None`` when the time cell is ``"TBA"``;
        ``"days"``        -- sorted indices into the module-level ``days``
                             list, one per recognised day letter;
        ``"date_range"``  -- ``(start, end)`` as ``datetime.date`` objects;
        ``"location"``    -- the text of the ``"where"`` cell.

    Raises:
        KeyError: if one of the columns above is missing (when it is the
            ``"time"`` column, the row is printed before re-raising).
        ValueError: propagated from ``dateparse`` / tuple unpacking when a
            range is not of the form ``"<a> - <b>"`` in a known format.
    """
    cells = dict(zip(headers, (td.text for td in row.find_all("td"))))
    try:
        time_text = cells["time"]
    except KeyError:
        # Show the offending row so a header/cell mismatch can be diagnosed.
        print(row)
        raise

    parsed = {}
    if time_text == "TBA":
        parsed["time_range"] = None
    else:
        begin, finish = (dateparse(part) for part in time_text.split(" - "))
        parsed["time_range"] = (
            datetime2date_time(begin, "time"),
            datetime2date_time(finish, "time"),
        )

    first_day, last_day = (
        dateparse(part) for part in cells["date range"].split(" - ")
    )

    # Drop every character that is not one of the known day letters.
    known_letters = "".join(letter for letter in days if letter)
    meeting_letters = re.sub("[^{}]".format(known_letters), "", cells["days"])
    parsed["days"] = sorted(days.index(letter) for letter in meeting_letters)

    if parsed["days"]:
        # Weekday of the range start, re-based so Sunday is 0 like ``days``.
        range_weekday = (first_day.weekday() + 1) % 7
        # Move the start onto the weekday of the earliest listed meeting day.
        # NOTE(review): when the range starts later in the week than that
        # day, this offset is negative and the start moves to *before* the
        # range start -- confirm that is what callers expect.
        first_day += datetime.timedelta(days=parsed["days"][0] - range_weekday)

    parsed["date_range"] = (
        datetime2date_time(first_day, "date"),
        datetime2date_time(last_day, "date"),
    )
    parsed["location"] = cells["where"]
    return parsed
class Class:
    """One scheduled course section, optionally with an attached lab section.

    Instances are plain records: the constructor stores every argument on an
    attribute of the same name.  ``scrape`` builds an instance from the two
    HTML tables that describe a section.
    """

    def __init__(
        self,
        title,
        abrv,
        session,
        term,
        crn,
        instructor,
        grade_mode,
        credits,
        level,
        campus,
        time_range,
        date_range,
        days,
        location,
        lab=None,
    ):
        """Store the given values unchanged.

        ``time_range`` may be ``None`` (``parse_horz_row`` produces that for
        a "TBA" time); ``days`` holds indices into the module-level ``days``
        list; ``lab`` is another ``Class`` instance or ``None``.
        """
        # name
        self.title = title
        self.abrv = abrv
        # time
        self.date_range = date_range
        self.days = days
        self.time_range = time_range
        # location
        self.location = location
        self.campus = campus
        # other
        self.session = session
        self.term = term
        self.crn = crn
        self.instructor = instructor
        self.grade_mode = grade_mode
        self.credits = credits
        self.level = level
        self.lab = lab

    @classmethod
    def scrape(cls, info: bs4.element.Tag, times: bs4.element.Tag):
        """Build an instance from a section's info table and times table.

        Args:
            info: table whose ``caption`` reads ``"<title> - <abrv> -
                <session>"`` and whose rows each hold a ``th`` label and a
                ``td`` value.
            times: table whose first ``tr`` holds ``th`` column headers and
                whose following rows are meeting times.  When there is more
                than one meeting row, the second becomes the lab section and
                any further rows are ignored.

        Returns:
            A new instance of ``cls``; its ``lab`` attribute is a second
            instance titled ``"<title> - Lab"`` when a second meeting row
            exists, otherwise ``None``.

        Raises:
            ValueError: if the caption does not split into exactly three
                parts or the session is not an integer.
            IndexError: if the times table has no meeting rows.
            TypeError: if the scraped labels do not match the constructor's
                keyword arguments.
        """
        # info
        title, abrv, session = info.find("caption").text.split(" - ")
        session = int(session)

        params = {}
        for row in info.find_all("tr"):
            label = row.find("th").text.rstrip(":")
            if label == "Status":
                # The registration status is not stored on the instance, so
                # the row is skipped instead of being parsed and discarded.
                continue

            value = re.sub(r"^ +|[\n\r\t]", "", row.find("td").text)
            if label in simp_exceptions:
                key = label.lower().replace(" ", "_")
            else:
                # e.g. a multi-word label keeps only its final word.
                key = label.lower().split(" ")[-1]

            # Instructor names keep their capitalisation.
            if key != "instructor":
                value = value.lower()
            # Numeric values become ints, dropping a ".ddd" fractional part
            # first; anything non-numeric stays a string.
            # NOTE(review): applied to every row (including the instructor
            # row) -- the original nesting was lost with the indentation.
            try:
                value = int(re.sub(r"\.\d+", "", value))
            except ValueError:
                pass
            params[key] = value

        # time
        header_row, *meeting_rows = times.find_all("tr")
        headers = [header.text.lower() for header in header_row.find_all("th")]
        if len(meeting_rows) > 1:
            main_data, lab_data = (
                parse_horz_row(headers, row) for row in meeting_rows[:2]
            )
            # The lab shares every info-table value with the main section.
            lab_data.update(params)
            lab = cls(title + " - Lab", abrv, session, **lab_data)
        else:
            lab = None
            main_data = parse_horz_row(headers, meeting_rows[0])
        params.update(main_data)
        return cls(title, abrv, session, lab=lab, **params)

    def __repr__(self):
        """Return ``"<title> on <day letters>"``, e.g. ``"... on MWF"``."""
        day_letters = "".join(days[i] for i in self.days)
        return "{} on {}".format(self.title, day_letters)

    @property
    def length(self):
        """Duration of one meeting as a ``datetime.timedelta``.

        Computed as end minus start of ``time_range``.

        Raises:
            TypeError: if ``time_range`` is ``None``.
        """
        start, end = self.time_range[0], self.time_range[1]
        return datetime.timedelta(
            seconds=seconds_from_midnight(end) - seconds_from_midnight(start)
        )
def get_classes(page):
    """Lazily scrape every class found on a schedule page.

    Args:
        page: an already-parsed ``BeautifulSoup`` document, or raw markup
            that is parsed here with the ``lxml`` parser.

    Returns:
        An iterator of ``Class`` objects.  The ``datadisplaytable`` tables
        are consumed in consecutive pairs and each pair is handed to
        ``Class.scrape`` as ``(info, times)``.  Nothing is scraped until the
        iterator is advanced; an odd number of tables raises ``IndexError``
        when the unpaired table is reached.
    """
    soup = page if isinstance(page, BS) else BS(page, "lxml")
    tables = soup.find_all("table", attrs={"class": "datadisplaytable"})
    table_pairs = (
        (tables[first], tables[first + 1])
        for first in range(0, len(tables), 2)
    )
    return itertools.starmap(Class.scrape, table_pairs)
if __name__ == "__main__":
    # Manual check: scrape a locally saved schedule page and list each class
    # with its date range.
    with open("schedule.html") as file:
        markup = file.read()
    page = BS(markup, "lxml")
    # Materialise everything first so a scraping error surfaces before any
    # output is printed.
    classes = list(get_classes(page))
    for _class in classes:
        print(repr(_class), _class.date_range)