|
|
|
@ -1,61 +1,79 @@ |
|
|
|
from bs4 import BeautifulSoup as BS |
|
|
|
import datetime |
|
|
|
import re |
|
|
|
import itertools |
|
|
|
from operator import sub |
|
|
|
import re |
|
|
|
|
|
|
|
import bs4 |
|
|
|
from bs4 import BeautifulSoup as BS |
|
|
|
|
|
|
|
|
|
|
|
def dateparse(datetime_str):
    """Parse a date string or a clock-time string into a ``datetime``.

    The string is first tried as a calendar date in ``%b %d, %Y`` form
    (e.g. ``"Jan 05, 2020"``).  If that fails it is tried as a 12-hour
    clock time in ``%I:%M %p`` form (e.g. ``"09:30 AM"``).

    Args:
        datetime_str: The text to parse.

    Returns:
        A ``datetime.datetime``.  Date-only input has a midnight time part;
        time-only input carries ``strptime``'s default date of 1900-01-01.

    Raises:
        ValueError: If the string matches neither format.
    """
    date_format = "%b %d, %Y"
    time_format = "%I:%M %p"
    try:
        return datetime.datetime.strptime(datetime_str, date_format)
    except ValueError:
        # Not a calendar date; fall back to the clock-time format.
        return datetime.datetime.strptime(datetime_str, time_format)
|
|
|
|
|
|
|
|
|
|
|
# One-letter meeting-day codes, indexed by day of week with Sunday at index 0
# (parse_horz_row compares these indexes against ``(weekday() + 1) % 7``).
# ``None`` marks days that have no code.
days = [None, "M", "T", "W", "R", "F", None]

# Row labels that Class.scrape converts to snake_case as a whole
# ("Grade Mode" -> "grade_mode") instead of keeping only their last word.
simp_exceptions = ["Grade Mode"]
|
|
|
|
|
|
|
|
|
|
|
def datetime2date_time(dtime: datetime.datetime, mode):
    """Return only the date part or only the time part of *dtime*.

    Args:
        dtime: The value to split.
        mode: ``"date"`` to get a ``datetime.date`` built from the year,
            month and day; ``"time"`` to get a ``datetime.time`` built from
            the hour, minute and second (microseconds and tzinfo are not
            carried over).

    Returns:
        The requested ``datetime.date`` or ``datetime.time``, or ``None``
        for any other *mode*.
    """
    if mode == "date":
        return datetime.date(dtime.year, dtime.month, dtime.day)
    if mode == "time":
        return datetime.time(dtime.hour, dtime.minute, dtime.second)
    # Unrecognised modes have always produced None; keep that explicit.
    return None
|
|
|
def seconds_from_midnight(t: datetime.time):
    """Return the number of whole seconds between 00:00:00 and *t*.

    Only the hour, minute and second fields of *t* are used.
    """
    return t.hour * 60 ** 2 + t.minute * 60 + t.second
|
|
|
|
|
|
|
def parse_horz_row(headers, row: bs4.element.Tag):
    """Parse one row of a meeting-times table into a dict of schedule values.

    Args:
        headers: Column names for the row's ``<td>`` cells, in order.  The
            keys ``"time"``, ``"date range"``, ``"days"`` and ``"where"``
            are read.
        row: The ``<tr>`` element whose ``<td>`` cells hold the values.

    Returns:
        A dict with these keys:

        * ``time_range`` - ``(start, end)`` as ``datetime.time`` objects, or
          ``None`` when the time cell is ``"TBA"``.
        * ``days`` - sorted list of indexes into the module-level ``days``.
        * ``date_range`` - ``(first, last)`` as ``datetime.date`` objects.
          When the row has meeting days, ``first`` is moved forward to the
          first meeting day on or after the listed start date.
        * ``location`` - the text of the ``"where"`` cell.

    Raises:
        KeyError: If a required column is missing (the row is printed first
            when the missing column is ``"time"``).
        ValueError: If a time or date cell cannot be parsed.
    """
    cells = (col.text for col in row.find_all("td"))
    time_data = dict(zip(headers, cells))
    ret = {}

    try:
        time_text = time_data["time"]
    except KeyError:
        # Show the offending row before propagating, to ease debugging.
        print(row)
        raise

    if time_text == "TBA":
        ret["time_range"] = None
    else:
        s, e = map(dateparse, time_text.split(" - "))
        ret["time_range"] = (
            datetime2date_time(s, "time"),
            datetime2date_time(e, "time"),
        )

    s, e = map(dateparse, time_data["date range"].split(" - "))

    # Drop every character that is not a known day letter, then map each
    # remaining letter to its index in ``days``.
    day_letters = re.sub(
        "[^{}]".format("".join(filter(bool, days))), "", time_data["days"]
    )
    ret["days"] = sorted(days.index(letter) for letter in day_letters)

    if ret["days"]:
        # ``days`` puts Sunday at index 0, while weekday() puts Monday at 0.
        class_start = (s.weekday() + 1) % 7
        # Advance to the first meeting day on or after the listed start.
        # The modulo keeps the offset non-negative, so the result can never
        # fall before the listed start date, and min() picks whichever
        # meeting day comes up soonest rather than the lowest weekday index.
        offset = min((day - class_start) % 7 for day in ret["days"])
        s += datetime.timedelta(days=offset)

    ret["date_range"] = (
        datetime2date_time(s, "date"),
        datetime2date_time(e, "date"),
    )
    ret["location"] = time_data["where"]
    return ret
|
|
|
|
|
|
|
|
|
|
|
class Class: |
|
|
|
def __init__(self, title, abrv, session, |
|
|
|
def __init__( |
|
|
|
self, |
|
|
|
title, |
|
|
|
abrv, |
|
|
|
session, |
|
|
|
term, |
|
|
|
crn, |
|
|
|
instructor, |
|
|
|
@ -67,19 +85,20 @@ class Class: |
|
|
|
date_range, |
|
|
|
days, |
|
|
|
location, |
|
|
|
lab=None): |
|
|
|
lab=None, |
|
|
|
): |
|
|
|
|
|
|
|
#name |
|
|
|
# name |
|
|
|
self.title = title |
|
|
|
self.abrv = abrv |
|
|
|
#time |
|
|
|
# time |
|
|
|
self.date_range = date_range |
|
|
|
self.days = days |
|
|
|
self.time_range = time_range |
|
|
|
#location |
|
|
|
# location |
|
|
|
self.location = location |
|
|
|
self.campus = campus |
|
|
|
#other |
|
|
|
# other |
|
|
|
self.session = session |
|
|
|
self.term = term |
|
|
|
self.crn = crn |
|
|
|
@ -88,70 +107,74 @@ class Class: |
|
|
|
self.credits = credits |
|
|
|
self.level = level |
|
|
|
self.lab = lab |
|
|
|
@classmethod
def scrape(cls, info: bs4.element.Tag, times: bs4.element.Tag):
    """Build an instance from the two HTML tables describing one class.

    Args:
        info: Table whose ``<caption>`` reads ``"<title> - <abrv> - <session>"``
            and whose rows each pair a ``<th>`` label with a ``<td>`` value.
        times: Table whose first row holds ``<th>`` column headers and whose
            remaining rows are meeting times.  When there is more than one
            meeting row, the first is the class itself and the second is
            attached as a separate "<title> - Lab" instance; rows beyond the
            second are ignored.

    Returns:
        The new instance, with ``lab`` set to the lab instance or ``None``.

    Raises:
        ValueError: If the caption does not have three `` - ``-separated
            parts, the session is not an integer, or a date/time cell cannot
            be parsed.
        IndexError: If *times* has no meeting rows.
    """
    # info
    # Split from the right so a title that itself contains " - " survives.
    title, abrv, session = info.find("caption").text.rsplit(" - ", 2)
    session = int(session)

    params = {}
    for row in info.find_all("tr"):
        name = row.find("th").text.rstrip(":")
        data = re.sub(r"^ +|[\n\r\t]", "", row.find("td").text)

        if name == "Status":
            # The status text and registration date are parsed (so malformed
            # input still fails here) but are not passed to the constructor.
            status, date = data.split(" on ")
            status = status.replace("*", "")
            registration_date = dateparse(date)
            continue

        if name in simp_exceptions:
            name = name.lower().replace(" ", "_")
        else:
            # Keep only the last word of the label as the keyword name.
            name = name.lower().split(" ")[-1]

        if name != "instructor":
            data = data.lower()

        try:
            # Values such as "3.000" become the integer 3.
            data = int(re.sub(r"\.\d+", "", data))
        except ValueError:
            # Not numeric: keep the text as-is.
            pass

        params[name] = data

    # time
    header_row, *time_rows = times.find_all("tr")
    headers = [header.text.lower() for header in header_row.find_all("th")]

    if len(time_rows) > 1:
        meeting, lab_meeting = (
            parse_horz_row(headers, row) for row in time_rows[:2]
        )
        # The lab shares every info-table value with its parent class.
        lab_meeting.update(params)
        lab = cls(title + " - Lab", abrv, session, **lab_meeting)
    else:
        lab = None
        meeting = parse_horz_row(headers, time_rows[0])

    params.update(meeting)
    return cls(title, abrv, session, lab=lab, **params)
|
|
|
|
|
|
|
def __repr__(self):
    """Return ``"<title> on <day letters>"``, e.g. ``"Calculus on MWF"``.

    Each index in ``self.days`` is looked up in the module-level ``days``
    list to get its letter.
    """
    return "{} on {}".format(self.title, "".join(days[i] for i in self.days))
|
|
|
|
|
|
|
@property
def length(self):
    """Duration of one meeting as a ``datetime.timedelta``.

    Computed as the end time minus the start time of ``self.time_range``.
    Returns ``None`` when ``self.time_range`` is ``None``, which is what
    ``parse_horz_row`` stores for a "TBA" time.
    """
    if self.time_range is None:
        return None
    return datetime.timedelta(
        seconds=(
            seconds_from_midnight(self.time_range[1])
            - seconds_from_midnight(self.time_range[0])
        )
    )
|
|
|
|
|
|
|
|
|
|
|
def get_classes(page):
    """Yield a ``Class`` for each pair of schedule tables found in *page*.

    Args:
        page: A ``BeautifulSoup`` document, or markup that is parsed into
            one with the ``lxml`` parser.

    Returns:
        A lazy iterator of ``Class`` instances.  Tables with the CSS class
        ``datadisplaytable`` are taken two at a time, in document order, and
        each pair is passed to ``Class.scrape`` as ``(info, times)``.
        Nothing is scraped until the iterator is consumed.
    """
    if not isinstance(page, BS):
        page = BS(page, "lxml")
    tables = page.find_all("table", attrs={"class": "datadisplaytable"})
    groups = ((tables[i], tables[i + 1]) for i in range(0, len(tables), 2))
    return itertools.starmap(Class.scrape, groups)
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Demo: parse a saved schedule page from the working directory and print
    # each class with its date range.
    with open("schedule.html") as file:
        page = BS(file.read(), "lxml")
    classes = list(get_classes(page))
    for _class in classes:
        print(repr(_class), _class.date_range)