Browse Source

Added type annotations to functions

module
Raphael Roberts 6 years ago
parent
commit
0782be4c49
  1. 2
      .gitignore
  2. 175
      scraper.py

2
.gitignore

@ -3,3 +3,5 @@
__pycache__
/api_info
*.html
.dir-locals.el

175
scraper.py

@ -1,61 +1,79 @@
from bs4 import BeautifulSoup as BS
import datetime
import re
import itertools
from operator import sub
import re
import bs4
from bs4 import BeautifulSoup as BS
def dateparse(datetime_str: str) -> datetime.datetime:
    """Parse a date string or a clock-time string into a ``datetime``.

    The string is first tried as a date such as ``"Jan 05, 2020"``
    (``%b %d, %Y``); if that fails it is tried as a 12-hour time such as
    ``"10:30 AM"`` (``%I:%M %p``).  A time-only string yields a datetime on
    ``strptime``'s default date.

    Raises:
        ValueError: if the string matches neither format.
    """
    date_format = "%b %d, %Y"
    time_format = "%I:%M %p"
    try:
        return datetime.datetime.strptime(datetime_str, date_format)
    except ValueError:
        return datetime.datetime.strptime(datetime_str, time_format)


# Day letters indexed by weekday number with Sunday at 0; the weekend slots
# are None because no letter is assigned to them.
days = [None, "M", "T", "W", "R", "F", None]
# Row labels that are converted to a parameter name by lower-casing the whole
# label (spaces -> underscores) instead of keeping only the last word.
simp_exceptions = ["Grade Mode"]
def datetime2date_time(dtime: datetime.datetime, mode: str):
    """Reduce a ``datetime`` to either its date part or its time part.

    Args:
        dtime: the datetime to split.
        mode: ``"date"`` to get a ``datetime.date`` or ``"time"`` to get a
            ``datetime.time`` (hour, minute and second only).

    Raises:
        ValueError: if ``mode`` is neither ``"date"`` nor ``"time"``.
    """
    if mode == "date":
        return datetime.date(dtime.year, dtime.month, dtime.day)
    if mode == "time":
        return datetime.time(dtime.hour, dtime.minute, dtime.second)
    # Previously an unknown mode fell through and returned None silently.
    raise ValueError("mode must be 'date' or 'time', got {!r}".format(mode))
def seconds_from_midnight(t: datetime.time) -> int:
    """Return the number of whole seconds between 00:00:00 and ``t``."""
    return t.hour * 60 ** 2 + t.minute * 60 + t.second
def parse_horz_row(headers, row: "bs4.element.Tag") -> dict:
    """Parse one row of the meeting-times table into scheduling fields.

    Args:
        headers: column names, positionally matched against the row's
            ``<td>`` cells.  The names ``"time"``, ``"date range"``,
            ``"days"`` and ``"where"`` are read.
        row: the table row whose ``<td>`` cells hold the values.

    Returns:
        A dict with the keys ``time_range`` (a ``(start, end)`` pair of
        ``datetime.time`` or ``None`` when the time is ``"TBA"``), ``days``
        (sorted indices into ``days``), ``date_range`` (a ``(start, end)``
        pair of ``datetime.date``) and ``location``.

    Raises:
        KeyError: if a required column is missing from the row.
    """
    cells = (col.text for col in row.find_all("td"))
    time_data = dict(zip(headers, cells))
    ret = {}

    if "time" not in time_data:
        # Dump the offending row before failing so the bad markup is visible.
        print(row)
        raise KeyError("time")

    if time_data["time"] == "TBA":
        ret["time_range"] = None
    else:
        t_start, t_end = map(dateparse, time_data["time"].split(" - "))
        ret["time_range"] = (
            datetime2date_time(t_start, "time"),
            datetime2date_time(t_end, "time"),
        )

    s, e = map(dateparse, time_data["date range"].split(" - "))

    # Drop every character that is not one of the known day letters.
    day_letters = re.sub(
        "[^{}]".format("".join(filter(bool, days))), "", time_data["days"]
    )
    ret["days"] = sorted(days.index(letter) for letter in day_letters)

    if ret["days"]:
        # datetime.weekday() counts Monday as 0, whereas ``days`` places
        # Monday at index 1, hence the +1 (mod 7) conversion.
        class_start = (s.weekday() + 1) % 7
        first_day = ret["days"][0]
        # Shift the start date onto the earliest meeting weekday.  The offset
        # is negative when that weekday precedes the listed start date.
        # NOTE(review): confirm that moving the start date backwards is intended.
        s += datetime.timedelta(days=(first_day - class_start))

    ret["date_range"] = (
        datetime2date_time(s, "date"),
        datetime2date_time(e, "date"),
    )
    ret["location"] = time_data["where"]
    return ret
class Class:
def __init__(self, title, abrv, session,
def __init__(
self,
title,
abrv,
session,
term,
crn,
instructor,
@ -67,19 +85,20 @@ class Class:
date_range,
days,
location,
lab=None):
lab=None,
):
#name
# name
self.title = title
self.abrv = abrv
#time
# time
self.date_range = date_range
self.days = days
self.time_range = time_range
#location
# location
self.location = location
self.campus = campus
#other
# other
self.session = session
self.term = term
self.crn = crn
@ -88,70 +107,74 @@ class Class:
self.credits = credits
self.level = level
self.lab = lab
# data is a list of two html tables
@classmethod
def scrape(cls, info: "bs4.element.Tag", times: "bs4.element.Tag"):
    """Build a class (and its optional lab) from a pair of HTML tables.

    Args:
        info: table whose caption reads ``"<title> - <abrv> - <session>"``
            and whose rows each hold a ``<th>`` label and a ``<td>`` value.
        times: table whose first row holds ``<th>`` column headers and whose
            following rows are meeting times; a second meeting row is treated
            as the lab.

    Returns:
        The constructed instance, with ``lab`` set to a second instance when
        the times table has more than one meeting row, otherwise ``None``.
    """
    # NOTE(review): block nesting below was reconstructed from a flattened
    # diff; the "Status" row is parsed but not forwarded to the constructor.
    # info
    title, abrv, session = info.find("caption").text.split(" - ")
    session = int(session)
    params = {}
    for row in info.find_all("tr"):
        name = row.find("th").text.rstrip(":")
        # Strip leading spaces and all newline/tab characters from the value.
        data = re.sub(r"^ +|[\n\r\t]", "", row.find("td").text)
        if name == "Status":
            # Computed but not stored in params; kept so that malformed
            # status text still raises as before.
            status_type, date = data.split(" on ")
            status_type = status_type.replace("*", "")
            registration_date = dateparse(date)
        else:
            if name in simp_exceptions:
                name = name.lower().replace(" ", "_")
            else:
                # Use only the last word of the label as the parameter name.
                name = name.lower().split(" ")[-1]
            if name != "instructor":
                data = data.lower()
            try:
                # Values such as "3.000" become 3; non-numeric text is kept.
                data = int(re.sub(r"\.\d+", "", data))
            except ValueError:
                pass
            params[name] = data

    # time
    header_row, *meeting_rows = times.find_all("tr")
    headers = [header.text.lower() for header in header_row.find_all("th")]
    if len(meeting_rows) > 1:
        data = parse_horz_row(headers, meeting_rows[0])
        lab = parse_horz_row(headers, meeting_rows[1])
        # The lab shares every info-table parameter with the main class.
        lab.update(params)
        lab = cls(title + " - Lab", abrv, session, **lab)
    else:
        lab = None
        data = parse_horz_row(headers, meeting_rows[0])
    params.update(data)
    return cls(title, abrv, session, lab=lab, **params)
def __repr__(self):
    """Return ``"<title> on <day letters>"``, e.g. a title followed by ``MWF``."""
    day_letters = "".join(days[i] for i in self.days)
    return "{} on {}".format(self.title, day_letters)
@property
def length(self):
    """Duration of one meeting as a ``datetime.timedelta``.

    Computed as the end of ``time_range`` minus its start.  Raises
    ``TypeError`` when ``time_range`` is ``None``.
    """
    start, end = self.time_range
    return datetime.timedelta(
        seconds=seconds_from_midnight(end) - seconds_from_midnight(start)
    )
def get_classes(page):
    """Lazily yield one ``Class`` per pair of ``datadisplaytable`` tables.

    Args:
        page: an already-parsed ``BeautifulSoup`` document, or raw markup
            that is parsed here with the ``lxml`` parser.

    Returns:
        An iterator of ``Class.scrape(info, times)`` results, where tables
        are paired in document order (1st with 2nd, 3rd with 4th, ...).  A
        trailing table without a partner is ignored.
    """
    if not isinstance(page, BS):
        page = BS(page, "lxml")
    tables = page.find_all("table", attrs={"class": "datadisplaytable"})
    # Even-indexed tables carry class info, odd-indexed ones the times.
    groups = zip(tables[::2], tables[1::2])
    return itertools.starmap(Class.scrape, groups)
if __name__ == "__main__":
    # Manual smoke run: parse a saved schedule page from the working
    # directory and print every class with its date range.
    with open("schedule.html") as file:
        page = BS(file.read(), "lxml")
    classes = list(get_classes(page))
    for _class in classes:
        print(repr(_class), _class.date_range)
Loading…
Cancel
Save