Browse Source

Added type annotations to functions

module
Raphael Roberts 6 years ago
parent
commit
0782be4c49
  1. 2
      .gitignore
  2. 131
      scraper.py

2
.gitignore

@@ -3,3 +3,5 @@
__pycache__ __pycache__
/api_info /api_info
*.html *.html
.dir-locals.el

131
scraper.py

@@ -1,61 +1,79 @@
from bs4 import BeautifulSoup as BS
import datetime import datetime
import re
import itertools
from operator import sub from operator import sub
import re
import bs4
from bs4 import BeautifulSoup as BS
def dateparse(datetime_str): def dateparse(datetime_str):
date = '%b %d, %Y'
time = '%I:%M %p'
date = "%b %d, %Y"
time = "%I:%M %p"
try: try:
return datetime.datetime.strptime(datetime_str, date) return datetime.datetime.strptime(datetime_str, date)
except ValueError: except ValueError:
return datetime.datetime.strptime(datetime_str, time) return datetime.datetime.strptime(datetime_str, time)
days = [None,'M','T','W','R','F',None]
simp_exceptions = ['Grade Mode']
def datetime2date_time(dtime,mode):
if mode == 'date':
days = [None, "M", "T", "W", "R", "F", None]
simp_exceptions = ["Grade Mode"]
def datetime2date_time(dtime: datetime.datetime, mode):
if mode == "date":
return datetime.date(dtime.year, dtime.month, dtime.day) return datetime.date(dtime.year, dtime.month, dtime.day)
elif mode == 'time':
elif mode == "time":
return datetime.time(dtime.hour, dtime.minute, dtime.second) return datetime.time(dtime.hour, dtime.minute, dtime.second)
def seconds_from_midnight(t):
def seconds_from_midnight(t: datetime.time):
return t.hour * 60 ** 2 + t.minute * 60 + t.second return t.hour * 60 ** 2 + t.minute * 60 + t.second
def parse_horz_row(headers,row):
data = (col.text for col in row.find_all('td'))
def parse_horz_row(headers, row: bs4.element.Tag):
data = (col.text for col in row.find_all("td"))
ret = {} ret = {}
time_data = dict(zip(headers, data)) time_data = dict(zip(headers, data))
try: try:
time_data['time']
time_data["time"]
except KeyError as e: except KeyError as e:
print(row) print(row)
raise e raise e
if time_data['time'] == 'TBA':
ret['time_range'] = None
if time_data["time"] == "TBA":
ret["time_range"] = None
else: else:
s,e = map(dateparse,time_data['time'].split(' - '))
ret['time_range'] = (
datetime2date_time(s,'time'),
datetime2date_time(e,'time'),
s, e = map(dateparse, time_data["time"].split(" - "))
ret["time_range"] = (
datetime2date_time(s, "time"),
datetime2date_time(e, "time"),
)
s, e = map(dateparse, time_data["date range"].split(" - "))
time_data["days"] = re.sub(
"[^{}]".format("".join(filter(bool, days))), "", time_data["days"]
)
ret["days"] = sorted(
(days.index(time_data["days"][i]) for i in range(len(time_data["days"])))
) )
s,e = map(dateparse,time_data['date range'].split(' - '))
time_data['days'] = re.sub('[^{}]'.format(''.join(filter(bool,days))),'',time_data['days'])
ret['days'] = sorted((days.index(time_data['days'][i]) for i in range(len(time_data['days']))))
if len(ret['days']) > 0:
if len(ret["days"]) > 0:
class_start = (s.weekday() + 1) % 7 class_start = (s.weekday() + 1) % 7
start = ret['days'][0]
start = ret["days"][0]
s += datetime.timedelta(days=(start - class_start)) s += datetime.timedelta(days=(start - class_start))
ret['date_range'] = (
datetime2date_time(s,'date'),
datetime2date_time(e,'date'),
ret["date_range"] = (
datetime2date_time(s, "date"),
datetime2date_time(e, "date"),
) )
ret['location'] = time_data['where']
ret["location"] = time_data["where"]
return ret return ret
class Class: class Class:
def __init__(self, title, abrv, session,
def __init__(
self,
title,
abrv,
session,
term, term,
crn, crn,
instructor, instructor,
@@ -67,7 +85,8 @@ class Class:
date_range, date_range,
days, days,
location, location,
lab=None):
lab=None,
):
# name # name
self.title = title self.title = title
@@ -88,39 +107,38 @@ class Class:
self.credits = credits self.credits = credits
self.level = level self.level = level
self.lab = lab self.lab = lab
# data is a list of two html tables
@classmethod @classmethod
def scrape(cls,data):
info,times = data
def scrape(cls, info: bs4.element.Tag, times: bs4.element.Tag):
# info # info
title,abrv,session = info.find('caption').text.split(' - ')
title, abrv, session = info.find("caption").text.split(" - ")
session = int(session) session = int(session)
rows = info.find_all('tr')
rows = info.find_all("tr")
params = {} params = {}
for row in rows: for row in rows:
name = row.find('th').text.rstrip(':')
data = re.sub(r'^ +|[\n\r\t]','',row.find('td').text)
name = row.find("th").text.rstrip(":")
data = re.sub(r"^ +|[\n\r\t]", "", row.find("td").text)
if name == 'Status':
type,date = data.split(' on ')
type = type.replace('*','')
if name == "Status":
type, date = data.split(" on ")
type = type.replace("*", "")
registration_date = dateparse(date) registration_date = dateparse(date)
else: else:
if name in simp_exceptions: if name in simp_exceptions:
name = name.lower().replace(' ','_')
name = name.lower().replace(" ", "_")
else: else:
name = name.lower().split(' ')[-1]
if name != 'instructor':
name = name.lower().split(" ")[-1]
if name != "instructor":
data = data.lower() data = data.lower()
try: try:
data = int(re.sub(r'\.\d+','',data))
data = int(re.sub(r"\.\d+", "", data))
except: except:
pass pass
params[name] = data params[name] = data
# time # time
headers,*data = times.find_all('tr')
headers = list(header.text.lower() for header in headers.find_all('th'))
headers, *data = times.find_all("tr")
headers = list(header.text.lower() for header in headers.find_all("th"))
if len(data) > 1: if len(data) > 1:
data, lab = map(lambda row: parse_horz_row(headers, row), data[:2]) data, lab = map(lambda row: parse_horz_row(headers, row), data[:2])
lab.update(params) lab.update(params)
@@ -132,26 +150,31 @@ class Class:
params.update(data) params.update(data)
return Class(title, abrv, session, lab=lab, **params) return Class(title, abrv, session, lab=lab, **params)
def __repr__(self): def __repr__(self):
return '{} on {}'.format(self.title,''.join(days[i] for i in self.days))
return "{} on {}".format(self.title, "".join(days[i] for i in self.days))
@property @property
def length(self): def length(self):
return datetime.timedelta(seconds = sub(
return datetime.timedelta(
seconds=sub(
seconds_from_midnight(self.time_range[1]), seconds_from_midnight(self.time_range[1]),
seconds_from_midnight(self.time_range[0]), seconds_from_midnight(self.time_range[0]),
))
)
)
def get_classes(page): def get_classes(page):
if not isinstance(page, BS): if not isinstance(page, BS):
page = BS(page,'lxml')
tables = page.find_all('table',attrs= {'class':'datadisplaytable'})
page = BS(page, "lxml")
tables = page.find_all("table", attrs={"class": "datadisplaytable"})
groups = ((tables[i], tables[i + 1]) for i in range(0, len(tables), 2)) groups = ((tables[i], tables[i + 1]) for i in range(0, len(tables), 2))
return map(Class.scrape,groups)
return itertools.starmap(Class.scrape, groups)
if __name__ == "__main__": if __name__ == "__main__":
with open('schedule.html') as file:
page = BS(file.read(),'lxml')
with open("schedule.html") as file:
page = BS(file.read(), "lxml")
classes = list(get_classes(page)) classes = list(get_classes(page))
for _class in classes: for _class in classes:
print(repr(_class), _class.date_range) print(repr(_class), _class.date_range)
Loading…
Cancel
Save