Browse Source

Added type annotations to functions

module
Raphael Roberts 6 years ago
parent
commit
0782be4c49
  1. 2
      .gitignore
  2. 175
      scraper.py

2
.gitignore

@ -3,3 +3,5 @@
__pycache__ __pycache__
/api_info /api_info
*.html *.html
.dir-locals.el

175
scraper.py

@ -1,61 +1,79 @@
from bs4 import BeautifulSoup as BS
import datetime import datetime
import re
import itertools
from operator import sub from operator import sub
import re
import bs4
from bs4 import BeautifulSoup as BS
def dateparse(datetime_str: str) -> datetime.datetime:
    """Parse a date string or a time-of-day string into a datetime.

    Two formats are tried in order:

    * ``"%b %d, %Y"`` (e.g. ``"Aug 26, 2019"``) -- the time part of the
      result is midnight.
    * ``"%I:%M %p"`` (e.g. ``"9:30 am"``) -- the date part of the result is
      strptime's default, 1900-01-01.

    Raises:
        ValueError: if the string matches neither format.
    """
    date_format = "%b %d, %Y"
    time_format = "%I:%M %p"
    try:
        return datetime.datetime.strptime(datetime_str, date_format)
    except ValueError:
        # Not a calendar date; fall back to a 12-hour clock time.
        return datetime.datetime.strptime(datetime_str, time_format)
# Day-letter lookup table. The index follows (date.weekday() + 1) % 7, i.e.
# 0 = Sunday ... 6 = Saturday; the two None slots (Sunday, Saturday) have no
# letter code.
days = [None, "M", "T", "W", "R", "F", None]
# Row labels whose parameter name is the whole label, lower-cased with spaces
# turned into underscores, instead of just the label's last word.
simp_exceptions = ["Grade Mode"]
def datetime2date_time(dtime: datetime.datetime, mode):
    """Project a datetime onto either its date or its time-of-day part.

    Args:
        dtime: the datetime to convert.
        mode: ``"date"`` to get a ``datetime.date`` (year, month, day) or
            ``"time"`` to get a ``datetime.time`` (hour, minute, second;
            microseconds and tzinfo are dropped).

    Raises:
        ValueError: if ``mode`` is neither ``"date"`` nor ``"time"``.
    """
    if mode == "date":
        return datetime.date(dtime.year, dtime.month, dtime.day)
    if mode == "time":
        return datetime.time(dtime.hour, dtime.minute, dtime.second)
    # Fail loudly instead of falling off the end and returning None, which
    # would only surface later as an unrelated AttributeError/TypeError.
    raise ValueError("mode must be 'date' or 'time', got {!r}".format(mode))
def seconds_from_midnight(t: datetime.time) -> int:
    """Return the whole seconds elapsed since 00:00:00 for time ``t``.

    Only the hour, minute and second fields are used; microseconds are
    ignored.
    """
    return t.hour * 60 ** 2 + t.minute * 60 + t.second
def parse_horz_row(headers, row: "bs4.element.Tag"):
    """Parse one row of a meeting-times table into a dict of schedule fields.

    Args:
        headers: column names, paired positionally with the row's ``<td>``
            cells. The columns ``"time"``, ``"days"``, ``"date range"`` and
            ``"where"`` are read.
        row: the table row; only ``row.find_all("td")`` and each cell's
            ``.text`` are used.

    Returns:
        A dict with the keys:

        * ``"time_range"`` -- ``(start, end)`` as ``datetime.time`` objects,
          or ``None`` when the time cell is ``"TBA"``.
        * ``"days"`` -- sorted indices into the module-level ``days`` list.
        * ``"date_range"`` -- ``(first, last)`` as ``datetime.date`` objects.
        * ``"location"`` -- the text of the ``"where"`` cell.

    Raises:
        KeyError: if there is no ``"time"`` column (the row is printed first
            to help diagnose the unexpected table layout).

    NOTE(review): the nesting below was reconstructed from a listing that had
    lost its indentation -- confirm against the repository copy.
    """
    cells = (col.text for col in row.find_all("td"))
    time_data = dict(zip(headers, cells))
    ret = {}

    try:
        time_data["time"]
    except KeyError:
        # Dump the offending row so the unexpected layout can be inspected.
        print(row)
        raise

    if time_data["time"] == "TBA":
        ret["time_range"] = None
    else:
        start_time, end_time = map(dateparse, time_data["time"].split(" - "))
        ret["time_range"] = (
            datetime2date_time(start_time, "time"),
            datetime2date_time(end_time, "time"),
        )

    first_day, last_day = map(dateparse, time_data["date range"].split(" - "))

    # Keep only recognised day letters, then map each one to its index in
    # ``days``.
    day_letters = "".join(filter(bool, days))
    time_data["days"] = re.sub(
        "[^{}]".format(day_letters), "", time_data["days"]
    )
    ret["days"] = sorted(days.index(letter) for letter in time_data["days"])

    if ret["days"]:
        # Shift the range start onto the weekday of the earliest listed day.
        # ``(weekday() + 1) % 7`` converts Python's Monday=0 numbering to the
        # Sunday=0 numbering used by ``days``.
        # NOTE(review): when the range starts later in the week than the
        # earliest listed day the offset is negative, so the start moves to
        # *before* the listed range start -- confirm this is intended.
        class_start = (first_day.weekday() + 1) % 7
        earliest_day = ret["days"][0]
        first_day += datetime.timedelta(days=(earliest_day - class_start))

    ret["date_range"] = (
        datetime2date_time(first_day, "date"),
        datetime2date_time(last_day, "date"),
    )
    ret["location"] = time_data["where"]
    return ret
class Class: class Class:
def __init__(self, title, abrv, session,
def __init__(
self,
title,
abrv,
session,
term, term,
crn, crn,
instructor, instructor,
@ -67,19 +85,20 @@ class Class:
date_range, date_range,
days, days,
location, location,
lab=None):
lab=None,
):
#name
# name
self.title = title self.title = title
self.abrv = abrv self.abrv = abrv
#time
# time
self.date_range = date_range self.date_range = date_range
self.days = days self.days = days
self.time_range = time_range self.time_range = time_range
#location
# location
self.location = location self.location = location
self.campus = campus self.campus = campus
#other
# other
self.session = session self.session = session
self.term = term self.term = term
self.crn = crn self.crn = crn
@ -88,70 +107,74 @@ class Class:
self.credits = credits self.credits = credits
self.level = level self.level = level
self.lab = lab self.lab = lab
@classmethod
def scrape(cls, info: "bs4.element.Tag", times: "bs4.element.Tag"):
    """Build a class entry (and its optional lab) from a pair of HTML tables.

    Args:
        info: the details table. Its ``<caption>`` must read
            ``"<title> - <abrv> - <session>"``; each ``<tr>`` supplies a
            ``<th>`` label and a ``<td>`` value.
        times: the meeting-times table. Its first ``<tr>`` holds the ``<th>``
            column headers; each following row is handed to
            ``parse_horz_row``.

    Returns:
        A new instance. When ``times`` has more than one data row, the
        second row becomes a separate "<title> - Lab" instance stored in
        ``lab``; rows beyond the second are ignored.

    NOTE(review): the nesting below was reconstructed from a listing that had
    lost its indentation -- confirm against the repository copy.
    """
    # info
    title, abrv, session = info.find("caption").text.split(" - ")
    session = int(session)

    params = {}
    for row in info.find_all("tr"):
        name = row.find("th").text.rstrip(":")
        value = re.sub(r"^ +|[\n\r\t]", "", row.find("td").text)

        if name == "Status":
            # NOTE(review): the status text and registration date are parsed
            # but never stored in ``params``; the parse is kept so malformed
            # rows still raise as before.
            _status, registered_on = value.split(" on ")
            dateparse(registered_on)
            continue

        if name in simp_exceptions:
            name = name.lower().replace(" ", "_")
        else:
            # e.g. a two-word label is reduced to its last word.
            name = name.lower().split(" ")[-1]
        if name != "instructor":
            value = value.lower()
        try:
            # Numeric values such as "3.000" are stored as ints (fraction
            # dropped); everything else stays a string.
            value = int(re.sub(r"\.\d+", "", value))
        except ValueError:
            pass
        params[name] = value

    # time
    header_row, *rows = times.find_all("tr")
    headers = [header.text.lower() for header in header_row.find_all("th")]
    if len(rows) > 1:
        main, lab_fields = (parse_horz_row(headers, row) for row in rows[:2])
        lab_fields.update(params)
        lab = cls(title + " - Lab", abrv, session, **lab_fields)
    else:
        lab = None
        main = parse_horz_row(headers, rows[0])
    params.update(main)
    return cls(title, abrv, session, lab=lab, **params)
def __repr__(self):
    """Return ``"<title> on <day letters>"``, e.g. ``"Calculus I on MWF"``."""
    # ``self.days`` holds indices into the module-level ``days`` list.
    day_letters = "".join(days[i] for i in self.days)
    return "{} on {}".format(self.title, day_letters)
@property
def length(self):
    """Duration of one meeting as a ``datetime.timedelta``.

    Computed as end minus start of ``time_range`` at whole-second
    resolution. Returns ``None`` when ``time_range`` is ``None`` (meeting
    time "TBA"), since there is no duration to report.
    """
    if self.time_range is None:
        return None
    start, end = self.time_range
    return datetime.timedelta(
        seconds=seconds_from_midnight(end) - seconds_from_midnight(start)
    )
def get_classes(page):
    """Lazily yield a ``Class`` for every pair of schedule tables in ``page``.

    Args:
        page: an already-parsed ``BeautifulSoup`` document, or raw markup
            that is parsed here with the ``lxml`` parser.

    Returns:
        An iterator of ``Class`` objects. Tables with the CSS class
        ``datadisplaytable`` are consumed two at a time and passed to
        ``Class.scrape`` as ``(info, times)``. Nothing is scraped until the
        iterator is consumed; an odd number of tables raises ``IndexError``
        when the unpaired table is reached.
    """
    if not isinstance(page, BS):
        page = BS(page, "lxml")
    tables = page.find_all("table", attrs={"class": "datadisplaytable"})
    # Consecutive tables belong together: (details table, meeting-times table).
    groups = ((tables[i], tables[i + 1]) for i in range(0, len(tables), 2))
    return itertools.starmap(Class.scrape, groups)
if __name__ == "__main__":
    # Manual smoke test: parse a saved schedule page from the working
    # directory and list every class with its date range.
    with open("schedule.html") as file:
        page = BS(file.read(), "lxml")
    classes = list(get_classes(page))
    for _class in classes:
        print(repr(_class), _class.date_range)
Loading…
Cancel
Save