You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
110 lines
3.9 KiB
110 lines
3.9 KiB
from bs4 import BeautifulSoup as BS
|
|
import datetime
|
|
import re
|
|
from operator import sub
|
|
def dateparse(datetime_str):
    """Parse ``datetime_str`` as a date, falling back to a clock time.

    The string is first matched against ``'%b %d, %Y'``; if that fails with
    ``ValueError`` it is matched against ``'%I:%M %p'`` instead.  A
    ``ValueError`` from the second attempt propagates to the caller.

    Returns a ``datetime.datetime``.
    """
    date_format = '%b %d, %Y'
    time_format = '%I:%M %p'
    strptime = datetime.datetime.strptime
    try:
        parsed = strptime(datetime_str, date_format)
    except ValueError:
        # Not a calendar date, so treat it as a 12-hour clock time.
        parsed = strptime(datetime_str, time_format)
    return parsed
|
|
# Single-letter day codes stored at indexes 1-5, so ``days.index(code)`` maps
# a code to its position.  The two end slots hold None and carry no code
# (presumably the weekend days -- confirm against the page format).
days = [None, *'MTWRF', None]
|
|
# Info-table row names that are turned into attribute names by lower-casing
# and replacing spaces with underscores, instead of keeping only their last word.
simp_exceptions = ["Grade Mode"]
|
|
def datetime2date_time(dtime, mode):
    """Extract the date or the time-of-day part of ``dtime``.

    ``mode`` selects the result: ``'date'`` gives a ``datetime.date`` built
    from year/month/day, ``'time'`` gives a ``datetime.time`` built from
    hour/minute/second (microseconds are not carried over).  Any other
    ``mode`` yields ``None``.
    """
    if mode == 'time':
        return datetime.time(dtime.hour, dtime.minute, dtime.second)
    if mode == 'date':
        return datetime.date(dtime.year, dtime.month, dtime.day)
    # Unrecognised modes fall through without a value.
    return None
|
|
def seconds_from_midnight(t):
    """Return the whole seconds elapsed since 00:00:00 for time-like ``t``.

    Only ``t.hour``, ``t.minute`` and ``t.second`` are read.
    """
    total_minutes = t.hour * 60 + t.minute
    return total_minutes * 60 + t.second
|
|
class Class:
    """One course entry built from a pair of schedule HTML tables.

    ``__init__`` stores ``title``, ``session``, ``days``, ``location``,
    ``time_range`` and ``date_range`` and sets ``lab`` to ``None``.
    ``scrape`` overwrites those fields from the tables and additionally sets
    ``abrv``, ``type``, ``registration_date`` and one attribute per remaining
    row of the info table.
    """

    def __init__(self, title, session, days, location, time_range, date_range):
        """Store the given schedule fields; ``lab`` always starts as ``None``."""
        self.title = title
        self.session = session
        self.days = days
        self.location = location
        self.time_range = time_range
        self.lab = None
        self.date_range = date_range

    def scrape(self, data):
        """Fill this instance from ``data``, a pair ``(info, times)`` of tables.

        Both tables must expose the BeautifulSoup element API used here
        (``find``, ``find_all``, ``.text``).

        Raises ``ValueError`` if the caption does not split into three
        `` - ``-separated parts, if the session is not an integer, if the
        times table has no rows, or if a date/time string cannot be parsed.
        """
        info, times = data

        # The caption is expected to read "<title> - <abbreviation> - <session>".
        # Splitting from the right keeps a title that itself contains " - "
        # from breaking the three-way unpack.
        self.title, self.abrv, session = info.find('caption').text.rsplit(' - ', 2)
        self.session = int(session)
        self.lab = None

        for row in info.find_all('tr'):
            name = row.find('th').text.rstrip(':')
            # Drop leading spaces and every newline / carriage return / tab.
            value = re.sub(r'^ +|[\n\r\t]', '', row.find('td').text)

            if name == 'Status':
                status, date = value.split(' on ')
                self.type = status.replace('*', '')
                self.registration_date = dateparse(date)
                continue

            if name in simp_exceptions:
                name = name.lower().replace(' ', '_')
            else:
                # Keep only the last word of the row name.
                name = name.lower().split(' ')[-1]
            if name != 'instructor':
                value = value.lower()
            try:
                # Numeric values lose any ".digits" part and become ints.
                value = int(re.sub(r'\.\d+', '', value))
            except ValueError:
                # Not numeric: keep the text as it is.
                pass
            self.__dict__[name] = value

        header_row, *rows = times.find_all('tr')
        # A list rather than a generator: the same headers are zipped against
        # both the main row and the optional lab row.
        headers = [th.text.lower() for th in header_row.find_all('th')]

        # NOTE(review): the previous code parsed these rows but never stored
        # the result; assigning them to the instance is the presumed intent.
        if rows:
            main = self.parse_horz_row(
                headers, (td.text for td in rows[0].find_all('td')))
            self.time_range = main['time_range']
            self.date_range = main['date_range']
            self.days = main['days']
            self.location = main['location']
        if len(rows) > 1:
            # A second row is treated as the lab meeting and kept as a dict
            # with the same keys as the main row.
            self.lab = self.parse_horz_row(
                headers, (td.text for td in rows[1].find_all('td')))

    @staticmethod
    def parse_horz_row(headers, row):
        """Turn one meeting-time row into a dict of schedule fields.

        ``headers`` is an iterable of lower-cased column names and ``row`` the
        cell texts in the same order; the columns ``'time'``, ``'date range'``,
        ``'days'`` and ``'where'`` must be present.

        Returns a dict with keys ``'time_range'`` (a pair of ``datetime.time``
        or ``None`` when the time is ``'TBA'``), ``'date_range'`` (a pair of
        ``datetime.date``), ``'days'`` (a list of indexes into the module-level
        ``days`` list) and ``'location'``.
        """
        time_data = dict(zip(headers, row))
        ret = {}

        if time_data['time'] == 'TBA':
            ret['time_range'] = None
        else:
            start, end = map(dateparse, time_data['time'].split(' - '))
            ret['time_range'] = (
                datetime2date_time(start, 'time'),
                datetime2date_time(end, 'time'),
            )

        start, end = map(dateparse, time_data['date range'].split(' - '))
        ret['date_range'] = (
            datetime2date_time(start, 'date'),
            datetime2date_time(end, 'date'),
        )

        # Strip everything that is not a known day code, then map each code
        # to its index in ``days``.
        day_codes = ''.join(filter(bool, days))
        meeting_days = re.sub('[^{}]'.format(day_codes), '', time_data['days'])
        ret['days'] = [days.index(code) for code in meeting_days]

        ret['location'] = time_data['where']
        return ret

    @property
    def length(self):
        """Meeting duration as a ``datetime.timedelta``.

        Returns ``None`` when ``time_range`` is ``None`` (for example a 'TBA'
        time) instead of failing on the subscript.
        """
        if self.time_range is None:
            return None
        return datetime.timedelta(
            seconds=seconds_from_midnight(self.time_range[1])
            - seconds_from_midnight(self.time_range[0]))
|
|
|
|
|
|
def get_classes(page):
    """Return a list with one scraped ``Class`` per table pair in ``page``.

    ``page`` is either an already-parsed ``BS`` document or raw markup, which
    is parsed with the ``'lxml'`` parser.  Tables carrying the class
    ``datadisplaytable`` are taken two at a time and handed to
    ``Class.scrape`` as one pair.
    """
    if not isinstance(page, BS):
        page = BS(page, 'lxml')
    tables = page.find_all('table', attrs={'class': 'datadisplaytable'})

    classes = []
    # zip over the even/odd slices pairs the tables up and simply drops an
    # unpaired trailing table rather than indexing past the end of the list.
    for group in zip(tables[::2], tables[1::2]):
        # scrape is an instance method, so it needs a real instance to fill;
        # every field starts empty and is populated by scrape.
        course = Class(None, None, None, None, None, None)
        course.scrape(group)
        classes.append(course)
    return classes
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse the saved schedule page and split the
    # resulting classes into the first one and the rest.
    with open('schedule.html') as schedule_file:
        markup = schedule_file.read()
    page = BS(markup, 'lxml')
    class1, *classes = get_classes(page)
|