|
|
from bs4 import BeautifulSoup as BS
import datetime
import re
from operator import sub


def dateparse(datetime_str):
    """Parse a calendar date or a 12-hour clock time into a ``datetime``.

    Two layouts are tried in order: ``'%b %d, %Y'`` (e.g. ``'Jan 05, 2015'``)
    and ``'%I:%M %p'`` (e.g. ``'09:30 am'``). Surrounding whitespace is
    ignored.

    Args:
        datetime_str: Text holding either a date or a time of day.

    Returns:
        A ``datetime.datetime``. When only a time was given, the date part is
        ``strptime``'s default (1900-01-01).

    Raises:
        ValueError: If the text matches neither layout.
    """
    date_format = '%b %d, %Y'
    time_format = '%I:%M %p'
    text = datetime_str.strip()
    try:
        return datetime.datetime.strptime(text, date_format)
    except ValueError:
        return datetime.datetime.strptime(text, time_format)


# Day letters positioned so that a letter's list index is its day number;
# the two ``None`` slots have no letter assigned.
days = [None, 'M', 'T', 'W', 'R', 'F', None]

# Info-row labels whose attribute name keeps every word (joined by "_")
# instead of only the last word.
simp_exceptions = ['Grade Mode']


def datetime2date_time(dtime, mode):
    """Reduce a ``datetime`` to just its date or just its time of day.

    Args:
        dtime: The ``datetime.datetime`` to reduce.
        mode: ``'date'`` for a ``datetime.date`` or ``'time'`` for a
            ``datetime.time`` (hour, minute and second only).

    Returns:
        The ``datetime.date`` or ``datetime.time`` part of ``dtime``.

    Raises:
        ValueError: If ``mode`` is neither ``'date'`` nor ``'time'``
            (previously this silently returned ``None``).
    """
    if mode == 'date':
        return datetime.date(dtime.year, dtime.month, dtime.day)
    if mode == 'time':
        return datetime.time(dtime.hour, dtime.minute, dtime.second)
    raise ValueError("mode must be 'date' or 'time', got {!r}".format(mode))


def seconds_from_midnight(t):
    """Return the number of whole seconds between midnight and time ``t``."""
    return t.hour * 60 ** 2 + t.minute * 60 + t.second


def parse_horz_row(headers, row):
    """Convert one row of a meeting-times table into a dict.

    Args:
        headers: Iterable of lower-cased column titles, in column order. The
            titles ``'time'``, ``'date range'``, ``'days'`` and ``'where'``
            must be present.
        row: A table-row element exposing ``find_all('td')`` whose cells
            expose ``.text``.

    Returns:
        A dict with the keys:
            ``time_range``: ``(start, end)`` as ``datetime.time`` objects, or
                ``None`` when the time cell reads ``'TBA'``.
            ``date_range``: ``(start, end)`` as ``datetime.date`` objects.
            ``days``: list of indexes into the module-level ``days`` list,
                one per recognised day letter.
            ``location``: the text of the ``'where'`` cell.

    Raises:
        KeyError: If a required column title is missing from ``headers``.
        ValueError: If a time or date cell cannot be parsed.
    """
    cells = (col.text for col in row.find_all('td'))
    time_data = dict(zip(headers, cells))
    ret = {}

    if time_data['time'].strip() == 'TBA':
        ret['time_range'] = None
    else:
        start, end = map(dateparse, time_data['time'].split(' - '))
        ret['time_range'] = (
            datetime2date_time(start, 'time'),
            datetime2date_time(end, 'time'),
        )

    start, end = map(dateparse, time_data['date range'].split(' - '))
    ret['date_range'] = (
        datetime2date_time(start, 'date'),
        datetime2date_time(end, 'date'),
    )

    # Drop every character that is not a known day letter before indexing.
    day_letters = ''.join(filter(bool, days))
    letters = re.sub('[^{}]'.format(day_letters), '', time_data['days'])
    ret['days'] = [days.index(letter) for letter in letters]

    ret['location'] = time_data['where']
    return ret


class Class:
    """One course entry scraped from a schedule page.

    Besides the attributes set by ``__init__``, ``scrape`` adds one attribute
    per row of the info table (named after the row label), plus ``type`` and
    ``registration_date`` taken from the ``Status`` row.
    """

    def __init__(self, title, abrv, session, days, location, time_range,
                 date_range, lab):
        """Store the supplied fields on the instance.

        Args:
            title: Course title.
            abrv: Course abbreviation.
            session: Session identifier.
            days: List of day numbers (indexes into the module-level ``days``).
            location: Meeting location text.
            time_range: ``(start, end)`` ``datetime.time`` pair, or ``None``.
            date_range: ``(start, end)`` ``datetime.date`` pair.
            lab: Parsed meeting dict for a second meeting row, or ``None``.
        """
        self.title = title
        self.abrv = abrv
        self.session = session
        self.days = days
        self.location = location
        self.time_range = time_range
        # Previously hard-coded to None, which discarded the argument.
        self.lab = lab
        self.date_range = date_range

    # Kept reachable as ``Class.parse_horz_row`` as well as at module level,
    # because the helper sits among this class's methods in the source.
    parse_horz_row = staticmethod(parse_horz_row)

    @classmethod
    def scrape(cls, data):
        """Build a ``Class`` from a pair of HTML tables.

        This is a classmethod so that ``Class.scrape(pair)`` works as a
        one-argument factory, which is how ``map(Class.scrape, groups)``
        invokes it.

        Args:
            data: A pair ``(info, times)`` of table elements. ``info`` has a
                caption of the form ``'title - abrv - session'`` and rows
                with a ``th`` label and a ``td`` value. ``times`` has a
                header row followed by meeting rows (see ``parse_horz_row``).

        Returns:
            A populated ``Class`` instance.

        Raises:
            ValueError: If the caption does not have three ``' - '``-separated
                parts, the session is not an integer, or a date/time cell
                cannot be parsed.
        """
        info, times = data

        # Split from the right so a title that itself contains " - " still
        # yields exactly three parts.
        caption = info.find('caption').text
        title, abrv, session = (
            part.strip() for part in caption.rsplit(' - ', 2)
        )

        # Materialise the headers: a generator would be exhausted after the
        # first row and leave the lab row with no column names.
        header_row, *meeting_rows = times.find_all('tr')
        headers = [cell.text.lower() for cell in header_row.find_all('th')]
        parsed = [parse_horz_row(headers, row) for row in meeting_rows[:2]]
        if parsed:
            schedule = parsed[0]
        else:
            schedule = dict.fromkeys(
                ('days', 'location', 'time_range', 'date_range')
            )
        lab = parsed[1] if len(parsed) > 1 else None

        self = cls(
            title,
            abrv,
            int(session),
            schedule['days'],
            schedule['location'],
            schedule['time_range'],
            schedule['date_range'],
            lab,
        )

        for row in info.find_all('tr'):
            label_cell = row.find('th')
            value_cell = row.find('td')
            if label_cell is None or value_cell is None:
                # Rows without both a label and a value carry no attribute.
                continue
            name = label_cell.text.rstrip(':')
            value = re.sub(r'^ +|[\n\r\t]', '', value_cell.text)

            if name == 'Status':
                # "<status> on <date>"; the date is the final part.
                status, registered = value.rsplit(' on ', 1)
                self.type = status.replace('*', '')
                self.registration_date = dateparse(registered)
                continue

            if name in simp_exceptions:
                key = name.lower().replace(' ', '_')
            else:
                key = name.lower().split(' ')[-1]
            if key != 'instructor':
                value = value.lower()
            try:
                # Values such as "3.000" are stored as the integer 3.
                value = int(re.sub(r'\.\d+', '', value))
            except ValueError:
                pass  # not numeric: keep the text as-is
            # Written through the instance dict, as the original did.
            vars(self)[key] = value

        return self

    @property
    def length(self):
        """Duration of one meeting as a ``timedelta``.

        Returns ``None`` when ``time_range`` is ``None`` (time listed as TBA).
        """
        if self.time_range is None:
            return None
        start, end = self.time_range
        return datetime.timedelta(seconds=sub(
            seconds_from_midnight(end),
            seconds_from_midnight(start),
        ))
def get_classes(page):
    """Pair up the schedule tables on a page and feed each pair to ``Class.scrape``.

    Args:
        page: Either an already-parsed ``BS`` document or raw markup, which
            is parsed with the ``'lxml'`` parser.

    Returns:
        A lazy ``map`` object; each item is the result of calling
        ``Class.scrape`` on one ``(table, next_table)`` pair. Nothing is
        scraped until the result is iterated.
    """
    soup = page if isinstance(page, BS) else BS(page, 'lxml')
    tables = soup.find_all('table', attrs={'class': 'datadisplaytable'})

    def consecutive_pairs():
        # Tables at even positions are paired with the table that follows.
        for start in range(0, len(tables), 2):
            yield tables[start], tables[start + 1]

    return map(Class.scrape, consecutive_pairs())
if __name__ == "__main__":
    # Script entry point: read the saved schedule page from the working
    # directory and parse it with the lxml parser.
    with open('schedule.html') as file:
        page = BS(file.read(), 'lxml')

    # Unpacking consumes the whole iterable returned by get_classes; it
    # raises ValueError if the page yields no classes at all.
    class1, *classes = get_classes(page)
|