from bs4 import BeautifulSoup as BS
import datetime
import re
from operator import sub

# strptime formats tried, in this order, by dateparse().
DATE_FORMAT = '%b %d, %Y'
TIME_FORMAT = '%I:%M %p'


def dateparse(datetime_str: str) -> datetime.datetime:
    """Parse *datetime_str* as a date, falling back to a time of day.

    The string is first matched against ``DATE_FORMAT``; if that fails it is
    matched against ``TIME_FORMAT``.

    Raises:
        ValueError: if the string matches neither format.
    """
    try:
        return datetime.datetime.strptime(datetime_str, DATE_FORMAT)
    except ValueError:
        return datetime.datetime.strptime(datetime_str, TIME_FORMAT)


# Day letters indexed Sunday=0 .. Saturday=6; the weekend slots have no letter.
days = [None, 'M', 'T', 'W', 'R', 'F', None]

# Info-table row labels whose whole label (not just its last word) becomes
# the parameter name.
simp_exceptions = ['Grade Mode']


def datetime2date_time(dtime, mode):
    """Reduce a datetime to its date or its time-of-day part.

    Args:
        dtime: object exposing year/month/day (mode ``'date'``) or
            hour/minute/second (mode ``'time'``) attributes.
        mode: ``'date'`` or ``'time'``.

    Returns:
        A ``datetime.date`` or ``datetime.time`` (seconds resolution), or
        ``None`` when *mode* is neither of the two supported values.
    """
    if mode == 'date':
        return datetime.date(dtime.year, dtime.month, dtime.day)
    if mode == 'time':
        return datetime.time(dtime.hour, dtime.minute, dtime.second)
    return None


def seconds_from_midnight(t: datetime.time) -> int:
    """Return the number of whole seconds between midnight and *t*."""
    return t.hour * 60 ** 2 + t.minute * 60 + t.second


def parse_horz_row(headers, row):
    """Parse one row of the meeting-times table into ``Class`` keyword args.

    Args:
        headers: lower-cased column titles, in column order.
        row: a table-row element whose ``find_all('td')`` yields cells with a
            ``.text`` attribute.

    Returns:
        A dict with the keys ``time_range`` (pair of ``datetime.time`` or
        ``None`` when the time is 'TBA'), ``days`` (sorted indices into the
        module-level ``days`` list), ``date_range`` (pair of
        ``datetime.date``) and ``location``.

    Raises:
        KeyError: if a required column ('time', 'date range', 'days',
            'where') is missing from the row.
        ValueError: if a time or date cell cannot be parsed.
    """
    cells = (col.text for col in row.find_all('td'))
    time_data = dict(zip(headers, cells))
    if 'time' not in time_data:
        # Carry the offending row in the exception instead of printing it.
        raise KeyError("'time' column missing from row: {}".format(row))

    ret = {}
    if time_data['time'] == 'TBA':
        ret['time_range'] = None
    else:
        start_time, end_time = map(dateparse, time_data['time'].split(' - '))
        ret['time_range'] = (
            datetime2date_time(start_time, 'time'),
            datetime2date_time(end_time, 'time'),
        )

    first_day, last_day = map(dateparse, time_data['date range'].split(' - '))

    # Keep only recognised day letters; anything else in the cell is ignored.
    day_index = {code: i for i, code in enumerate(days) if code}
    ret['days'] = sorted(
        day_index[ch] for ch in time_data['days'] if ch in day_index
    )

    if ret['days']:
        # Convert Python's Monday=0 weekday to the Sunday=0 indexing of `days`.
        range_start = (first_day.weekday() + 1) % 7
        # Move forward to the first meeting day on or after the range start.
        # A plain `first_index - range_start` goes negative (a date *before*
        # the range begins) whenever the range starts mid-week.
        offset = min((day - range_start) % 7 for day in ret['days'])
        first_day += datetime.timedelta(days=offset)

    ret['date_range'] = (
        datetime2date_time(first_day, 'date'),
        datetime2date_time(last_day, 'date'),
    )
    ret['location'] = time_data['where']
    return ret


class Class:
    """One scheduled class, optionally carrying an attached lab ``Class``."""

    def __init__(self, title, abrv, session, term, crn, instructor,
                 grade_mode, credits, level, campus, time_range, date_range,
                 days, location, lab=None):
        """Store the scraped attributes unchanged.

        Args:
            title, abrv, session: parts of the info-table caption.
            term, crn, instructor, grade_mode, credits, level, campus:
                values taken from the info-table rows.
            time_range: ``(start, end)`` pair of ``datetime.time``, or
                ``None`` when the meeting time is 'TBA'.
            date_range: ``(first, last)`` pair of ``datetime.date``.
            days: sorted indices into the module-level ``days`` list.
            location: content of the 'where' column.
            lab: optional ``Class`` describing the associated lab.
        """
        # name
        self.title = title
        self.abrv = abrv
        # time
        self.date_range = date_range
        self.days = days
        self.time_range = time_range
        # location
        self.location = location
        self.campus = campus
        # other
        self.session = session
        self.term = term
        self.crn = crn
        self.instructor = instructor
        self.grade_mode = grade_mode
        self.credits = credits
        self.level = level
        self.lab = lab

    @classmethod
    def scrape(cls, data):
        """Build a class (and its lab, if any) from a pair of HTML tables.

        Args:
            data: a two-item sequence ``(info, times)``: the table with the
                caption and labelled rows, and the meeting-times table.

        Returns:
            A new instance; when the times table has more than one data row,
            the second row becomes a nested instance stored in ``lab``.
        """
        info, times = data

        # Caption is "<title> - <abrv> - <session>"; split from the right so
        # a title that itself contains " - " stays in one piece.
        title, abrv, session = info.find('caption').text.rsplit(' - ', 2)
        session = int(session)

        params = {}
        for row in info.find_all('tr'):
            name = row.find('th').text.rstrip(':')
            if name == 'Status':
                # Registration status is not a constructor argument.
                continue
            value = re.sub(r'^ +|[\n\r\t]', '', row.find('td').text)
            if name in simp_exceptions:
                name = name.lower().replace(' ', '_')
            else:
                name = name.lower().split(' ')[-1]
            if name != 'instructor':
                value = value.lower()
            try:
                # Numeric cells become ints after dropping a fractional part.
                value = int(re.sub(r'\.\d+', '', value))
            except ValueError:
                pass  # not numeric: keep the text
            params[name] = value

        header_row, *time_rows = times.find_all('tr')
        headers = [th.text.lower() for th in header_row.find_all('th')]
        if len(time_rows) > 1:
            # First row is the class itself, second row is its lab.
            meeting, lab_info = (
                parse_horz_row(headers, time_row)
                for time_row in time_rows[:2]
            )
            lab_info.update(params)
            lab = cls(title + " - Lab", abrv, session, **lab_info)
        else:
            lab = None
            meeting = parse_horz_row(headers, time_rows[0])
        params.update(meeting)
        return cls(title, abrv, session, lab=lab, **params)

    def __repr__(self):
        return '{} on {}'.format(
            self.title, ''.join(days[i] for i in self.days))

    @property
    def length(self):
        """Duration of one meeting as a ``timedelta``; ``None`` when TBA."""
        if self.time_range is None:
            return None
        start, end = self.time_range
        return datetime.timedelta(seconds=sub(
            seconds_from_midnight(end),
            seconds_from_midnight(start),
        ))


def get_classes(page):
    """Lazily scrape every class from a schedule page.

    Args:
        page: a ``BeautifulSoup`` document, or markup to be parsed with the
            'lxml' parser.

    Returns:
        A lazy ``map`` of ``Class`` objects, one per consecutive pair of
        ``datadisplaytable`` tables. An odd number of tables raises
        ``IndexError`` when the unpaired table is reached during iteration.
    """
    if not isinstance(page, BS):
        page = BS(page, 'lxml')
    tables = page.find_all('table', attrs={'class': 'datadisplaytable'})
    groups = ((tables[i], tables[i + 1]) for i in range(0, len(tables), 2))
    return map(Class.scrape, groups)


if __name__ == "__main__":
    with open('schedule.html') as file:
        page = BS(file.read(), 'lxml')
    classes = list(get_classes(page))
    for _class in classes:
        print(repr(_class), _class.date_range)