"""Parse class entries out of an HTML schedule page with BeautifulSoup."""
import datetime
import re
from operator import sub

from bs4 import BeautifulSoup as BS


def dateparse(datetime_str):
    """Parse *datetime_str* as a date, falling back to a clock time.

    The string is first tried against ``'%b %d, %Y'``; if that fails it is
    tried against ``'%I:%M %p'``.

    Returns:
        A ``datetime.datetime``. For the time-only format the date part is
        whatever ``strptime`` fills in by default.

    Raises:
        ValueError: if the string matches neither format.
    """
    date_format = '%b %d, %Y'
    time_format = '%I:%M %p'
    try:
        return datetime.datetime.strptime(datetime_str, date_format)
    except ValueError:
        return datetime.datetime.strptime(datetime_str, time_format)


# Day letters; the position of a letter in this list is the number stored in
# ``Class.days``. The ``None`` slots at both ends never match a letter.
days = [None, 'M', 'T', 'W', 'R', 'F', None]

# Row labels whose attribute name keeps every word (joined with underscores)
# instead of being reduced to the last word of the label.
simp_exceptions = ['Grade Mode']


def datetime2date_time(dtime, mode):
    """Reduce a datetime to just its date or just its time.

    Args:
        dtime: object exposing year/month/day (for ``'date'``) or
            hour/minute/second (for ``'time'``).
        mode: ``'date'`` or ``'time'``.

    Returns:
        A ``datetime.date`` or ``datetime.time`` (seconds resolution), or
        ``None`` when *mode* is neither of the two supported values.
    """
    if mode == 'date':
        return datetime.date(dtime.year, dtime.month, dtime.day)
    if mode == 'time':
        return datetime.time(dtime.hour, dtime.minute, dtime.second)
    return None


def seconds_from_midnight(t):
    """Return the number of whole seconds between 00:00:00 and *t*."""
    return t.hour * 60 ** 2 + t.minute * 60 + t.second


class Class:
    """One class entry built from a pair of schedule tables."""

    def __init__(self, data):
        """Populate the instance from an ``(info, times)`` pair of tables.

        Args:
            data: two-item iterable of parsed table elements. ``info`` must
                have a caption of the form ``title - abbreviation - session``
                and rows made of a ``th`` label and a ``td`` value. ``times``
                must contain exactly two rows: a header row of ``th`` cells
                and a value row of ``td`` cells, with headers including
                ``time``, ``date range``, ``days`` and ``where``.

        Raises:
            ValueError: if the caption, status, time or date range text does
                not have the expected shape.
        """
        info, times = data

        # --- info table ---------------------------------------------------
        # Split from the right so a title that itself contains ' - ' still
        # yields exactly three parts.
        self.title, self.abrv, session = (
            info.find('caption').text.rsplit(' - ', 2)
        )
        self.session = int(session)

        for row in info.find_all('tr'):
            label = row.find('th').text.rstrip(':')
            # Drop leading spaces and every newline / carriage return / tab.
            value = re.sub(r'^ +|[\n\r\t]', '', row.find('td').text)

            if label == 'Status':
                # The date is whatever follows the last ' on '.
                status, registered = value.rsplit(' on ', 1)
                self.type = status.replace('*', '')
                self.registration_date = dateparse(registered)
                continue

            if label in simp_exceptions:
                attr = label.lower().replace(' ', '_')
            else:
                attr = label.lower().split(' ')[-1]

            if attr != 'instructor':
                value = value.lower()

            # Numeric values such as '3.000' become ints; anything that is
            # not a number is kept as text.
            try:
                value = int(re.sub(r'\.\d+', '', value))
            except ValueError:
                pass

            # Written straight into the instance dict (not via setattr) so a
            # row named like the ``length`` property cannot raise here.
            self.__dict__[attr] = value

        # --- times table --------------------------------------------------
        header_row, value_row = times.find_all('tr')
        headers = (cell.text.lower() for cell in header_row.find_all('th'))
        values = (cell.text for cell in value_row.find_all('td'))
        time_data = dict(zip(headers, values))

        if time_data['time'] == 'TBA':
            self.time_range = None
        else:
            start, end = map(dateparse, time_data['time'].split(' - '))
            self.time_range = (
                datetime2date_time(start, 'time'),
                datetime2date_time(end, 'time'),
            )

        start, end = map(dateparse, time_data['date range'].split(' - '))
        self.date_range = (
            datetime2date_time(start, 'date'),
            datetime2date_time(end, 'date'),
        )

        # Keep only the known day letters, then map each to its index.
        not_a_day = '[^{}]'.format(''.join(filter(bool, days)))
        day_letters = re.sub(not_a_day, '', time_data['days'])
        self.days = [days.index(letter) for letter in day_letters]

        self.location = time_data['where']

    @property
    def length(self):
        """Duration between start and end time as a ``timedelta``.

        Returns ``None`` when the time is 'TBA' (``time_range`` is ``None``).
        """
        if self.time_range is None:
            return None
        start, end = self.time_range
        return datetime.timedelta(
            seconds=seconds_from_midnight(end) - seconds_from_midnight(start)
        )


def get_classes(page):
    """Build a ``Class`` for every pair of ``datadisplaytable`` tables.

    Args:
        page: a ``BeautifulSoup`` object, or markup that is parsed with the
            ``'lxml'`` parser.

    Returns:
        A list of ``Class`` instances; consecutive tables are paired as
        ``(info, times)``.

    Raises:
        IndexError: if the page holds an odd number of matching tables.
    """
    if not isinstance(page, BS):
        page = BS(page, 'lxml')
    tables = page.find_all('table', attrs={'class': 'datadisplaytable'})
    return [
        Class((tables[i], tables[i + 1]))
        for i in range(0, len(tables), 2)
    ]


if __name__ == "__main__":
    with open('schedule.html') as file:
        page = BS(file.read(), 'lxml')
    class1, *classes = get_classes(page)