diff --git a/scraper.py b/scraper.py index cd31101..f171960 100644 --- a/scraper.py +++ b/scraper.py @@ -18,23 +18,77 @@ def datetime2date_time(dtime,mode): return datetime.time(dtime.hour,dtime.minute,dtime.second) def seconds_from_midnight(t): return t.hour*60**2+ t.minute*60+t.second + +def parse_horz_row(headers,row): + data = (col.text for col in row.find_all('td')) + ret = {} + time_data = dict(zip(headers,data)) + try: + time_data['time'] + except KeyError as e: + print(row) + raise e + if time_data['time'] == 'TBA': + ret['time_range'] = None + else: + s,e = map(dateparse,time_data['time'].split(' - ')) + ret['time_range'] = ( + datetime2date_time(s,'time'), + datetime2date_time(e,'time'), + ) + s,e = map(dateparse,time_data['date range'].split(' - ')) + ret['date_range'] = ( + datetime2date_time(s,'date'), + datetime2date_time(e,'date'), + ) + time_data['days'] = re.sub('[^{}]'.format(''.join(filter(bool,days))),'',time_data['days']) + ret['days'] = list(days.index(time_data['days'][i]) for i in range(len(time_data['days']))) + ret['location'] = time_data['where'] + return ret + class Class: - def __init__(self,title,abrv,session,days,location,time_range,date_range,lab=None): + def __init__(self, title, abrv, session, + term, + crn, + instructor, + grade_mode, + credits, + level, + campus, + time_range, + date_range, + days, + location, + lab=None): + + #name self.title = title self.abrv = abrv - self.session = session + #time + self.date_range = date_range self.days = days - self.location = location self.time_range = time_range + #location + self.location = location + self.campus = campus + #other + self.session = session + self.term = term + self.crn = crn + self.instructor = instructor + self.grade_mode = grade_mode + self.credits = credits + self.level = level self.lab = lab - self.date_range = date_range # data is a list of two html tables - def scrape(self,data): + @classmethod + def scrape(cls,data): info,times = data # info - title,abrv,sesession = info.find('caption').text.split(' - ') - session = int(self.session) + title,abrv,session = info.find('caption').text.split(' - ') + session = int(session) rows = info.find_all('tr') + params = {} for row in rows: name = row.find('th').text.rstrip(':') data = re.sub(r'^ +|[\n\r\t]','',row.find('td').text) @@ -42,8 +96,7 @@ class Class: if name == 'Status': type,date = data.split(' on ') type = type.replace('*','') - self.type = type - self.registration_date = dateparse(date) + registration_date = dateparse(date) else: if name in simp_exceptions: name = name.lower().replace(' ','_') @@ -56,43 +109,23 @@ class Class: except: pass - self.__dict__[name] = data - + params[name] = data # time headers,*data = times.find_all('tr') - headers = (header.text.lower() for header in headers.find_all('th')) + headers = list(header.text.lower() for header in headers.find_all('th')) if len(data) > 1: data,lab = map(lambda row: parse_horz_row(headers,row),data[:2]) + lab.update(params) lab = Class(title + " - Lab",abrv,session,**lab) else: lab = None - data = data[0] - - return Class(title,abrv,session,lab=lab,**data) - - def parse_horz_row(headers,row): - data = (col.text for col in row.find_all('td')) - ret = {} - time_data = dict(zip(headers,data)) - if time_data['time'] == 'TBA': - ret['time_range'] = None - else: - s,e = map(dateparse,time_data['time'].split(' - ')) - ret['time_range'] = ( - datetime2date_time(s,'time'), - datetime2date_time(e,'time'), - ) - s,e = map(dateparse,time_data['date range'].split(' - ')) - ret['date_range'] = ( - datetime2date_time(s,'date'), - datetime2date_time(e,'date'), - ) - time_data['days'] = re.sub('[^{}]'.format(''.join(filter(bool,days))),'',time_data['days']) - ret['days'] = list(days.index(time_data['days'][i]) for i in range(len(time_data['days']))) - ret['location'] = time_data['where'] - return ret + data = parse_horz_row(headers,data[0]) + params.update(data) + return Class(title,abrv,session,lab=lab,**params) + def __repr__(self): + return '{} on {}'.format(self.title,''.join(days[i] for i in self.days)) @property def length(self): return datetime.timedelta(seconds = sub( @@ -110,5 +143,5 @@ def get_classes(page): if __name__ == "__main__": with open('schedule.html') as file: - page = BS(file.read()) + page = BS(file.read(),'lxml') class1,*classes = get_classes(page)