Browse Source

finished Class class/scraper

partial
rlbr 7 years ago
parent
commit
aede270703
  1. 109
      scraper.py

109
scraper.py

@ -18,23 +18,77 @@ def datetime2date_time(dtime,mode):
return datetime.time(dtime.hour,dtime.minute,dtime.second)
def seconds_from_midnight(t):
    """Return the number of whole seconds between midnight and ``t``.

    Args:
        t: Any object exposing ``hour``, ``minute`` and ``second``
            attributes, such as a ``datetime.time``.

    Returns:
        ``hour``, ``minute`` and ``second`` folded into a single count of
        seconds.
    """
    seconds_per_minute = 60
    seconds_per_hour = seconds_per_minute ** 2
    from_hours = t.hour * seconds_per_hour
    from_minutes = t.minute * seconds_per_minute
    return from_hours + from_minutes + t.second
def parse_horz_row(headers, row):
    """Turn one row of the meeting-times table into ``Class`` keyword arguments.

    Args:
        headers: Column titles used as keys for the row's cells, in column
            order. The keys ``'time'``, ``'date range'``, ``'days'`` and
            ``'where'`` must be present.
        row: A table-row element; the text of each of its ``td`` cells is
            paired with the header in the same position.

    Returns:
        A dict with the keys:
            ``time_range``: ``None`` when the time cell reads ``'TBA'``,
                otherwise a ``(start, end)`` pair.
            ``date_range``: a ``(start, end)`` pair.
            ``days``: list of indexes into the module-level ``days``
                sequence, one per recognised character of the days cell.
            ``location``: the text of the ``'where'`` cell.

    Raises:
        KeyError: If an expected column is missing. For the ``'time'``
            column the offending row is included in the message.
    """
    cells = (col.text for col in row.find_all('td'))
    time_data = dict(zip(headers, cells))

    try:
        time_text = time_data['time']
    except KeyError as err:
        # Carry the row in the exception instead of printing it to stdout,
        # so the diagnostic travels with the error.
        raise KeyError(
            "row has no 'time' column: {!r}".format(row)
        ) from err

    ret = {}
    if time_text == 'TBA':
        ret['time_range'] = None
    else:
        start, end = map(dateparse, time_text.split(' - '))
        ret['time_range'] = (
            datetime2date_time(start, 'time'),
            datetime2date_time(end, 'time'),
        )

    start, end = map(dateparse, time_data['date range'].split(' - '))
    ret['date_range'] = (
        datetime2date_time(start, 'date'),
        datetime2date_time(end, 'date'),
    )

    # Keep only characters that occur in a non-empty entry of ``days``.
    # A set lookup avoids building a regex character class from unescaped
    # text, which would misbehave for symbols such as ']' , '^' or '-'.
    day_chars = set(''.join(filter(bool, days)))
    ret['days'] = [
        days.index(char) for char in time_data['days'] if char in day_chars
    ]
    ret['location'] = time_data['where']
    return ret
class Class:
def __init__(self,title,abrv,session,days,location,time_range,date_range,lab=None):
def __init__(self, title, abrv, session,
term,
crn,
instructor,
grade_mode,
credits,
level,
campus,
time_range,
date_range,
days,
location,
lab=None):
#name
self.title = title
self.abrv = abrv
self.session = session
#time
self.date_range = date_range
self.days = days
self.location = location
self.time_range = time_range
#location
self.location = location
self.campus = campus
#other
self.session = session
self.term = term
self.crn = crn
self.instructor = instructor
self.grade_mode = grade_mode
self.credits = credits
self.level = level
self.lab = lab
self.date_range = date_range
# data is a list of two html tables
def scrape(self,data):
@classmethod
def scrape(cls,data):
info,times = data
# info
title,abrv,sesession = info.find('caption').text.split(' - ')
session = int(self.session)
title,abrv,session = info.find('caption').text.split(' - ')
session = int(session)
rows = info.find_all('tr')
params = {}
for row in rows:
name = row.find('th').text.rstrip(':')
data = re.sub(r'^ +|[\n\r\t]','',row.find('td').text)
@ -42,8 +96,7 @@ class Class:
if name == 'Status':
type,date = data.split(' on ')
type = type.replace('*','')
self.type = type
self.registration_date = dateparse(date)
registration_date = dateparse(date)
else:
if name in simp_exceptions:
name = name.lower().replace(' ','_')
@ -56,43 +109,23 @@ class Class:
except:
pass
self.__dict__[name] = data
params[name] = data
# time
headers,*data = times.find_all('tr')
headers = (header.text.lower() for header in headers.find_all('th'))
headers = list(header.text.lower() for header in headers.find_all('th'))
if len(data) > 1:
data,lab = map(lambda row: parse_horz_row(headers,row),data[:2])
lab.update(params)
lab = Class(title + " - Lab",abrv,session,**lab)
else:
lab = None
data = data[0]
return Class(title,abrv,session,lab=lab,**data)
def parse_horz_row(headers,row):
data = (col.text for col in row.find_all('td'))
ret = {}
time_data = dict(zip(headers,data))
if time_data['time'] == 'TBA':
ret['time_range'] = None
else:
s,e = map(dateparse,time_data['time'].split(' - '))
ret['time_range'] = (
datetime2date_time(s,'time'),
datetime2date_time(e,'time'),
)
s,e = map(dateparse,time_data['date range'].split(' - '))
ret['date_range'] = (
datetime2date_time(s,'date'),
datetime2date_time(e,'date'),
)
time_data['days'] = re.sub('[^{}]'.format(''.join(filter(bool,days))),'',time_data['days'])
ret['days'] = list(days.index(time_data['days'][i]) for i in range(len(time_data['days'])))
ret['location'] = time_data['where']
return ret
data = parse_horz_row(headers,data[0])
params.update(data)
return Class(title,abrv,session,lab=lab,**params)
def __repr__(self):
    """Return ``'<title> on <days>'`` for this class.

    The days part is the concatenation of the module-level ``days`` entries
    selected by each index stored in ``self.days``.
    """
    day_symbols = [days[index] for index in self.days]
    meeting_days = ''.join(day_symbols)
    return '{} on {}'.format(self.title, meeting_days)
@property
def length(self):
return datetime.timedelta(seconds = sub(
@ -110,5 +143,5 @@ def get_classes(page):
if __name__ == "__main__":
    # Parse the saved schedule page exactly once. Reading the same handle a
    # second time returns '' and would leave `page` as an empty document.
    # The parser is named explicitly so the result does not depend on which
    # parsers happen to be installed.
    with open('schedule.html') as file:
        page = BS(file.read(), 'lxml')
    class1, *classes = get_classes(page)
Loading…
Cancel
Save