You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

156 lines
4.8 KiB

  1. from bs4 import BeautifulSoup as BS
  2. import datetime
  3. import re
  4. from operator import sub
  5. def dateparse(datetime_str):
  6. date = '%b %d, %Y'
  7. time = '%I:%M %p'
  8. try:
  9. return datetime.datetime.strptime(datetime_str,date)
  10. except ValueError:
  11. return datetime.datetime.strptime(datetime_str,time)
  12. days = [None,'M','T','W','R','F',None]
  13. simp_exceptions = ['Grade Mode']
  14. def datetime2date_time(dtime,mode):
  15. if mode == 'date':
  16. return datetime.date(dtime.year,dtime.month,dtime.day)
  17. elif mode == 'time':
  18. return datetime.time(dtime.hour,dtime.minute,dtime.second)
  19. def seconds_from_midnight(t):
  20. return t.hour*60**2+ t.minute*60+t.second
  21. def parse_horz_row(headers,row):
  22. data = (col.text for col in row.find_all('td'))
  23. ret = {}
  24. time_data = dict(zip(headers,data))
  25. try:
  26. time_data['time']
  27. except KeyError as e:
  28. print(row)
  29. raise e
  30. if time_data['time'] == 'TBA':
  31. ret['time_range'] = None
  32. else:
  33. s,e = map(dateparse,time_data['time'].split(' - '))
  34. ret['time_range'] = (
  35. datetime2date_time(s,'time'),
  36. datetime2date_time(e,'time'),
  37. )
  38. s,e = map(dateparse,time_data['date range'].split(' - '))
  39. time_data['days'] = re.sub('[^{}]'.format(''.join(filter(bool,days))),'',time_data['days'])
  40. ret['days'] = sorted((days.index(time_data['days'][i]) for i in range(len(time_data['days']))))
  41. if len(ret['days']) > 0:
  42. class_start = (s.weekday()+1)%7
  43. start = ret['days'][0]
  44. s += datetime.timedelta(days=(start - class_start))
  45. ret['date_range'] = (
  46. datetime2date_time(s,'date'),
  47. datetime2date_time(e,'date'),
  48. )
  49. ret['location'] = time_data['where']
  50. return ret
  51. class Class:
  52. def __init__(self, title, abrv, session,
  53. term,
  54. crn,
  55. instructor,
  56. grade_mode,
  57. credits,
  58. level,
  59. campus,
  60. time_range,
  61. date_range,
  62. days,
  63. location,
  64. lab=None):
  65. #name
  66. self.title = title
  67. self.abrv = abrv
  68. #time
  69. self.date_range = date_range
  70. self.days = days
  71. self.time_range = time_range
  72. #location
  73. self.location = location
  74. self.campus = campus
  75. #other
  76. self.session = session
  77. self.term = term
  78. self.crn = crn
  79. self.instructor = instructor
  80. self.grade_mode = grade_mode
  81. self.credits = credits
  82. self.level = level
  83. self.lab = lab
  84. # data is a list of two html tables
  85. @classmethod
  86. def scrape(cls,data):
  87. info,times = data
  88. # info
  89. title,abrv,session = info.find('caption').text.split(' - ')
  90. session = int(session)
  91. rows = info.find_all('tr')
  92. params = {}
  93. for row in rows:
  94. name = row.find('th').text.rstrip(':')
  95. data = re.sub(r'^ +|[\n\r\t]','',row.find('td').text)
  96. if name == 'Status':
  97. type,date = data.split(' on ')
  98. type = type.replace('*','')
  99. registration_date = dateparse(date)
  100. else:
  101. if name in simp_exceptions:
  102. name = name.lower().replace(' ','_')
  103. else:
  104. name = name.lower().split(' ')[-1]
  105. if name != 'instructor':
  106. data = data.lower()
  107. try:
  108. data = int(re.sub(r'\.\d+','',data))
  109. except:
  110. pass
  111. params[name] = data
  112. # time
  113. headers,*data = times.find_all('tr')
  114. headers = list(header.text.lower() for header in headers.find_all('th'))
  115. if len(data) > 1:
  116. data,lab = map(lambda row: parse_horz_row(headers,row),data[:2])
  117. lab.update(params)
  118. lab = Class(title + " - Lab",abrv,session,**lab)
  119. else:
  120. lab = None
  121. data = parse_horz_row(headers,data[0])
  122. params.update(data)
  123. return Class(title,abrv,session,lab=lab,**params)
  124. def __repr__(self):
  125. return '{} on {}'.format(self.title,''.join(days[i] for i in self.days))
  126. @property
  127. def length(self):
  128. return datetime.timedelta(seconds = sub(
  129. seconds_from_midnight(self.time_range[1]),
  130. seconds_from_midnight(self.time_range[0]),
  131. ))
  132. def get_classes(page):
  133. if not isinstance(page,BS):
  134. page = BS(page,'lxml')
  135. tables = page.find_all('table',attrs= {'class':'datadisplaytable'})
  136. groups = ((tables[i],tables[i+1]) for i in range(0,len(tables),2))
  137. return map(Class.scrape,groups)
  138. if __name__ == "__main__":
  139. with open('schedule.html') as file:
  140. page = BS(file.read(),'lxml')
  141. classes = list(get_classes(page))
  142. for _class in classes:
  143. print(repr(_class),_class.date_range)