You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

150 lines
4.5 KiB

  1. from bs4 import BeautifulSoup as BS
  2. import datetime
  3. import re
  4. from operator import sub
  5. def dateparse(datetime_str):
  6. date = '%b %d, %Y'
  7. time = '%I:%M %p'
  8. try:
  9. return datetime.datetime.strptime(datetime_str,date)
  10. except ValueError:
  11. return datetime.datetime.strptime(datetime_str,time)
  12. days = [None,'M','T','W','R','F',None]
  13. simp_exceptions = ['Grade Mode']
  14. def datetime2date_time(dtime,mode):
  15. if mode == 'date':
  16. return datetime.date(dtime.year,dtime.month,dtime.day)
  17. elif mode == 'time':
  18. return datetime.time(dtime.hour,dtime.minute,dtime.second)
  19. def seconds_from_midnight(t):
  20. return t.hour*60**2+ t.minute*60+t.second
  21. def parse_horz_row(headers,row):
  22. data = (col.text for col in row.find_all('td'))
  23. ret = {}
  24. time_data = dict(zip(headers,data))
  25. try:
  26. time_data['time']
  27. except KeyError as e:
  28. print(row)
  29. raise e
  30. if time_data['time'] == 'TBA':
  31. ret['time_range'] = None
  32. else:
  33. s,e = map(dateparse,time_data['time'].split(' - '))
  34. ret['time_range'] = (
  35. datetime2date_time(s,'time'),
  36. datetime2date_time(e,'time'),
  37. )
  38. s,e = map(dateparse,time_data['date range'].split(' - '))
  39. ret['date_range'] = (
  40. datetime2date_time(s,'date'),
  41. datetime2date_time(e,'date'),
  42. )
  43. time_data['days'] = re.sub('[^{}]'.format(''.join(filter(bool,days))),'',time_data['days'])
  44. ret['days'] = list(days.index(time_data['days'][i]) for i in range(len(time_data['days'])))
  45. ret['location'] = time_data['where']
  46. return ret
  47. class Class:
  48. def __init__(self, title, abrv, session,
  49. term,
  50. crn,
  51. instructor,
  52. grade_mode,
  53. credits,
  54. level,
  55. campus,
  56. time_range,
  57. date_range,
  58. days,
  59. location,
  60. lab=None):
  61. #name
  62. self.title = title
  63. self.abrv = abrv
  64. #time
  65. self.date_range = date_range
  66. self.days = days
  67. self.time_range = time_range
  68. #location
  69. self.location = location
  70. self.campus = campus
  71. #other
  72. self.session = session
  73. self.term = term
  74. self.crn = crn
  75. self.instructor = instructor
  76. self.grade_mode = grade_mode
  77. self.credits = credits
  78. self.level = level
  79. self.lab = lab
  80. # data is a list of two html tables
  81. @classmethod
  82. def scrape(cls,data):
  83. info,times = data
  84. # info
  85. title,abrv,session = info.find('caption').text.split(' - ')
  86. session = int(session)
  87. rows = info.find_all('tr')
  88. params = {}
  89. for row in rows:
  90. name = row.find('th').text.rstrip(':')
  91. data = re.sub(r'^ +|[\n\r\t]','',row.find('td').text)
  92. if name == 'Status':
  93. type,date = data.split(' on ')
  94. type = type.replace('*','')
  95. registration_date = dateparse(date)
  96. else:
  97. if name in simp_exceptions:
  98. name = name.lower().replace(' ','_')
  99. else:
  100. name = name.lower().split(' ')[-1]
  101. if name != 'instructor':
  102. data = data.lower()
  103. try:
  104. data = int(re.sub(r'\.\d+','',data))
  105. except:
  106. pass
  107. params[name] = data
  108. # time
  109. headers,*data = times.find_all('tr')
  110. headers = list(header.text.lower() for header in headers.find_all('th'))
  111. if len(data) > 1:
  112. data,lab = map(lambda row: parse_horz_row(headers,row),data[:2])
  113. lab.update(params)
  114. lab = Class(title + " - Lab",abrv,session,**lab)
  115. else:
  116. lab = None
  117. data = parse_horz_row(headers,data[0])
  118. params.update(data)
  119. return Class(title,abrv,session,lab=lab,**params)
  120. def __repr__(self):
  121. return '{} on {}'.format(self.title,''.join(days[i] for i in self.days))
  122. @property
  123. def length(self):
  124. return datetime.timedelta(seconds = sub(
  125. seconds_from_midnight(self.time_range[1]),
  126. seconds_from_midnight(self.time_range[0]),
  127. ))
  128. def get_classes(page):
  129. if not isinstance(page,BS):
  130. page = BS(page,'lxml')
  131. tables = page.find_all('table',attrs= {'class':'datadisplaytable'})
  132. groups = ((tables[i],tables[i+1]) for i in range(0,len(tables),2))
  133. return list(map(Class.scrape,groups))
  134. if __name__ == "__main__":
  135. with open('schedule.html') as file:
  136. page = BS(file.read(),'lxml')
  137. class1,*classes = get_classes(page)