You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

113 lines
3.9 KiB

  1. from bs4 import BeautifulSoup as BS
  2. import datetime
  3. import re
  4. from operator import sub
  5. def dateparse(datetime_str):
  6. date = '%b %d, %Y'
  7. time = '%I:%M %p'
  8. try:
  9. return datetime.datetime.strptime(datetime_str,date)
  10. except ValueError:
  11. return datetime.datetime.strptime(datetime_str,time)
  12. days = [None,'M','T','W','R','F',None]
  13. simp_exceptions = ['Grade Mode']
  14. def datetime2date_time(dtime,mode):
  15. if mode == 'date':
  16. return datetime.date(dtime.year,dtime.month,dtime.day)
  17. elif mode == 'time':
  18. return datetime.time(dtime.hour,dtime.minute,dtime.second)
  19. def seconds_from_midnight(t):
  20. return t.hour*60**2+ t.minute*60+t.second
  21. class Class:
  22. def __init__(self,title,abrv,session,days,location,time_range,date_range,lab):
  23. self.title = title
  24. self.abrv = abrv
  25. self.session = session
  26. self.days = days
  27. self.location = location
  28. self.time_range = time_range
  29. self.lab = None
  30. self.date_range = date_range
  31. # data is a list of two html tables
  32. def scrape(self,data):
  33. info,times = data
  34. # info
  35. title,abrv,sesession = info.find('caption').text.split(' - ')
  36. self.lab = None
  37. self.session = int(self.session)
  38. rows = info.find_all('tr')
  39. for row in rows:
  40. name = row.find('th').text.rstrip(':')
  41. data = re.sub(r'^ +|[\n\r\t]','',row.find('td').text)
  42. if name == 'Status':
  43. type,date = data.split(' on ')
  44. type = type.replace('*','')
  45. self.type = type
  46. self.registration_date = dateparse(date)
  47. else:
  48. if name in simp_exceptions:
  49. name = name.lower().replace(' ','_')
  50. else:
  51. name = name.lower().split(' ')[-1]
  52. if name != 'instructor':
  53. data = data.lower()
  54. try:
  55. data = int(re.sub(r'\.\d+','',data))
  56. except:
  57. pass
  58. self.__dict__[name] = data
  59. # time
  60. headers,*data = times.find_all('tr')
  61. headers = (header.text.lower() for header in headers.find_all('th'))
  62. if len(data) > 1:
  63. data,lab = map(lambda row: parse_horz_row(headers,row),data[:2])
  64. else
  65. lab = None
  66. data = data[0]
  67. def parse_horz_row(headers,row):
  68. data = (col.text for col in row.find_all('td'))
  69. ret = {}
  70. time_data = dict(zip(headers,data))
  71. if time_data['time'] == 'TBA':
  72. ret['time_range'] = None
  73. else:
  74. s,e = map(dateparse,time_data['time'].split(' - '))
  75. ret['time_range'] = (
  76. datetime2date_time(s,'time'),
  77. datetime2date_time(e,'time'),
  78. )
  79. s,e = map(dateparse,time_data['date range'].split(' - '))
  80. ret['date_range'] = (
  81. datetime2date_time(s,'date'),
  82. datetime2date_time(e,'date'),
  83. )
  84. time_data['days'] = re.sub('[^{}]'.format(''.join(filter(bool,days))),'',time_data['days'])
  85. ret['days'] = list(days.index(time_data['days'][i]) for i in range(len(time_data['days'])))
  86. ret['location'] = time_data['where']
  87. return ret
  88. @property
  89. def length(self):
  90. return datetime.timedelta(seconds = sub(
  91. seconds_from_midnight(self.time_range[1]),
  92. seconds_from_midnight(self.time_range[0]),
  93. ))
  94. def get_classes(page):
  95. if not isinstance(page,BS):
  96. page = BS(page,'lxml')
  97. tables = page.find_all('table',attrs= {'class':'datadisplaytable'})
  98. groups = ((tables[i],tables[i+1]) for i in range(0,len(tables),2))
  99. return list(map(Class.scrape,groups))
  100. if __name__ == "__main__":
  101. with open('schedule.html') as file:
  102. page = BS(file.read(),'lxml')
  103. class1,*classes = get_classes(page)