You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

110 lines
3.9 KiB

  1. from bs4 import BeautifulSoup as BS
  2. import datetime
  3. import re
  4. from operator import sub
  5. def dateparse(datetime_str):
  6. date = '%b %d, %Y'
  7. time = '%I:%M %p'
  8. try:
  9. return datetime.datetime.strptime(datetime_str,date)
  10. except ValueError:
  11. return datetime.datetime.strptime(datetime_str,time)
  12. days = [None,'M','T','W','R','F',None]
  13. simp_exceptions = ['Grade Mode']
  14. def datetime2date_time(dtime,mode):
  15. if mode == 'date':
  16. return datetime.date(dtime.year,dtime.month,dtime.day)
  17. elif mode == 'time':
  18. return datetime.time(dtime.hour,dtime.minute,dtime.second)
  19. def seconds_from_midnight(t):
  20. return t.hour*60**2+ t.minute*60+t.second
  21. class Class:
  22. def __init__(self,title,session,days,location,time_range,date_range):
  23. self.title = title
  24. self.session = session
  25. self.days = days
  26. self.location = location
  27. self.time_range = time_range
  28. self.lab = None
  29. self.date_range = date_range
  30. # data is a list of two html tables
  31. def scrape(self,data):
  32. info,times = data
  33. # info
  34. self.title,self.abrv,self.session = info.find('caption').text.split(' - ')
  35. self.lab = None
  36. self.session = int(self.session)
  37. rows = info.find_all('tr')
  38. for row in rows:
  39. name = row.find('th').text.rstrip(':')
  40. data = re.sub(r'^ +|[\n\r\t]','',row.find('td').text)
  41. if name == 'Status':
  42. type,date = data.split(' on ')
  43. type = type.replace('*','')
  44. self.type = type
  45. self.registration_date = dateparse(date)
  46. else:
  47. if name in simp_exceptions:
  48. name = name.lower().replace(' ','_')
  49. else:
  50. name = name.lower().split(' ')[-1]
  51. if name != 'instructor':
  52. data = data.lower()
  53. try:
  54. data = int(re.sub(r'\.\d+','',data))
  55. except:
  56. pass
  57. self.__dict__[name] = data
  58. # time
  59. headers,*data = times.find_all('tr')
  60. if len(data) > 1:
  61. data,lab = data[:2]
  62. else
  63. lab = None
  64. data = data[0]
  65. data = (col.text for col in data.find_all('td'))
  66. headers = (header.text.lower() for header in headers.find_all('th'))
  67. def parse_horz_row(headers,row):
  68. ret = {}
  69. time_data = dict(zip(headers,data))
  70. if time_data['time'] == 'TBA':
  71. ret['time_range'] = None
  72. else:
  73. s,e = map(dateparse,time_data['time'].split(' - '))
  74. ret['time_range'] = (
  75. datetime2date_time(s,'time'),
  76. datetime2date_time(e,'time'),
  77. )
  78. s,e = map(dateparse,time_data['date range'].split(' - '))
  79. ret['date_range'] = (
  80. datetime2date_time(s,'date'),
  81. datetime2date_time(e,'date'),
  82. )
  83. time_data['days'] = re.sub('[^{}]'.format(''.join(filter(bool,days))),'',time_data['days'])
  84. ret['days'] = list(days.index(time_data['days'][i]) for i in range(len(time_data['days'])))
  85. ret['location'] = time_data['where']
  86. return ret
  87. @property
  88. def length(self):
  89. return datetime.timedelta(seconds = sub(
  90. seconds_from_midnight(self.time_range[1]),
  91. seconds_from_midnight(self.time_range[0]),
  92. ))
  93. def get_classes(page):
  94. if not isinstance(page,BS):
  95. page = BS(page,'lxml')
  96. tables = page.find_all('table',attrs= {'class':'datadisplaytable'})
  97. groups = ((tables[i],tables[i+1]) for i in range(0,len(tables),2))
  98. return list(map(Class.scrape,groups))
  99. if __name__ == "__main__":
  100. with open('schedule.html') as file:
  101. page = BS(file.read(),'lxml')
  102. class1,*classes = get_classes(page)