You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

114 lines
3.9 KiB

  1. from bs4 import BeautifulSoup as BS
  2. import datetime
  3. import re
  4. from operator import sub
  5. def dateparse(datetime_str):
  6. date = '%b %d, %Y'
  7. time = '%I:%M %p'
  8. try:
  9. return datetime.datetime.strptime(datetime_str,date)
  10. except ValueError:
  11. return datetime.datetime.strptime(datetime_str,time)
  12. days = [None,'M','T','W','R','F',None]
  13. simp_exceptions = ['Grade Mode']
  14. def datetime2date_time(dtime,mode):
  15. if mode == 'date':
  16. return datetime.date(dtime.year,dtime.month,dtime.day)
  17. elif mode == 'time':
  18. return datetime.time(dtime.hour,dtime.minute,dtime.second)
  19. def seconds_from_midnight(t):
  20. return t.hour*60**2+ t.minute*60+t.second
  21. class Class:
  22. def __init__(self,title,abrv,session,days,location,time_range,date_range,lab=None):
  23. self.title = title
  24. self.abrv = abrv
  25. self.session = session
  26. self.days = days
  27. self.location = location
  28. self.time_range = time_range
  29. self.lab = lab
  30. self.date_range = date_range
  31. # data is a list of two html tables
  32. def scrape(self,data):
  33. info,times = data
  34. # info
  35. title,abrv,sesession = info.find('caption').text.split(' - ')
  36. session = int(self.session)
  37. rows = info.find_all('tr')
  38. for row in rows:
  39. name = row.find('th').text.rstrip(':')
  40. data = re.sub(r'^ +|[\n\r\t]','',row.find('td').text)
  41. if name == 'Status':
  42. type,date = data.split(' on ')
  43. type = type.replace('*','')
  44. self.type = type
  45. self.registration_date = dateparse(date)
  46. else:
  47. if name in simp_exceptions:
  48. name = name.lower().replace(' ','_')
  49. else:
  50. name = name.lower().split(' ')[-1]
  51. if name != 'instructor':
  52. data = data.lower()
  53. try:
  54. data = int(re.sub(r'\.\d+','',data))
  55. except:
  56. pass
  57. self.__dict__[name] = data
  58. # time
  59. headers,*data = times.find_all('tr')
  60. headers = (header.text.lower() for header in headers.find_all('th'))
  61. if len(data) > 1:
  62. data,lab = map(lambda row: parse_horz_row(headers,row),data[:2])
  63. lab = Class(title + " - Lab",abrv,session,**lab)
  64. else:
  65. lab = None
  66. data = data[0]
  67. return Class(title,abrv,session,lab=lab,**data)
  68. def parse_horz_row(headers,row):
  69. data = (col.text for col in row.find_all('td'))
  70. ret = {}
  71. time_data = dict(zip(headers,data))
  72. if time_data['time'] == 'TBA':
  73. ret['time_range'] = None
  74. else:
  75. s,e = map(dateparse,time_data['time'].split(' - '))
  76. ret['time_range'] = (
  77. datetime2date_time(s,'time'),
  78. datetime2date_time(e,'time'),
  79. )
  80. s,e = map(dateparse,time_data['date range'].split(' - '))
  81. ret['date_range'] = (
  82. datetime2date_time(s,'date'),
  83. datetime2date_time(e,'date'),
  84. )
  85. time_data['days'] = re.sub('[^{}]'.format(''.join(filter(bool,days))),'',time_data['days'])
  86. ret['days'] = list(days.index(time_data['days'][i]) for i in range(len(time_data['days'])))
  87. ret['location'] = time_data['where']
  88. return ret
  89. @property
  90. def length(self):
  91. return datetime.timedelta(seconds = sub(
  92. seconds_from_midnight(self.time_range[1]),
  93. seconds_from_midnight(self.time_range[0]),
  94. ))
  95. def get_classes(page):
  96. if not isinstance(page,BS):
  97. page = BS(page,'lxml')
  98. tables = page.find_all('table',attrs= {'class':'datadisplaytable'})
  99. groups = ((tables[i],tables[i+1]) for i in range(0,len(tables),2))
  100. return list(map(Class.scrape,groups))
  101. if __name__ == "__main__":
  102. with open('schedule.html') as file:
  103. page = BS(file.read())
  104. class1,*classes = get_classes(page)