You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

107 lines
3.8 KiB

7 years ago
  1. from bs4 import BeautifulSoup as BS
  2. import datetime
  3. import re
  4. from operator import sub
  def dateparse(datetime_str):
      """Parse *datetime_str* as a date, falling back to a time of day.

      The string is first tried against the ``'%b %d, %Y'`` layout; if that
      raises ``ValueError`` it is parsed with ``'%I:%M %p'`` instead.

      Returns:
          A ``datetime.datetime`` produced by ``strptime``.

      Raises:
          ValueError: If the string matches neither layout.
      """
      date_layout = '%b %d, %Y'
      clock_layout = '%I:%M %p'
      parse = datetime.datetime.strptime
      try:
          parsed = parse(datetime_str, date_layout)
      except ValueError:
          # Not a calendar date; treat it as a 12-hour clock time.
          parsed = parse(datetime_str, clock_layout)
      return parsed
  # One-letter weekday codes; a code's position in this list is the number
  # stored for that day. The first and last slots carry no code.
  days = [
      None,
      'M',
      'T',
      'W',
      'R',
      'F',
      None,
  ]
  # Row labels whose full text (lower-cased, spaces -> underscores) becomes the
  # attribute name instead of just the last word.
  simp_exceptions = [
      'Grade Mode',
  ]
  def datetime2date_time(dtime, mode):
      """Extract either the date part or the time part of *dtime*.

      Args:
          dtime: Object exposing ``year``/``month``/``day`` (for ``'date'``)
              or ``hour``/``minute``/``second`` (for ``'time'``).
          mode: ``'date'`` or ``'time'``.

      Returns:
          A ``datetime.date`` for ``'date'``, a ``datetime.time`` (seconds
          precision) for ``'time'``, and ``None`` for any other mode.
      """
      if mode == 'date':
          year, month, day = dtime.year, dtime.month, dtime.day
          return datetime.date(year, month, day)
      if mode == 'time':
          hour, minute, second = dtime.hour, dtime.minute, dtime.second
          return datetime.time(hour, minute, second)
      # Unrecognised modes yield no value.
      return None
  def seconds_from_midnight(t):
      """Return the number of seconds between 00:00:00 and *t*.

      *t* must expose ``hour``, ``minute`` and ``second`` attributes.
      """
      whole_minutes = t.hour * 60 + t.minute
      return whole_minutes * 60 + t.second
  class Class:
      """One class entry of a schedule, filled in by hand or by ``scrape``.

      Attributes set by ``__init__``: ``title``, ``session``, ``days``,
      ``location``, ``time_range`` and ``lab`` (always ``None``).
      ``scrape`` overwrites those and adds ``abrv``, ``type``,
      ``registration_date``, ``date_range`` plus one attribute per remaining
      row of the info table.
      """

      def __init__(self, title, session, days, location, time_range):
          self.title = title
          self.session = session
          self.days = days
          self.location = location
          self.time_range = time_range
          self.lab = None

      def scrape(self, data):
          """Populate this instance from a pair of parsed HTML tables.

          Args:
              data: A two-item sequence ``(info, times)``. ``info`` is the
                  table whose caption and ``th``/``td`` rows describe the
                  class; ``times`` is the table whose first row holds the
                  column headers and whose next row holds the meeting data.

          Raises:
              ValueError: If the caption, status, time or date-range text
                  does not have the expected shape.
              KeyError: If the times table lacks a ``time``, ``date range``,
                  ``days`` or ``where`` column.
              IndexError: If the times table has no data row.
          """
          info, times = data
          self._scrape_info(info)
          self._scrape_times(times)

      def _scrape_info(self, info):
          """Read the caption and the label/value rows of the info table."""
          # Split from the right so a title that itself contains ' - ' is
          # kept whole instead of breaking the three-way unpacking.
          self.title, self.abrv, self.session = (
              info.find('caption').text.rsplit(' - ', 2)
          )
          self.lab = None
          self.session = int(self.session)
          for row in info.find_all('tr'):
              label_cell = row.find('th')
              value_cell = row.find('td')
              if label_cell is None or value_cell is None:
                  # Rows without a label/value pair carry nothing to store.
                  continue
              name = label_cell.text.rstrip(':')
              value = re.sub(r'^ +|[\n\r\t]', '', value_cell.text)
              if name == 'Status':
                  # Status text is '<status> on <date>'; the date is last.
                  status, registered_on = value.rsplit(' on ', 1)
                  self.type = status.replace('*', '')
                  self.registration_date = dateparse(registered_on)
                  continue
              if name in simp_exceptions:
                  attr = name.lower().replace(' ', '_')
              else:
                  # Default attribute name is the last word of the label.
                  attr = name.lower().split(' ')[-1]
              if attr != 'instructor':
                  value = value.lower()
              try:
                  # Numeric values such as '4.000' are stored as integers.
                  value = int(re.sub(r'\.\d+', '', value))
              except ValueError:
                  pass
              # Written straight into the instance dict so an arbitrary row
              # label can never collide with a property such as ``length``.
              vars(self)[attr] = value

      def _scrape_times(self, times):
          """Read the first data row of the times table."""
          header_row, *rows = times.find_all('tr')
          # Only the first data row is parsed; any further row (the original
          # code called the second one ``lab``) is currently ignored.
          meeting_row = rows[0]
          headers = [cell.text.lower() for cell in header_row.find_all('th')]
          cells = [cell.text for cell in meeting_row.find_all('td')]
          time_data = dict(zip(headers, cells))

          if time_data['time'] == 'TBA':
              self.time_range = None
          else:
              start, end = map(dateparse, time_data['time'].split(' - '))
              self.time_range = (
                  datetime2date_time(start, 'time'),
                  datetime2date_time(end, 'time'),
              )

          first, last = map(dateparse, time_data['date range'].split(' - '))
          self.date_range = (
              datetime2date_time(first, 'date'),
              datetime2date_time(last, 'date'),
          )

          # Drop every character that is not a known day code, then store
          # each remaining code as its index in the module-level ``days``.
          day_codes = ''.join(code for code in days if code)
          meeting_days = re.sub('[^{}]'.format(day_codes), '', time_data['days'])
          self.days = [days.index(code) for code in meeting_days]
          self.location = time_data['where']

      @property
      def length(self):
          """Duration of one meeting as a ``datetime.timedelta``.

          Returns ``None`` when ``time_range`` is ``None`` (the value
          ``scrape`` stores for a 'TBA' time) instead of failing.
          """
          if self.time_range is None:
              return None
          start, end = self.time_range
          return datetime.timedelta(
              seconds=seconds_from_midnight(end) - seconds_from_midnight(start)
          )
  def get_classes(page):
      """Return a list of ``Class`` objects scraped from *page*.

      Args:
          page: Either a parsed ``BeautifulSoup`` document or raw markup,
              which is parsed with the ``'lxml'`` parser.

      Returns:
          One ``Class`` per consecutive pair of tables carrying the
          ``datadisplaytable`` class, in document order.
      """
      if not isinstance(page, BS):
          page = BS(page, 'lxml')
      tables = page.find_all('table', attrs={'class': 'datadisplaytable'})
      classes = []
      # Tables are consumed two at a time as (info, times); zip silently
      # drops a trailing unpaired table rather than indexing past the end.
      for info, times in zip(tables[::2], tables[1::2]):
          # scrape() is an instance method that fills the object in place and
          # returns nothing, so build a blank instance first and keep it.
          entry = Class(None, None, None, None, None)
          entry.scrape((info, times))
          classes.append(entry)
      return classes
  97. if __name__ == "__main__":
  98. with open('schedule.html') as file:
  99. page = BS(file.read(),'lxml')
  100. class1,*classes = get_classes(page)