You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

180 lines
5.0 KiB

  1. import datetime
  2. import itertools
  3. from operator import sub
  4. import re
  5. import bs4
  6. from bs4 import BeautifulSoup as BS
  7. def dateparse(datetime_str):
  8. date = "%b %d, %Y"
  9. time = "%I:%M %p"
  10. try:
  11. return datetime.datetime.strptime(datetime_str, date)
  12. except ValueError:
  13. return datetime.datetime.strptime(datetime_str, time)
  14. days = [None, "M", "T", "W", "R", "F", None]
  15. simp_exceptions = ["Grade Mode"]
  16. def datetime2date_time(dtime: datetime.datetime, mode):
  17. if mode == "date":
  18. return datetime.date(dtime.year, dtime.month, dtime.day)
  19. elif mode == "time":
  20. return datetime.time(dtime.hour, dtime.minute, dtime.second)
  21. def seconds_from_midnight(t: datetime.time):
  22. return t.hour * 60 ** 2 + t.minute * 60 + t.second
  23. def parse_horz_row(headers, row: bs4.element.Tag):
  24. data = (col.text for col in row.find_all("td"))
  25. ret = {}
  26. time_data = dict(zip(headers, data))
  27. try:
  28. time_data["time"]
  29. except KeyError as e:
  30. print(row)
  31. raise e
  32. if time_data["time"] == "TBA":
  33. ret["time_range"] = None
  34. else:
  35. s, e = map(dateparse, time_data["time"].split(" - "))
  36. ret["time_range"] = (
  37. datetime2date_time(s, "time"),
  38. datetime2date_time(e, "time"),
  39. )
  40. s, e = map(dateparse, time_data["date range"].split(" - "))
  41. time_data["days"] = re.sub(
  42. "[^{}]".format("".join(filter(bool, days))), "", time_data["days"]
  43. )
  44. ret["days"] = sorted(
  45. (days.index(time_data["days"][i]) for i in range(len(time_data["days"])))
  46. )
  47. if len(ret["days"]) > 0:
  48. class_start = (s.weekday() + 1) % 7
  49. start = ret["days"][0]
  50. s += datetime.timedelta(days=(start - class_start))
  51. ret["date_range"] = (
  52. datetime2date_time(s, "date"),
  53. datetime2date_time(e, "date"),
  54. )
  55. ret["location"] = time_data["where"]
  56. return ret
  57. class Class:
  58. def __init__(
  59. self,
  60. title,
  61. abrv,
  62. session,
  63. term,
  64. crn,
  65. instructor,
  66. grade_mode,
  67. credits,
  68. level,
  69. campus,
  70. time_range,
  71. date_range,
  72. days,
  73. location,
  74. lab=None,
  75. ):
  76. # name
  77. self.title = title
  78. self.abrv = abrv
  79. # time
  80. self.date_range = date_range
  81. self.days = days
  82. self.time_range = time_range
  83. # location
  84. self.location = location
  85. self.campus = campus
  86. # other
  87. self.session = session
  88. self.term = term
  89. self.crn = crn
  90. self.instructor = instructor
  91. self.grade_mode = grade_mode
  92. self.credits = credits
  93. self.level = level
  94. self.lab = lab
  95. @classmethod
  96. def scrape(cls, info: bs4.element.Tag, times: bs4.element.Tag):
  97. # info
  98. title, abrv, session = info.find("caption").text.split(" - ")
  99. session = int(session)
  100. rows = info.find_all("tr")
  101. params = {}
  102. for row in rows:
  103. name = row.find("th").text.rstrip(":")
  104. data = re.sub(r"^ +|[\n\r\t]", "", row.find("td").text)
  105. if name == "Status":
  106. type, date = data.split(" on ")
  107. type = type.replace("*", "")
  108. registration_date = dateparse(date)
  109. else:
  110. if name in simp_exceptions:
  111. name = name.lower().replace(" ", "_")
  112. else:
  113. name = name.lower().split(" ")[-1]
  114. if name != "instructor":
  115. data = data.lower()
  116. try:
  117. data = int(re.sub(r"\.\d+", "", data))
  118. except:
  119. pass
  120. params[name] = data
  121. # time
  122. headers, *data = times.find_all("tr")
  123. headers = list(header.text.lower() for header in headers.find_all("th"))
  124. if len(data) > 1:
  125. data, lab = map(lambda row: parse_horz_row(headers, row), data[:2])
  126. lab.update(params)
  127. lab = Class(title + " - Lab", abrv, session, **lab)
  128. else:
  129. lab = None
  130. data = parse_horz_row(headers, data[0])
  131. params.update(data)
  132. return Class(title, abrv, session, lab=lab, **params)
  133. def __repr__(self):
  134. return "{} on {}".format(self.title, "".join(days[i] for i in self.days))
  135. @property
  136. def length(self):
  137. return datetime.timedelta(
  138. seconds=sub(
  139. seconds_from_midnight(self.time_range[1]),
  140. seconds_from_midnight(self.time_range[0]),
  141. )
  142. )
  143. def get_classes(page):
  144. if not isinstance(page, BS):
  145. page = BS(page, "lxml")
  146. tables = page.find_all("table", attrs={"class": "datadisplaytable"})
  147. groups = ((tables[i], tables[i + 1]) for i in range(0, len(tables), 2))
  148. return itertools.starmap(Class.scrape, groups)
  149. if __name__ == "__main__":
  150. with open("schedule.html") as file:
  151. page = BS(file.read(), "lxml")
  152. classes = list(get_classes(page))
  153. for _class in classes:
  154. print(repr(_class), _class.date_range)