You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

68 lines
2.5 KiB

8 years ago
8 years ago
8 years ago
  1. import urllib
  2. from fake_useragent import UserAgent
  3. from bs4 import BeautifulSoup as BS
  4. import re
  5. import datetime
# Value sent as the "User-Agent" request header by get_page().
# NOTE(review): presumably a Chrome browser UA string supplied by
# fake_useragent -- confirm against that library's documentation.
user_agent = UserAgent().chrome
  7. re_words = lambda n: re.compile(r"( ?[^ ]+ ?)"+"{0,"+str(n-1)+"}"+r"[^ ]+")
  8. debug = None
  9. def get_page(url):
  10. page = None
  11. while not page:
  12. page = urllib.request.Request(url,headers = {"User-Agent":user_agent})
  13. page = str(urllib.request.urlopen(page).read())
  14. return page
  15. def get_BS(url):
  16. return BS(get_page(url),"lxml")
  17. class price_finder:
  18. page_funcs = {
  19. "www.amazon.com":{
  20. "name":lambda page: re.sub(r"( {2,}|\n|\\n)","",page.find("span",id="productTitle").text),
  21. "price":lambda page: page.find(name = "span",id = re.compile("priceblock.*")).text
  22. },
  23. "www.banggood.com":{
  24. "name":lambda page: page.find("h1",attrs = {"itemprop":"name"}).text,
  25. "price":lambda page: page.find("div",attrs = {"class":"now"}).get("oriprice")
  26. },
  27. "www.dalprops.com":{
  28. "name":lambda page: page.find("h1",attrs = {"class":"product_title"}).text,
  29. "price":lambda page: page.find("meta",attrs = {"itemprop":"price"}).get("content")
  30. },
  31. "www.gearbest.com":{
  32. "name":lambda page: page.find("div",attrs = {"class":"goods-info-top"}).find("h1").text,
  33. "price":lambda page: page.find(id="unit_price").get("data-orgp")
  34. },
  35. "hobbyking.com":{
  36. "name":lambda page: page.find("h1",attrs={"class":"product-name"}).text,
  37. "price":lambda page: page.find("span",id = re.compile(r"product-price.*")).find("span",attrs={"class":"price"}).text
  38. }
  39. }
  40. def __init__(self,url,space_seperated_categories = 7,bs=None):
  41. self.url=url
  42. self.info_url = urllib.parse.urlparse(url)
  43. if self.info_url.netloc not in price_finder.page_funcs.keys():
  44. raise NotImplementedError("Not implemented for {}".format(self.info_url.netloc))
  45. if bs:
  46. self.bs= bs
  47. else:
  48. self.bs = get_BS(url)
  49. self.words = re_words(space_seperated_categories)
  50. self.time = datetime.datetime.today()
  51. self.info_product = self._get_product_info_()
  52. def _get_product_info_(self):
  53. funcs = price_finder.page_funcs[self.info_url.netloc]
  54. print(self.url)
  55. return {
  56. "product_name":self.words.match(
  57. funcs["name"](self.bs)
  58. ).group(0),
  59. "price":funcs["price"](self.bs).replace("$",""),
  60. }