You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

80 lines
3.1 KiB

8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
  1. import urllib
  2. from fake_useragent import UserAgent
  3. from bs4 import BeautifulSoup as BS
  4. from requests_html import HTMLSession
  5. import re
  6. import datetime
  7. # import pytz
  8. import copy
# User-agent string taken from fake_useragent's ``chrome`` attribute; get_page
# sends it as the "User-Agent" header of every request.
# NOTE(review): UserAgent() is constructed at import time — confirm whether it
# performs any I/O before importing this module in latency-sensitive code.
user_agent = UserAgent().chrome
# Not referenced anywhere in the visible part of this file.
debug = None
  11. def get_words(string,n):
  12. words = re.finditer(r"(\b[^ \n]+\b)",string)
  13. word_list = list(match.group(0) for match in words)
  14. if len(word_list) > n:
  15. word_list = word_list[:n]
  16. return ' '.join(word_list)
  17. def get_page(url):
  18. page = None
  19. while not page:
  20. page = urllib.request.Request(url,headers = {"User-Agent":user_agent})
  21. page = str(urllib.request.urlopen(page).read())
  22. return page
  23. def get_BS(url):
  24. return BS(get_page(url),"lxml")
  25. class price_finder:
  26. page_funcs = {
  27. "www.amazon.com":{
  28. "name":lambda page: re.sub(r"( {2,}|\n|\\n)","",page.find("span",id="productTitle").text),
  29. "price":lambda page: page.find(name = "span",id = re.compile("priceblock.*")).text
  30. },
  31. "www.banggood.com":{
  32. "name":lambda page: page.find("h1",attrs = {"itemprop":"name"}).text,
  33. "price":lambda page: page.find("div",attrs = {"class":"now"}).get("oriprice")
  34. },
  35. "www.dalprops.com":{
  36. "name":lambda page: page.find("h1",attrs = {"class":"product_title"}).text,
  37. "price":lambda page: page.find("meta",attrs = {"itemprop":"price"}).get("content")
  38. },
  39. "www.gearbest.com":{
  40. "name":lambda page:re.sub(" {2,}|\n","",page.find("div",attrs = {"class":"goodsIntro_titleWrap"}).find("h1").text),
  41. "price":lambda page: page.find("span",attrs={"class":"goodsIntro_price"}).text
  42. },
  43. "hobbyking.com":{
  44. "name":lambda page: page.find("h1",attrs={"class":"product-name"}).text,
  45. "price":lambda page: page.find("span",id = re.compile(r"product-price.*")).find("span",attrs={"class":"price"}).text
  46. },
  47. "www.getfpv.com":{
  48. "name": lambda page: re.sub(r"\\n|\n","", page.find("div",attrs={"class":"product-name"}).text),
  49. "price": lambda page: re.sub(r"\\n|\n","", page.find("span",attrs={"id":re.compile("product-price.*")}).text)
  50. }
  51. }
  52. def __init__(self,url,space_seperated_categories = 7,bs=None):
  53. self.url=url
  54. self.info_url = urllib.parse.urlparse(url)
  55. self.word_len = space_seperated_categories
  56. if self.info_url.netloc not in price_finder.page_funcs.keys():
  57. raise NotImplementedError("Not implemented for {}".format(self.info_url.netloc))
  58. if bs:
  59. self.bs= bs
  60. else:
  61. self.bs = get_BS(url)
  62. # self.words = re_words(space_seperated_categories)
  63. self.time = datetime.datetime.today()
  64. self.info_product = self._get_product_info_()
  65. def _get_product_info_(self):
  66. funcs = price_finder.page_funcs[self.info_url.netloc]
  67. # print(self.url)
  68. return {
  69. "product_name":get_words(funcs["name"](self.bs),self.word_len),
  70. "price":funcs["price"](self.bs).replace("$",""),
  71. }
  72. # def to_json(self):
  73. # ret = copy.deepcopy(self.__dict__)
  74. # ret['time'] = ret['time'].