You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

61 lines
1.6 KiB

8 years ago
7 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
import copy
import datetime
import json
import re
import urllib.parse
  5. with open('xpaths.json') as file:
  6. xpaths = json.load(file)
  7. def get_words(raw,n):
  8. words = re.finditer(r"(\b[^ \n]+\b)",raw)
  9. word_list = list(match.group(0) for match in words)
  10. if len(word_list) > n:
  11. word_list = word_list[:n]
  12. return ' '.join(word_list)
  13. def format_price(raw):
  14. return re.search(r'\d+(\.\d)?',raw).group(0)
  15. class ParseResult:
  16. def __init__(self,url,tree,space_seperated_categories = 7,):
  17. self.url=url
  18. self.info_url = urllib.parse.urlparse(url)
  19. self.word_len = space_seperated_categories
  20. self.tree = tree
  21. self.time = datetime.datetime.today()
  22. self.info_product = self._get_product_info_()
  23. def _get_product_info_(self):
  24. host = self.info_url.netloc
  25. product_name = get_words(
  26. self.tree.xpath(xpaths[host]['name'])[0].text
  27. )
  28. other_raw = None
  29. try:
  30. other_raw = self.tree.xpath(xpaths[host]['other'])[0].text
  31. except KeyError:
  32. pass
  33. if host in ['www.gearbest.com']:
  34. if other.raw:
  35. price = '0.00'
  36. else:
  37. price = format_price(
  38. self.tree.xpath(xpaths[host]['price'])[0].text
  39. )
  40. else:
  41. price = format_price(
  42. self.tree.xpath(xpaths[host]['price'])[0].text
  43. )
  44. return {
  45. "product_name":product_name,
  46. "price":price,
  47. }