You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

62 lines
1.6 KiB

import copy
import datetime
import json
import re
import urllib.parse
# Per-host XPath expressions, keyed by the URL's netloc. ParseResult reads the
# 'name' and 'price' entries of a host and, when present, its 'other' entry.
# The path is resolved against the current working directory.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale encoding.
with open('xpaths.json', encoding='utf-8') as file:
    xpaths = json.load(file)
def get_words(raw, n):
    """Return at most the first ``n`` words of ``raw``, joined by one space.

    A word is a run of characters other than space and newline that begins
    and ends on a regex word boundary, so punctuation at the edges of a word
    (e.g. the comma in ``"Hello,"``) is left out.
    """
    # The single capture group spans the whole pattern, so findall() yields
    # exactly the full match text of every word.
    tokens = re.findall(r"(\b[^ \n]+\b)", raw)
    # Slicing is a no-op when there are n words or fewer.
    return ' '.join(tokens[:n])
def format_price(raw):
    """Return the first number found in ``raw`` as a string.

    The number is a run of digits optionally followed by a dot and one or
    more digits, e.g. ``"$12.99"`` -> ``"12.99"``. Anything around it
    (currency symbols, labels) is ignored. Thousands separators are not
    understood: matching stops at the first non-digit, non-dot character.

    Raises:
        AttributeError: if ``raw`` contains no digit (``re.search`` returns
            ``None``).
    """
    # `\d+` after the dot keeps every decimal digit; a single `\d` would
    # truncate "12.99" to "12.9".
    return re.search(r'\d+(\.\d+)?', raw).group(0)
class ParseResult:
    """Product name and price extracted from one parsed product page.

    Attributes set by ``__init__``:
        url: the page URL as passed in.
        info_url: result of ``urllib.parse.urlparse(url)``; its ``netloc``
            selects the per-host entry in the module-level ``xpaths``.
        word_len: maximum number of words kept in the product name.
        tree: parsed document. It must provide ``xpath(expr)`` returning a
            sequence of elements with a ``.text`` attribute
            (presumably an lxml tree -- confirm against the caller).
        time: naive local ``datetime`` taken when the result was built.
        info_product: dict with the keys ``"product_name"`` and ``"price"``.
    """

    # Hosts on which a non-empty "other" element forces the price to '0.00'.
    # NOTE(review): presumably "other" marks an unavailable product -- confirm.
    _ZERO_PRICE_HOSTS = ('www.gearbest.com',)

    def __init__(self, url, tree, space_seperated_categories=7):
        """Store the inputs and extract the product info immediately.

        Args:
            url: URL of the page that ``tree`` was parsed from.
            tree: parsed document exposing ``xpath()``.
            space_seperated_categories: maximum number of words to keep in
                the product name.
        """
        self.url = url
        self.info_url = urllib.parse.urlparse(url)
        self.word_len = space_seperated_categories
        self.tree = tree
        self.time = datetime.datetime.today()
        self.info_product = self._get_product_info_()

    def _get_product_info_(self):
        """Extract the product name and price using this host's XPaths.

        Returns:
            dict: ``{"product_name": str, "price": str}``.

        Raises:
            KeyError: if the host has no entry in ``xpaths``, or the entry
                lacks ``'name'`` or ``'price'``.
            IndexError: if the name or price XPath matches nothing.
        """
        host = self.info_url.netloc
        site_xpaths = xpaths[host]

        # Truncate the name to the configured number of words.
        product_name = get_words(
            self.tree.xpath(site_xpaths['name'])[0].text,
            self.word_len,
        )

        # The "other" element is optional: the host may define no XPath for
        # it, or the XPath may match nothing on this particular page.
        other_raw = None
        other_xpath = site_xpaths.get('other')
        if other_xpath is not None:
            other_matches = self.tree.xpath(other_xpath)
            if other_matches:
                other_raw = other_matches[0].text

        if host in self._ZERO_PRICE_HOSTS and other_raw:
            price = '0.00'
        else:
            price = format_price(
                self.tree.xpath(site_xpaths['price'])[0].text
            )

        return {
            "product_name": product_name,
            "price": price,
        }