"""Extract a product's name and price from a parsed page using per-host XPaths.

The XPath expressions are loaded at import time from ``xpaths.json`` (resolved
against the current working directory).  The code below reads the mapping as
``xpaths[host]['name']``, ``xpaths[host]['price']`` and, optionally,
``xpaths[host]['other']``.
"""
import copy
import datetime
import json
import re
import urllib.parse

with open('xpaths.json') as file:
    xpaths = json.load(file)

# A "word" is a run of characters containing no space or newline, trimmed to
# word boundaries on both sides.
_WORD_RE = re.compile(r"(\b[^ \n]+\b)")

# Integer part plus an optional fractional part of any length.
_PRICE_RE = re.compile(r'\d+(\.\d+)?')

# Hosts for which a non-empty 'other' text overrides the scraped price.
_ZERO_PRICE_ON_OTHER_HOSTS = ('www.gearbest.com',)


def get_words(raw: str, n: int) -> str:
    """Return the first *n* words of *raw* joined by single spaces.

    Args:
        raw: Text to split into words.
        n: Maximum number of words to keep.

    Returns:
        The kept words joined with ``' '``; an empty string if *raw*
        contains no words.
    """
    words = [match.group(0) for match in _WORD_RE.finditer(raw)]
    # Slicing is a no-op when fewer than n words were found.
    return ' '.join(words[:n])


def format_price(raw: str) -> str:
    """Return the first decimal number found in *raw*, as a string.

    Args:
        raw: Text containing a price, e.g. ``"$12.99"``.

    Returns:
        The matched digits, including the whole fractional part when present
        (``"12.99"``, not ``"12.9"``).

    Raises:
        ValueError: If *raw* contains no digits.
    """
    match = _PRICE_RE.search(raw)
    if match is None:
        raise ValueError("no price found in {!r}".format(raw))
    return match.group(0)


class ParseResult:
    """Product information scraped from one parsed page.

    Attributes set by the constructor:
        url: The URL passed in.
        info_url: Result of ``urllib.parse.urlparse(url)``.
        word_len: Maximum number of words kept in the product name.
        tree: The parsed document; must provide an ``xpath()`` method.
        time: Naive local timestamp taken when the instance was created.
        info_product: Dict with ``"product_name"`` and ``"price"`` keys.
    """

    def __init__(self, url, tree, space_seperated_categories=7):
        """Store the inputs and immediately extract the product info.

        Args:
            url: Address of the page; its network location selects which
                entry of ``xpaths`` is used.
            tree: Parsed document exposing ``xpath(expr)``, whose results
                have a ``.text`` attribute.
            space_seperated_categories: Maximum number of words to keep in
                the product name.
        """
        self.url = url
        self.info_url = urllib.parse.urlparse(url)
        self.word_len = space_seperated_categories
        self.tree = tree
        self.time = datetime.datetime.today()
        self.info_product = self._get_product_info_()

    def _get_product_info_(self):
        """Build the ``{"product_name", "price"}`` dict for this page.

        Returns:
            A dict with the name truncated to ``self.word_len`` words and the
            price as a string.

        Raises:
            KeyError: If the host has no entry in ``xpaths``, or the entry
                lacks ``'name'`` / ``'price'``.
            IndexError: If the name or price XPath matches nothing.
            ValueError: If the price text contains no digits.
        """
        host = self.info_url.netloc
        host_xpaths = xpaths[host]

        name_text = self.tree.xpath(host_xpaths['name'])[0].text
        product_name = get_words(name_text, self.word_len)

        # 'other' is optional: the key may be missing from the config, or the
        # expression may match no node on this particular page.
        try:
            other_raw = self.tree.xpath(host_xpaths['other'])[0].text
        except (KeyError, IndexError):
            other_raw = None

        # NOTE(review): presumably the 'other' node marks an unavailable or
        # otherwise unpriced product on these hosts -- confirm with the
        # xpaths.json author.
        if host in _ZERO_PRICE_ON_OTHER_HOSTS and other_raw:
            price = '0.00'
        else:
            price = format_price(
                self.tree.xpath(host_xpaths['price'])[0].text
            )

        return {
            "product_name": product_name,
            "price": price,
        }