You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
import copy
import datetime
import json
import re

# Per-host XPath table, indexed below as xpaths[host][field] where host is a
# URL netloc and field is 'name', 'price' or 'other'.
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying
# on the platform default.
with open('xpaths.json', encoding='utf-8') as file:
    xpaths = json.load(file)
def get_words(raw, n):
    """Return the leading words of ``raw`` joined by single spaces.

    A word is a run of characters other than space and newline that starts
    and ends on a regex word boundary, so punctuation at either edge of a
    token is left out.

    Args:
        raw: Text to split into words.
        n: Maximum number of words to keep; when more are found the list is
            cut with the slice ``[:n]``.

    Returns:
        The kept words joined with ``' '`` (an empty string when none match).
    """
    # The single capture group spans the whole pattern, so findall yields the
    # full text of every match.
    tokens = re.findall(r"(\b[^ \n]+\b)", raw)
    kept = tokens[:n] if len(tokens) > n else tokens
    return ' '.join(kept)
def format_price(raw):
    """Extract the first number found in ``raw`` as a string.

    Args:
        raw: Text containing a price, e.g. ``"$12.99"``.

    Returns:
        The first run of digits, together with its fractional part when a
        dot followed by digits comes directly after it (``"12.99"``, ``"5"``).

    Raises:
        AttributeError: If ``raw`` contains no digit, because ``re.search``
            then returns ``None``.
    """
    # `\d+` after the dot keeps every fractional digit; a single `\d` there
    # would cut "12.99" down to "12.9".
    return re.search(r'\d+(\.\d+)?', raw).group(0)
class ParseResult:
    """Product name and price extracted from one parsed product page.

    Attributes:
        url: The page URL as passed in.
        info_url: Result of ``urllib.parse.urlparse(url)``; its ``netloc``
            selects the per-host entry in ``xpaths``.
        word_len: Maximum number of words kept in the product name.
        tree: Parsed document; only its ``xpath(expr)`` method is used, and
            the first result of each query must expose ``.text``.
        time: Naive local timestamp taken when the result was built.
        info_product: Dict with keys ``"product_name"`` and ``"price"``.
    """

    # Hosts whose price is reported as '0.00' whenever the 'other' XPath
    # yields non-empty text.
    _ZERO_PRICE_HOSTS = ('www.gearbest.com',)

    def __init__(self, url, tree, space_seperated_categories=7,):
        """Parse ``url`` and immediately extract the product info.

        Args:
            url: Address of the product page.
            tree: Parsed document for that page.
            space_seperated_categories: Maximum number of words kept in the
                product name.

        Raises:
            KeyError: If the URL's host has no entry in ``xpaths``.
        """
        # Imported locally: the module-level imports do not bring in
        # urllib.parse, and it is only needed here.
        import urllib.parse

        self.url = url
        self.info_url = urllib.parse.urlparse(url)
        self.word_len = space_seperated_categories
        self.tree = tree
        self.time = datetime.datetime.today()
        self.info_product = self._get_product_info_()

    def _get_product_info_(self):
        """Build the ``{"product_name", "price"}`` dict for this page.

        Returns:
            A dict whose ``"product_name"`` is the first ``word_len`` words of
            the name node and whose ``"price"`` is a numeric string.

        Raises:
            KeyError: If the host, or its 'name'/'price' entry, is missing
                from ``xpaths``.
            IndexError: If the 'name' or 'price' XPath matches nothing.
        """
        host = self.info_url.netloc
        host_xpaths = xpaths[host]

        # get_words requires the word limit; word_len holds it.
        product_name = get_words(
            self.tree.xpath(host_xpaths['name'])[0].text,
            self.word_len,
        )

        # The 'other' node is optional: the host may not configure it
        # (KeyError) or the page may not contain it (IndexError).
        other_raw = None
        try:
            other_raw = self.tree.xpath(host_xpaths['other'])[0].text
        except (KeyError, IndexError):
            pass

        if host in self._ZERO_PRICE_HOSTS and other_raw:
            price = '0.00'
        else:
            price = format_price(
                self.tree.xpath(host_xpaths['price'])[0].text
            )

        return {
            "product_name": product_name,
            "price": price,
        }
|