You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
62 lines
1.6 KiB
62 lines
1.6 KiB
import copy
import datetime
import json
import re
import urllib.parse
|
|
# Per-host XPath expressions, loaded once at import time. The mapping is
# indexed by URL netloc and then by field name ('name', 'price' and, for
# some hosts, 'other'). The path is relative to the current working
# directory. JSON is read as UTF-8 explicitly so the result does not depend
# on the platform's locale encoding.
with open('xpaths.json', encoding='utf-8') as file:
    xpaths = json.load(file)
|
|
|
|
def get_words(raw, n):
    """Return the first ``n`` words of ``raw`` joined by single spaces.

    A word is a run of characters other than space or newline that starts
    and ends on a regex word boundary, so punctuation at the edges of a
    token is dropped.

    Args:
        raw: Text to split into words.
        n: Maximum number of words to keep.

    Returns:
        The kept words joined with a single space; an empty string when
        ``raw`` contains no words.
    """
    # The single capturing group spans the whole pattern, so findall yields
    # exactly the matched text for every hit.
    tokens = re.findall(r"(\b[^ \n]+\b)", raw)
    # Slicing is a no-op when there are fewer than n tokens.
    return ' '.join(tokens[:n])
|
|
|
|
def format_price(raw):
    """Return the first decimal number found in ``raw`` as a string.

    The number is one or more digits optionally followed by a dot and one
    or more digits, e.g. ``"$12.99"`` -> ``"12.99"``. Every fractional digit
    is kept, not just the first one.

    Args:
        raw: Text containing a price.

    Returns:
        The matched number, unchanged, as a string.

    Raises:
        AttributeError: If ``raw`` contains no digits (the search yields
            no match).
    """
    # NOTE: grouping separators are not understood; "1,299.00" yields "1".
    return re.search(r'\d+(?:\.\d+)?', raw).group(0)
|
|
|
|
class ParseResult:
    """Product name and price extracted from a parsed product page.

    The XPath expressions used for extraction come from the module-level
    ``xpaths`` mapping, indexed by the URL's netloc.

    Attributes are set in ``__init__``:
        url: The URL as given.
        info_url: ``urllib.parse.urlparse(url)``.
        word_len: Maximum number of words kept in the product name.
        tree: The parsed document that is queried.
        time: Naive local ``datetime`` captured at construction.
        info_product: Dict with ``"product_name"`` and ``"price"``.
    """

    def __init__(self, url, tree, space_seperated_categories=7,):
        """Store the inputs and extract the product information.

        Args:
            url: Address of the product page; its netloc selects the
                XPath set.
            tree: Object exposing ``.xpath(expr)`` that returns a sequence
                of elements with a ``.text`` attribute (presumably an lxml
                tree -- confirm against the caller).
            space_seperated_categories: Maximum number of words to keep in
                the product name.
        """
        self.url = url
        self.info_url = urllib.parse.urlparse(url)
        self.word_len = space_seperated_categories
        self.tree = tree

        self.time = datetime.datetime.today()
        self.info_product = self._get_product_info_()

    def _get_product_info_(self):
        """Extract the product name and price from ``self.tree``.

        Returns:
            A dict ``{"product_name": str, "price": str}``.

        Raises:
            KeyError: If the host or a required field ('name', 'price') is
                missing from ``xpaths``.
            IndexError: If a required XPath matches nothing.
        """
        host = self.info_url.netloc

        # Truncate the name to the configured number of words.
        product_name = get_words(
            self.tree.xpath(xpaths[host]['name'])[0].text,
            self.word_len,
        )

        # 'other' is optional: the host may not configure it (KeyError) or
        # the page may not contain the element (IndexError).
        other_raw = None
        try:
            other_raw = self.tree.xpath(xpaths[host]['other'])[0].text
        except (KeyError, IndexError):
            pass

        # On gearbest a non-empty 'other' element overrides the price with
        # a fixed '0.00'; otherwise the price is read from the page.
        if host in ['www.gearbest.com'] and other_raw:
            price = '0.00'
        else:
            price = format_price(
                self.tree.xpath(xpaths[host]['price'])[0].text
            )

        return {
            "product_name": product_name,
            "price": price,
        }
|