From b0dbd9fa036c7d59485ca5562ef455d87fbc5003 Mon Sep 17 00:00:00 2001
From: Raphael Roberts
Date: Sat, 15 Sep 2018 18:34:31 -0500
Subject: [PATCH] Not done; more changes will be made, but big first steps

---
 .gitignore       |   4 +-
 batch_process.py | 102 +++++++++------------------------------
 get_link.py      |  44 +++++++++++++-------
 price_finder.py  | 105 +++++++++++++++++++----------------------
 xpaths.json      |   6 ++-
 5 files changed, 101 insertions(+), 160 deletions(-)

diff --git a/.gitignore b/.gitignore
index 710d74c..536deeb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 __pycache__
 .gitignore
 /uBlock0.chromium
-/bg.html
\ No newline at end of file
+/bg.html
+/test_this_bullshit.py
+/output
\ No newline at end of file
diff --git a/batch_process.py b/batch_process.py
index 419d2af..8e6c246 100644
--- a/batch_process.py
+++ b/batch_process.py
@@ -1,10 +1,20 @@
-from price_finder import price_finder,BS
+from price_finder import ParseResult
+from lxml import etree
 from itertools import cycle
 import requests
+from urllib.parse import urlparse
 # import requests_html
 import sys
 from ipaddress import ip_address
 from get_link import get_link
+import json
+with open('xpaths.json') as file:
+    xpaths_data = json.load(file)
+
+parser = etree.HTMLParser()
+def text2tree(text):
+    # parse raw HTML into an lxml tree for ParseResult
+    return etree.fromstring(text,parser)
 
 def get_proxies(link='https://free-proxy-list.net/',country = 'United States'):
 ##    ses = requests_html.HTMLSession()
@@ -54,44 +64,18 @@ class proxy_iter:
         return self
     def blacklist(self,proxy):
         self.bad_proxies.add(proxy)
-# def render_page(link,proxies,ses):
-    # print(link)
-    # bad_proxies = set()
-    # page = None
-    # render_attempts = 0
-    # for proxy in proxies:
-        # print(proxy)
-        # try:
-            # r = ses.get(link,proxies={'http':proxy,'https':proxy})
-            # print('got')
-        # except (requests.exceptions.ProxyError,requests.exceptions.SSLError):
-            # print('!g!'+proxy)
-            # bad_proxies.add(proxy)
-            # continue
-        # if render_attempts < 3:
-            # render_attempts += 1
-            # try:
-                # r.html.render(timeout=10, sleep=10)
-                # print('rendered')
-            # except requests_html.MaxRetries:
-                # print('!r!'+proxy)
-                # bad_proxies.add(proxy)
-                # continue
-        # page = r.html.raw_html
-        # break
-    # if page:
-        # return page,{proxy},bad_proxies
-    # else:
-        # raise Exception("All proxies used up")
+
 def get_prices(links,use_proxies = True):
     pages = {}
+    xpaths = {link:xpaths_data[urlparse(link).netloc] for link in links}
+    # print(xpaths)
     if use_proxies:
         proxies = proxy_iter(get_proxies() + get_proxies('https://www.us-proxy.org/'))
         for link in links:
             for proxy in proxies:
                 print(link,proxy)
                 try:
-                    page = get_link(link,proxy=proxy)
+                    page = get_link(link,xpaths,proxy=proxy)
                     pages[link] = page
                     break
                 except Exception as e:
@@ -100,57 +84,11 @@ def get_prices(links,use_proxies = True):
         if len(links) != len(pages.keys()):
             raise Exception('all proxies suck')
     else:
-        pages = get_link(links)
+        pages = get_link(links,xpaths)
     ret = []
     for link in links:
-        ret.append(price_finder(
-            link,bs=BS(pages[link],'lxml')
-            ))
+        tree = text2tree(pages[link])
+        ret.append(
+            ParseResult(link,tree)
+            )
     return ret
-
-
-
-def get_prices_old(links,no_reuse = True,use_proxies=True):
-    if use_proxies:
-        proxies = set(get_proxies() + get_proxies('https://www.us-proxy.org/'))
-    ses = requests_html.HTMLSession()
-    ret = []
-    if use_proxies:
-        prev = set()
-    if use_proxies:
-        bad_proxies_set= set()
-    for link in links:
-        if use_proxies:
-            if no_reuse:
-                working_set = proxies-prev
-                # if use_proxies:
-            else:
-                working_set = proxies
-            page,prev,bad_proxies = render_page(link,working_set,ses)
-        else:
-            r=ses.get(link)
-            r.html.render()
-            page = r.html.raw_html
-
-        ret.append(price_finder(link,bs=BS(page,'lxml')))
-        if use_proxies:
-            bad_proxies_set |= bad_proxies
-    proxies -= bad_proxies
-    if use_proxies:
-        print(bad_proxies_set)
-    ses.close()
-    return ret
-
-if __name__ == "__main__":
-    # ses = requests_html.HTMLSession()
-    # proxies = get_proxies('https://www.us-proxy.org/')
-    # page = render_page('https://www.banggood.com/Aomway-Commander-Goggles-V1-2D-3D-40CH-5_8G-FPV-Video-Headset-Support-HDMI-DVR-Headtracker-p-1107684.html?cur_warehouse=CN',
-                        # proxies,
-                        # ses)
-
-    import saveto
-    import random
-    ql = saveto.load('quad_links')
-    random.shuffle(ql)
-    products = get_prices(ql,use_proxies=False)
-    # pass
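
[Usage sketch for the reworked get_prices(); illustrative only. The URL below
is a placeholder, and its hostname must have an entry in xpaths.json.]

    from batch_process import get_prices

    results = get_prices(
        ['https://hobbyking.com/en_us/example-product.html'],  # placeholder
        use_proxies=False,
        )
    for r in results:
        # each entry is a price_finder.ParseResult
        print(r.url, r.info_product['product_name'], r.info_product['price'])
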
diff --git a/get_link.py b/get_link.py
index 3124882..3fb9b31 100644
--- a/get_link.py
+++ b/get_link.py
@@ -1,50 +1,66 @@
 import pyppeteer
+import pyppeteer.errors
 import asyncio
 import os
-async def _get_link(browser,link):
+
+async def _get_link(browser,link,xpath):
     pages = await browser.pages()
     page = pages[0]
-    await page.goto(link,timeout=60_000)
+    await page.goto(link,waitUntil='domcontentloaded')
+
+    # wait for the name/price nodes to render; a timeout just falls through
+    xpath_list = [xpath['name'],xpath['price']]
+    for _xpath in xpath_list:
+        print(repr(_xpath))
+        try:
+            await page.waitForXPath(_xpath)
+        except pyppeteer.errors.TimeoutError:
+            pass
+    await asyncio.sleep(1)
     webpage = None
     for i in range(20):
         try:
             webpage = await page.content()
             break
         except:
-            time.sleep(1)
+            await asyncio.sleep(1)
     return webpage
-async def _single_link(browser,link):
-    webpage = await _get_link(browser,link)
+async def _single_link(browser,link,xpath):
+    webpage = await _get_link(browser,link,xpath)
     await browser.close()
     return webpage
-async def _multi_link(browser,links):
+async def _multi_link(browser,links,xpaths):
     results = {}
     for link in links:
-        webpage = await _get_link(browser,link)
+        xpath = xpaths[link]
+        webpage = await _get_link(browser,link,xpath)
         results[link] = webpage
     await browser.close()
     return results
-def get_link(links,headless = True,proxy = None):
-    ext = os.path.join(os.path.dirname(__file__),'uBlock0.chromium')
+def get_link(links,xpaths,headless = False,proxy = None):
     loop = asyncio.get_event_loop()
     run = loop.run_until_complete
     opts = {
         'headless':headless,
         }
-    opts['args'] = [f'--disable-extensions-except={ext}', f'--load-extension={ext}']
     if proxy:
-        opts['args'] += [f'--proxy-server={proxy}']
+        opts['args'] = [f'--proxy-server={proxy}']
+
+    else:
+        opts['args'] = []
+    ext = os.path.join(os.path.dirname(__file__),'uBlock0.chromium')
+    opts['args'] += [f'--disable-extensions-except={ext}', f'--load-extension={ext}']
     # print(opts)
    browser = run(pyppeteer.launch(**opts))
     try:
         if isinstance(links,list):
-            result = run(_multi_link(browser,links))
+            result = run(_multi_link(browser,links,xpaths))
         else:
-            result = run(_single_link(browser,links))
+            result = run(_single_link(browser,links,xpaths[links]))
         return result
     except Exception as e:
         run(browser.close())
-        raise e
\ No newline at end of file
+        raise e
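
[Sketch of calling get_link() directly; illustrative only. The URL and the
price selector are placeholders; note the single-link path looks up xpaths
by the URL itself.]

    from get_link import get_link

    url = 'https://hobbyking.com/en_us/example-product.html'  # placeholder
    xpaths = {url: {'name': "//h1[contains(@class,'product-name')]",
                    'price': "//span[@class='price']"}}  # placeholder xpaths
    html = get_link(url, xpaths, headless=True)  # page source string, or None
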
diff --git a/price_finder.py b/price_finder.py
index e27f0dd..97fb7e9 100644
--- a/price_finder.py
+++ b/price_finder.py
@@ -1,81 +1,62 @@
-import urllib
-from fake_useragent import UserAgent
-from bs4 import BeautifulSoup as BS
-from requests_html import HTMLSession
 import re
 import datetime
-# import pytz
-import copy
+from urllib.parse import urlparse
+import json
+with open('xpaths.json') as file:
+    xpaths = json.load(file)
 
-user_agent = UserAgent().chrome
-debug = None
-def get_words(string,n):
-    words = re.finditer(r"(\b[^ \n]+\b)",string)
+def get_words(raw,n):
+    words = re.finditer(r"(\b[^ \n]+\b)",raw)
     word_list = list(match.group(0) for match in words)
     if len(word_list) > n:
         word_list = word_list[:n]
     return ' '.join(word_list)
-def get_page(url):
-    page = None
-    while not page:
-        page = urllib.request.Request(url,headers = {"User-Agent":user_agent})
-        page = str(urllib.request.urlopen(page).read())
-
-    return page
-def get_BS(url):
-    return BS(get_page(url),"lxml")
+def format_price(raw):
+    return re.search(r'\d+(\.\d+)?',raw).group(0)
+
+class ParseResult:
 
-class price_finder:
-    page_funcs = {
-        "www.amazon.com":{
-            "name":lambda page: re.sub(r"( {2,}|\n|\\n)","",page.find("span",id="productTitle").text),
-            "price":lambda page: page.find(name = "span",id = re.compile("priceblock.*")).text
-            },
-        "www.banggood.com":{
-            "name":lambda page: page.find("h1",attrs = {"itemprop":"name"}).text,
-            "price":lambda page: page.find("div",attrs = {"class":"now"}).get("oriprice")
-            },
-        "www.dalprops.com":{
-            "name":lambda page: page.find("h1",attrs = {"class":"product_title"}).text,
-            "price":lambda page: page.find("meta",attrs = {"itemprop":"price"}).get("content")
-            },
-        "www.gearbest.com":{
-            "name":lambda page:re.sub(" {2,}|\n","",page.find("div",attrs = {"class":"goodsIntro_titleWrap"}).find("h1").text),
-            "price":lambda page: page.find("span",attrs={"class":"goodsIntro_price"}).text
-            },
-        "hobbyking.com":{
-            "name":lambda page: page.find("h1",attrs={"class":"product-name"}).text,
-            "price":lambda page: page.find("span",id = re.compile(r"product-price.*")).find("span",attrs={"class":"price"}).text
-            },
-        "www.getfpv.com":{
-            "name": lambda page: re.sub(r"\\n|\n","", page.find("div",attrs={"class":"product-name"}).text),
-            "price": lambda page: re.sub(r"\\n|\n","", page.find("span",attrs={"id":re.compile("product-price.*")}).text)
-            }
-        }
-    def __init__(self,url,space_seperated_categories = 7,bs=None):
+    def __init__(self,url,tree,space_separated_categories = 7):
         self.url=url
-        self.info_url = urllib.parse.urlparse(url)
-        self.word_len = space_seperated_categories
-        if self.info_url.netloc not in price_finder.page_funcs.keys():
-            raise NotImplementedError("Not implemented for {}".format(self.info_url.netloc))
-        if bs:
-            self.bs= bs
-        else:
-            self.bs = get_BS(url)
-        # self.words = re_words(space_seperated_categories)
+        self.info_url = urlparse(url)
+        self.word_len = space_separated_categories
+        self.tree = tree
+
         self.time = datetime.datetime.today()
         self.info_product = self._get_product_info_()
     def _get_product_info_(self):
-        funcs = price_finder.page_funcs[self.info_url.netloc]
-        # print(self.url)
-
+        host = self.info_url.netloc
+
+        product_name = get_words(
+            self.tree.xpath(xpaths[host]['name'])[0].text,
+            self.word_len
+            )
+
+        # 'other' is an optional per-host xpath; only gearbest uses it below
+        other_raw = None
+        try:
+            other_raw = self.tree.xpath(xpaths[host]['other'])[0].text
+        except (KeyError,IndexError):
+            pass
+
+        if host in ['www.gearbest.com']:
+            # www.gearbest.com: if the optional 'other' node matched, report 0.00
+            if other_raw:
+                price = '0.00'
+            else:
+                price = format_price(
+                    self.tree.xpath(xpaths[host]['price'])[0].text
+                    )
+
+        else:
+            price = format_price(
+                self.tree.xpath(xpaths[host]['price'])[0].text
+                )
+
         return {
-            "product_name":get_words(funcs["name"](self.bs),self.word_len),
-            "price":funcs["price"](self.bs).replace("$",""),
-            }
-    # def to_json(self):
-        # ret = copy.deepcopy(self.__dict__)
-        # ret['time'] = ret['time'].
\ No newline at end of file
+            "product_name":product_name,
+            "price":price,
+            }
\ No newline at end of file
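
[Minimal sketch of ParseResult on its own; assumes xpaths.json, including the
hobbyking.com entry added below, sits in the working directory. The HTML is a
stand-in for a real product page.]

    from lxml import etree
    from price_finder import ParseResult

    html = ('<html><body><h1 class="product-name">Example Quad</h1>'
            '<span class="regular-price"><span class="price">$19.99'
            '</span></span></body></html>')  # stand-in page source
    tree = etree.fromstring(html, etree.HTMLParser())
    result = ParseResult('https://hobbyking.com/en_us/example.html', tree)
    print(result.info_product)  # {'product_name': 'Example Quad', 'price': '19.99'}
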
diff --git a/xpaths.json b/xpaths.json
index 1648c41..876a324 100644
--- a/xpaths.json
+++ b/xpaths.json
@@ -19,5 +19,9 @@
     "www.dalprops.com": {
         "name": "//h1[@itemprop='name']",
         "price": "//*[@id='product-price']"
-    }
+    },
+    "hobbyking.com": {
+        "name": "//h1[contains(@class,'product-name')]",
+        "price": "//p[@class='special-price']/span[@class='price'] | //span[@class='regular-price']/span[@class='price']"
+    }
 }
\ No newline at end of file
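
[For reference: each xpaths.json entry maps a hostname to the selectors used
by ParseResult and get_link. 'name' and 'price' are required; 'other' is an
optional extra node, currently only consulted for www.gearbest.com. A quick
way to eyeball the mapping:]

    import json

    with open('xpaths.json') as f:
        xpaths = json.load(f)
    for host, selectors in xpaths.items():
        print(host, sorted(selectors))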