From 891684741a4844018ceee1c36a0b57fe2f8abf4c Mon Sep 17 00:00:00 2001
From: Raphael Roberts
Date: Sat, 15 Sep 2018 16:14:11 -0500
Subject: [PATCH] initial commit

---
 .gitignore       |   4 +-
 batch_process.py | 167 +++++++++++++++++++++++++++++++++++------------
 get_link.py      |  50 ++++++++++++++
 price_finder.py  |   6 +-
 proxy_class.py   |  21 ++++++
 xpaths.json      |  23 +++++++
 6 files changed, 226 insertions(+), 45 deletions(-)
 create mode 100644 get_link.py
 create mode 100644 proxy_class.py
 create mode 100644 xpaths.json

diff --git a/.gitignore b/.gitignore
index 3056b4a..710d74c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 __pycache__
-.gitignore
\ No newline at end of file
+.gitignore
+/uBlock0.chromium
+/bg.html
\ No newline at end of file
diff --git a/batch_process.py b/batch_process.py
index 01fa96f..419d2af 100644
--- a/batch_process.py
+++ b/batch_process.py
@@ -1,12 +1,15 @@
 from price_finder import price_finder,BS
 from itertools import cycle
 import requests
-import requests_html
+# import requests_html
+import sys
 from ipaddress import ip_address
-def get_proxies(country = 'United States'):
-    ses = requests_html.HTMLSession()
-    r = ses.get('https://free-proxy-list.net/')
-    page = BS(r.html.raw_html,'lxml')
+from get_link import get_link
+
+def get_proxies(link='https://free-proxy-list.net/',country = 'United States'):
+##    ses = requests_html.HTMLSession()
+    r = requests.get(link)
+    page = BS(r.content,'lxml')
     table = page.find(id='proxylisttable')
     headers,*rows = table.find_all('tr')
     headers = list(tag.text.lower() for tag in headers.find_all('th'))
@@ -21,55 +24,133 @@ def get_proxies(country = 'United States'):
         try:
             ip_address(tr[ip])
             assert int(port) >= 0 and int(port) < 2**16
-            if tr[https_support] == "yes" and tr[country_id] == country:
+            if (tr[https_support] == "yes" or False) and tr[country_id] == country:
                 proxies.append('{}:{}'.format(tr[ip],tr[port]))
         except (ValueError,AssertionError):
             pass
         except Exception as e:
             print(row)
             raise e
-    return cycle(proxies)
-def get_prices(links):
-    proxies = get_proxies()
-    s = requests_html.HTMLSession()
+    return proxies
+
+class proxy_iter:
+    def __init__(self,proxies):
+        self._proxies = set(proxies)
+        self.proxies = self._proxies.copy()
+        self.bad_proxies = set()
+        # self.used_proxies = {}
+
+    def __next__(self):
+        self.proxies -= self.bad_proxies
+        if len(self.proxies) == 0:
+            raise StopIteration
+
+        elem = self.proxies.pop()
+        if len(self.proxies) == 0:
+            self.proxies = self._proxies.copy()
+        return elem
+
+    def __iter__(self):
+        return self
+    def blacklist(self,proxy):
+        self.bad_proxies.add(proxy)
+# def render_page(link,proxies,ses):
+    # print(link)
+    # bad_proxies = set()
+    # page = None
+    # render_attempts = 0
+    # for proxy in proxies:
+        # print(proxy)
+        # try:
+            # r = ses.get(link,proxies={'http':proxy,'https':proxy})
+            # print('got')
+        # except (requests.exceptions.ProxyError,requests.exceptions.SSLError):
+            # print('!g!'+proxy)
+            # bad_proxies.add(proxy)
+            # continue
+        # if render_attempts < 3:
+            # render_attempts += 1
+            # try:
+                # r.html.render(timeout=10, sleep=10)
+                # print('rendered')
+            # except requests_html.MaxRetries:
+                # print('!r!'+proxy)
+                # bad_proxies.add(proxy)
+                # continue
+        # page = r.html.raw_html
+        # break
+    # if page:
+        # return page,{proxy},bad_proxies
+    # else:
+        # raise Exception("All proxies used up")
+def get_prices(links,use_proxies = True):
+    pages = {}
+    if use_proxies:
+        proxies = proxy_iter(get_proxies() + get_proxies('https://www.us-proxy.org/'))
+        for link in links:
+            for proxy in proxies:
+                print(link,proxy)
+                try:
+                    page = get_link(link,proxy=proxy)
+                    pages[link] = page
+                    break
+                except Exception as e:
+                    print(type(e),e,file=sys.stdout)
+                    proxies.blacklist(proxy)
+        if len(links) != len(pages.keys()):
+            raise Exception('all proxies suck')
+    else:
+        pages = get_link(links)
     ret = []
-    bad_proxies= set()
     for link in links:
-        page = None
-        render_tries = 0
-        print(link)
-        while not page:
-            proxy = next(proxies)
-            while proxy in bad_proxies:
-                proxy = next(proxies)
-            print(proxy)
-            try:
-                r = s.get(link,proxies={'http':proxy,'https':proxy})
-                print('got')
-                try:
-                    render_tries += 1
-                    r.html.render()
-                    print('rendered')
-                except requests_html.MaxRetries:
-                    if render_tries > 2:
-                        pass
-                    else:
-                        print('!'+proxy)
-                        bad_proxies.update([proxy])
-                        continue
-                page = r.html.raw_html
-                ret.append(price_finder(link,bs=BS(page,'lxml')))
-
-            except (requests.exceptions.ProxyError,requests.exceptions.SSLError):
-                print('!'+proxy)
-                bad_proxies.update([proxy])
-
-    print(bad_proxies)
-    s.close()
+        ret.append(price_finder(
+            link,bs=BS(pages[link],'lxml')
+        ))
     return ret
+
+
+
+def get_prices_old(links,no_reuse = True,use_proxies=True):
+    if use_proxies:
+        proxies = set(get_proxies() + get_proxies('https://www.us-proxy.org/'))
+    ses = requests_html.HTMLSession()
+    ret = []
+    if use_proxies:
+        prev = set()
+    if use_proxies:
+        bad_proxies_set= set()
+    for link in links:
+        if use_proxies:
+            if no_reuse:
+                working_set = proxies-prev
+            # if use_proxies:
+            else:
+                working_set = proxies
+            page,prev,bad_proxies = render_page(link,working_set,ses)
+        else:
+            r=ses.get(link)
+            r.html.render()
+            page = r.html.raw_html
+
+        ret.append(price_finder(link,bs=BS(page,'lxml')))
+        if use_proxies:
+            bad_proxies_set |= bad_proxies
+            proxies -= bad_proxies
+    if use_proxies:
+        print(bad_proxies_set)
+    ses.close()
+    return ret
+
 if __name__ == "__main__":
+    # ses = requests_html.HTMLSession()
+    # proxies = get_proxies('https://www.us-proxy.org/')
+    # page = render_page('https://www.banggood.com/Aomway-Commander-Goggles-V1-2D-3D-40CH-5_8G-FPV-Video-Headset-Support-HDMI-DVR-Headtracker-p-1107684.html?cur_warehouse=CN',
+    #                    proxies,
+    #                    ses)
     import saveto
     import random
     ql = saveto.load('quad_links')
     random.shuffle(ql)
-    products = get_prices(ql)
\ No newline at end of file
+    products = get_prices(ql,use_proxies=False)
+    # pass
diff --git a/get_link.py b/get_link.py
new file mode 100644
index 0000000..3124882
--- /dev/null
+++ b/get_link.py
@@ -0,0 +1,50 @@
+import pyppeteer
+import asyncio
+import os
+async def _get_link(browser,link):
+    pages = await browser.pages()
+    page = pages[0]
+    await page.goto(link,timeout=60_000)
+    webpage = None
+    for i in range(20):
+        try:
+            webpage = await page.content()
+            break
+        except Exception:
+            await asyncio.sleep(1)  # content not ready yet; wait and retry
+    return webpage
+
+async def _single_link(browser,link):
+    webpage = await _get_link(browser,link)
+    await browser.close()
+    return webpage
+
+async def _multi_link(browser,links):
+    results = {}
+    for link in links:
+        webpage = await _get_link(browser,link)
+        results[link] = webpage
+    await browser.close()
+    return results
+
+def get_link(links,headless = True,proxy = None):
+    ext = os.path.join(os.path.dirname(__file__),'uBlock0.chromium')
+    loop = asyncio.get_event_loop()
+    run = loop.run_until_complete
+    opts = {
+        'headless':headless,
+    }
+    opts['args'] = [f'--disable-extensions-except={ext}', f'--load-extension={ext}']
+    if proxy:
+        opts['args'] += [f'--proxy-server={proxy}']
+    # print(opts)
+    browser = run(pyppeteer.launch(**opts))
+    try:
+        if isinstance(links,list):
+            result = run(_multi_link(browser,links))
+        else:
+            result = run(_single_link(browser,links))
+        return result
+    except Exception as e:
+        run(browser.close())
+        raise e
\ No newline at end of file
diff --git a/price_finder.py b/price_finder.py
index 7eca6bc..e27f0dd 100644
--- a/price_finder.py
+++ b/price_finder.py
@@ -4,6 +4,8 @@ from bs4 import BeautifulSoup as BS
 from requests_html import HTMLSession
 import re
 import datetime
+# import pytz
+import copy
 
 user_agent = UserAgent().chrome
 debug = None
@@ -74,4 +76,6 @@ class price_finder:
             "product_name":get_words(funcs["name"](self.bs),self.word_len),
             "price":funcs["price"](self.bs).replace("$",""),
         }
-
+    # def to_json(self):
+    #     ret = copy.deepcopy(self.__dict__)
+    #     ret['time'] = ret['time'].
\ No newline at end of file
diff --git a/proxy_class.py b/proxy_class.py
new file mode 100644
index 0000000..8b52f03
--- /dev/null
+++ b/proxy_class.py
@@ -0,0 +1,21 @@
+class proxy_iter:
+    def __init__(self,proxies):
+        self._proxies = set(proxies)
+        self.proxies = self._proxies.copy()
+        self.bad_proxies = set()
+        # self.used_proxies = {}
+
+    def __next__(self):
+        self.proxies -= self.bad_proxies
+        if len(self.proxies) == 0:
+            raise StopIteration
+
+        elem = self.proxies.pop()
+        if len(self.proxies) == 0:
+            self.proxies = self._proxies.copy()
+        return elem
+
+    def __iter__(self):
+        return self
+    def blacklist(self,proxy):
+        self.bad_proxies.add(proxy)
\ No newline at end of file
diff --git a/xpaths.json b/xpaths.json
new file mode 100644
index 0000000..1648c41
--- /dev/null
+++ b/xpaths.json
@@ -0,0 +1,23 @@
+{
+    "www.banggood.com": {
+        "name": "//h1[@itemprop='name']",
+        "price": "//div[@class='now']"
+    },
+    "www.gearbest.com": {
+        "name": "//h1[@class='goodsIntro_title']",
+        "price": "//span[contains(@class,'goodsIntro_price')]",
+        "other": "//div[@class='goodsIntro_noticeSubmit']"
+    },
+    "www.amazon.com": {
+        "name": "//span[@id='productTitle']",
+        "price": "//span[@id='priceblock_dealprice' or @id='priceblock_ourprice']"
+    },
+    "www.getfpv.com": {
+        "name": "//div[@class='product-name']/span",
+        "price": "//div[@class='price-box']/p[@class='special-price']/span[@class='price'] | //div[@class='price-box']/span[@class='regular-price']/span"
+    },
+    "www.dalprops.com": {
+        "name": "//h1[@itemprop='name']",
+        "price": "//*[@id='product-price']"
+    }
+}
\ No newline at end of file
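
Note on the proxy rotation (illustration, not part of the patch): proxy_iter,
which appears both inline in batch_process.py and as the new proxy_class.py,
cycles through the scraped proxy pool indefinitely, refilling the working set
whenever it empties and skipping anything blacklisted. It only raises
StopIteration once every proxy in the pool has been blacklisted, which is what
lets get_prices() treat an exhausted loop as "every proxy failed". A minimal
usage sketch under those semantics; the addresses are placeholders, and the
requests-based fetch() below is an illustrative stand-in for get_link():

    import requests
    from proxy_class import proxy_iter

    def fetch(url, proxy):
        # any connection problem surfaces as requests.RequestException
        r = requests.get(url, proxies={'http': proxy, 'https': proxy}, timeout=10)
        r.raise_for_status()
        return r.content

    pool = proxy_iter(['1.2.3.4:8080', '5.6.7.8:3128'])  # placeholder addresses
    page = None
    for proxy in pool:                   # rotates until every proxy is blacklisted
        try:
            page = fetch('https://example.com', proxy)
            break                        # success: stop rotating
        except requests.RequestException:
            pool.blacklist(proxy)        # this proxy is never handed out again
    if page is None:
        raise RuntimeError('every proxy in the pool failed')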