From f76f5dfefc00cff55ef3c193eff4fc3ed260416d Mon Sep 17 00:00:00 2001
From: Raphael Roberts
Date: Sat, 1 Sep 2018 16:10:50 -0500
Subject: [PATCH] get_link uses chrome to open links

we'll see how this one works
---
 batch_process.py | 139 +++++++++++++++++++++++++++++++++---------------
 get_link.py      |  42 ++++++++++++++++
 price_finder.py  |   6 ++-
 3 files changed, 138 insertions(+), 49 deletions(-)
 create mode 100644 get_link.py

diff --git a/batch_process.py b/batch_process.py
index 01fa96f..4152605 100644
--- a/batch_process.py
+++ b/batch_process.py
@@ -1,11 +1,13 @@
 from price_finder import price_finder,BS
 from itertools import cycle
 import requests
-import requests_html
+import requests_html  # still used by the legacy get_prices_old() path
 from ipaddress import ip_address
-def get_proxies(country = 'United States'):
-    ses = requests_html.HTMLSession()
-    r = ses.get('https://free-proxy-list.net/')
-    page = BS(r.html.raw_html,'lxml')
+from get_link import get_link
+
+def get_proxies(link='https://free-proxy-list.net/',country = 'United States'):
+    # The proxy table is static HTML, so a plain requests fetch is enough.
+    r = requests.get(link)
+    page = BS(r.content,'lxml')
     table = page.find(id='proxylisttable')
     headers,*rows = table.find_all('tr')
@@ -21,55 +23,96 @@ def get_proxies(country = 'United States'):
         try:
             ip_address(tr[ip])
-            assert int(port) >= 0 and int(port) < 2**16
+            # Validate the port value itself, not the column index.
+            assert 0 <= int(tr[port]) < 2**16
             if tr[https_support] == "yes" and tr[country_id] == country:
                 proxies.append('{}:{}'.format(tr[ip],tr[port]))
         except (ValueError,AssertionError):
             pass
         except Exception as e:
             print(row)
             raise e
-    return cycle(proxies)
-def get_prices(links):
-    proxies = get_proxies()
-    s = requests_html.HTMLSession()
+    return proxies
+
+# def render_page(link,proxies,ses):
+    # print(link)
+    # bad_proxies = set()
+    # page = None
+    # render_attempts = 0
+    # for proxy in proxies:
+        # print(proxy)
+        # try:
+            # r = ses.get(link,proxies={'http':proxy,'https':proxy})
+            # print('got')
+        # except (requests.exceptions.ProxyError,requests.exceptions.SSLError):
+            # print('!g!'+proxy)
+            # bad_proxies.add(proxy)
+            # continue
+        # if render_attempts < 3:
+            # render_attempts += 1
+            # try:
+                # r.html.render(timeout=10, sleep=10)
+                # print('rendered')
+            # except requests_html.MaxRetries:
+                # print('!r!'+proxy)
+                # bad_proxies.add(proxy)
+                # continue
+        # page = r.html.raw_html
+        # break
+    # if page:
+        # return page,{proxy},bad_proxies
+    # else:
+        # raise Exception("All proxies used up")
+def get_prices(links,no_reuse = True,use_proxies = True):
+    # WIP: fetch pages with Chrome via get_link instead of requests_html.
+    # get_link takes a single proxy string, so the whole batch shares the
+    # first scraped proxy for now; no_reuse is not implemented yet.
+    if use_proxies:
+        proxies = get_proxies() + get_proxies('https://www.us-proxy.org/')
+        pages = get_link(list(links),proxy=proxies[0])
+    else:
+        pages = get_link(list(links))
+    return [price_finder(link,bs=BS(page,'lxml'))
+            for link,page in pages.items()]
+
+def get_prices_old(links,no_reuse = True,use_proxies=True):
+    # Legacy requests_html path; needs render_page() above re-enabled to run.
+    if use_proxies:
+        proxies = set(get_proxies() + get_proxies('https://www.us-proxy.org/'))
+        prev = set()
+        bad_proxies_set = set()
+    ses = requests_html.HTMLSession()
     ret = []
-    bad_proxies= set()
     for link in links:
-        page = None
-        render_tries = 0
-        print(link)
-        while not page:
-            proxy = next(proxies)
-            while proxy in bad_proxies:
-                proxy = next(proxies)
-            print(proxy)
-            try:
-                r = s.get(link,proxies={'http':proxy,'https':proxy})
-                print('got')
-                try:
-                    render_tries += 1
-                    r.html.render()
-                    print('rendered')
-                except requests_html.MaxRetries:
-                    if render_tries > 2:
-                        pass
-                    else:
-                        print('!'+proxy)
-                        bad_proxies.update([proxy])
-                    continue
-                page = r.html.raw_html
-                ret.append(price_finder(link,bs=BS(page,'lxml')))
-
-            except (requests.exceptions.ProxyError,requests.exceptions.SSLError):
-                print('!'+proxy)
-                bad_proxies.update([proxy])
-
-    print(bad_proxies)
-    s.close()
+        if use_proxies:
+            if no_reuse:
+                working_set = proxies-prev
+            else:
+                working_set = proxies
+            page,prev,bad_proxies = render_page(link,working_set,ses)
+        else:
+            r = ses.get(link)
+            r.html.render()
+            page = r.html.raw_html
+
+        ret.append(price_finder(link,bs=BS(page,'lxml')))
+        if use_proxies:
+            bad_proxies_set |= bad_proxies
+            proxies -= bad_proxies
+    if use_proxies:
+        print(bad_proxies_set)
+    ses.close()
     return ret
-if __name__ == "__main__":
-    import saveto
-    import random
-    ql = saveto.load('quad_links')
-    random.shuffle(ql)
-    products = get_prices(ql)
\ No newline at end of file
+
+# if __name__ == "__main__":
+    # ses = requests_html.HTMLSession()
+    # proxies = get_proxies('https://www.us-proxy.org/')
+    # page = render_page('https://www.banggood.com/Aomway-Commander-Goggles-V1-2D-3D-40CH-5_8G-FPV-Video-Headset-Support-HDMI-DVR-Headtracker-p-1107684.html?cur_warehouse=CN',
+                       # proxies,
+                       # ses)
+
+    # import saveto
+    # import random
+    # ql = saveto.load('quad_links')
+    # random.shuffle(ql)
+    # products = get_prices(ql)
+    # pass
\ No newline at end of file
diff --git a/get_link.py b/get_link.py
new file mode 100644
index 0000000..8f71b0f
--- /dev/null
+++ b/get_link.py
@@ -0,0 +1,42 @@
+import pyppeteer
+import asyncio
+async def _get_link(browser,link):
+    # Reuse the tab Chrome opens at launch rather than spawning a new one.
+    pages = await browser.pages()
+    page = pages[0]
+    await page.goto(link)
+    webpage = None
+    for i in range(20):
+        try:
+            webpage = await page.content()
+            break
+        except Exception:
+            # content() can fail mid-navigation; retry without blocking the loop
+            await asyncio.sleep(1)
+    return webpage
+
+async def _single_link(browser,link):
+    webpage = await _get_link(browser,link)
+    await browser.close()
+    return webpage
+
+async def _multi_link(browser,links):
+    results = {}
+    for link in links:
+        results[link] = await _get_link(browser,link)
+    await browser.close()
+    return results
+
+def get_link(links,headless = False,proxy = None):
+    loop = asyncio.get_event_loop()
+    run = loop.run_until_complete
+    opts = {
+        'headless':headless,
+    }
+    if proxy:
+        opts['args'] = ['--proxy-server={}'.format(proxy)]
+    browser = run(pyppeteer.launch(**opts))
+    if isinstance(links,list):
+        result = run(_multi_link(browser,links))
+    else:
+        result = run(_single_link(browser,links))
+    return result
diff --git a/price_finder.py b/price_finder.py
index 7eca6bc..e27f0dd 100644
--- a/price_finder.py
+++ b/price_finder.py
@@ -4,6 +4,8 @@ from bs4 import BeautifulSoup as BS
 from requests_html import HTMLSession
 import re
 import datetime
+# import pytz
+import copy
 
 user_agent = UserAgent().chrome
 debug = None
@@ -74,4 +76,6 @@ class price_finder:
             "product_name":get_words(funcs["name"](self.bs),self.word_len),
             "price":funcs["price"](self.bs).replace("$",""),
         }
-
+    # def to_json(self):
+        # ret = copy.deepcopy(self.__dict__)
+        # ret['time'] = ret['time'].
\ No newline at end of file
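
Usage note (not part of the patch): a minimal sketch of how get_link looks
like it is meant to be driven, assuming pyppeteer can find a local
Chromium/Chrome install; the URLs and proxy string below are placeholders.

    from get_link import get_link

    # A single URL returns the rendered HTML as a string.
    html = get_link('https://example.com')

    # A list of URLs returns a dict mapping each URL to its HTML; the whole
    # batch shares one browser and, optionally, one proxy.
    pages = get_link(['https://example.com', 'https://example.org'],
                     headless=True,
                     proxy='127.0.0.1:8080')

Each call launches a fresh browser and closes it when the batch finishes, so
proxy rotation has to happen across calls rather than within one.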
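If per-link proxy rotation is still wanted on the Chrome path, one possible
wiring (get_proxies and get_link are from this patch; the loop itself is only
a sketch) is to rotate across get_link calls:

    from itertools import cycle
    from batch_process import get_proxies
    from get_link import get_link

    links = ['https://example.com', 'https://example.org']  # placeholders
    proxy_pool = cycle(get_proxies())
    pages = {link: get_link(link, headless=True, proxy=next(proxy_pool))
             for link in links}

This costs one browser launch per link, but it keeps the one-proxy-per-browser
constraint out of get_link itself.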