|
|
|
@ -1,11 +1,13 @@ |
|
|
|
from price_finder import price_finder,BS |
|
|
|
from itertools import cycle |
|
|
|
import requests |
|
|
|
import requests_html |
|
|
|
# import requests_html |
|
|
|
from ipaddress import ip_address |
|
|
|
def get_proxies(country = 'United States'): |
|
|
|
from get_link import get_link |
|
|
|
|
|
|
|
def get_proxies(link='https://free-proxy-list.net/',country = 'United States'): |
|
|
|
ses = requests_html.HTMLSession() |
|
|
|
r = ses.get('https://free-proxy-list.net/') |
|
|
|
r = requests.get(link) |
|
|
|
page = BS(r.html.raw_html,'lxml') |
|
|
|
table = page.find(id='proxylisttable') |
|
|
|
headers,*rows = table.find_all('tr') |
|
|
|
@ -21,55 +23,91 @@ def get_proxies(country = 'United States'): |
|
|
|
try: |
|
|
|
ip_address(tr[ip]) |
|
|
|
assert int(port) >= 0 and int(port) < 2**16 |
|
|
|
if tr[https_support] == "yes" and tr[country_id] == country: |
|
|
|
if (tr[https_support] == "yes" or False) and tr[country_id] == country: |
|
|
|
proxies.append('{}:{}'.format(tr[ip],tr[port])) |
|
|
|
except (ValueError,AssertionError): |
|
|
|
pass |
|
|
|
except Exception as e: |
|
|
|
print(row) |
|
|
|
raise e |
|
|
|
return cycle(proxies) |
|
|
|
def get_prices(links): |
|
|
|
proxies = get_proxies() |
|
|
|
s = requests_html.HTMLSession() |
|
|
|
return proxies |
|
|
|
|
|
|
|
# def render_page(link,proxies,ses): |
|
|
|
# print(link) |
|
|
|
# bad_proxies = set() |
|
|
|
# page = None |
|
|
|
# render_attempts = 0 |
|
|
|
# for proxy in proxies: |
|
|
|
# print(proxy) |
|
|
|
# try: |
|
|
|
# r = ses.get(link,proxies={'http':proxy,'https':proxy}) |
|
|
|
# print('got') |
|
|
|
# except (requests.exceptions.ProxyError,requests.exceptions.SSLError): |
|
|
|
# print('!g!'+proxy) |
|
|
|
# bad_proxies.add(proxy) |
|
|
|
# continue |
|
|
|
# if render_attempts < 3: |
|
|
|
# render_attempts += 1 |
|
|
|
# try: |
|
|
|
# r.html.render(timeout=10, sleep=10) |
|
|
|
# print('rendered') |
|
|
|
# except requests_html.MaxRetries: |
|
|
|
# print('!r!'+proxy) |
|
|
|
# bad_proxies.add(proxy) |
|
|
|
# continue |
|
|
|
# page = r.html.raw_html |
|
|
|
# break |
|
|
|
# if page: |
|
|
|
# return page,{proxy},bad_proxies |
|
|
|
# else: |
|
|
|
# raise Exception("All proxies used up") |
|
|
|
def get_prices(links,no_reuse = True,use_proxies = True): |
|
|
|
if use_proxies: |
|
|
|
|
|
|
|
else: |
|
|
|
|
|
|
|
|
|
|
|
def get_prices_old(links,no_reuse = True,use_proxies=True): |
|
|
|
if use_proxies: |
|
|
|
proxies = set(get_proxies() + get_proxies('https://www.us-proxy.org/')) |
|
|
|
ses = requests_html.HTMLSession() |
|
|
|
ret = [] |
|
|
|
bad_proxies= set() |
|
|
|
if use_proxies: |
|
|
|
prev = set() |
|
|
|
if use_proxies: |
|
|
|
bad_proxies_set= set() |
|
|
|
for link in links: |
|
|
|
page = None |
|
|
|
render_tries = 0 |
|
|
|
print(link) |
|
|
|
while not page: |
|
|
|
proxy = next(proxies) |
|
|
|
while proxy in bad_proxies: |
|
|
|
proxy = next(proxies) |
|
|
|
print(proxy) |
|
|
|
try: |
|
|
|
r = s.get(link,proxies={'http':proxy,'https':proxy}) |
|
|
|
print('got') |
|
|
|
try: |
|
|
|
render_tries += 1 |
|
|
|
r.html.render() |
|
|
|
print('rendered') |
|
|
|
except requests_html.MaxRetries: |
|
|
|
if render_tries > 2: |
|
|
|
pass |
|
|
|
else: |
|
|
|
print('!'+proxy) |
|
|
|
bad_proxies.update([proxy]) |
|
|
|
continue |
|
|
|
page = r.html.raw_html |
|
|
|
ret.append(price_finder(link,bs=BS(page,'lxml'))) |
|
|
|
|
|
|
|
except (requests.exceptions.ProxyError,requests.exceptions.SSLError): |
|
|
|
print('!'+proxy) |
|
|
|
bad_proxies.update([proxy]) |
|
|
|
|
|
|
|
print(bad_proxies) |
|
|
|
s.close() |
|
|
|
if use_proxies: |
|
|
|
if no_reuse: |
|
|
|
working_set = proxies-prev |
|
|
|
# if use_proxies: |
|
|
|
else: |
|
|
|
working_set = proxies |
|
|
|
page,prev,bad_proxies = render_page(link,working_set,ses) |
|
|
|
else: |
|
|
|
r=ses.get(link) |
|
|
|
r.html.render() |
|
|
|
page = r.html.raw_html |
|
|
|
|
|
|
|
ret.append(price_finder(link,bs=BS(page,'lxml'))) |
|
|
|
if use_proxies: |
|
|
|
bad_proxies_set |= bad_proxies |
|
|
|
proxies -= bad_proxies |
|
|
|
if use_proxies: |
|
|
|
print(bad_proxies_set) |
|
|
|
ses.close() |
|
|
|
return ret |
|
|
|
if __name__ == "__main__": |
|
|
|
import saveto |
|
|
|
import random |
|
|
|
ql = saveto.load('quad_links') |
|
|
|
random.shuffle(ql) |
|
|
|
products = get_prices(ql) |
|
|
|
|
|
|
|
# if __name__ == "__main__": |
|
|
|
# ses = requests_html.HTMLSession() |
|
|
|
# proxies = get_proxies('https://www.us-proxy.org/') |
|
|
|
# page = render_page('https://www.banggood.com/Aomway-Commander-Goggles-V1-2D-3D-40CH-5_8G-FPV-Video-Headset-Support-HDMI-DVR-Headtracker-p-1107684.html?cur_warehouse=CN', |
|
|
|
# proxies, |
|
|
|
# ses) |
|
|
|
|
|
|
|
# import saveto |
|
|
|
# import random |
|
|
|
# ql = saveto.load('quad_links') |
|
|
|
# random.shuffle(ql) |
|
|
|
# products = get_prices(ql) |
|
|
|
# pass |