|
|
from price_finder import price_finder,BSfrom itertools import cycleimport requests# import requests_htmlfrom ipaddress import ip_addressfrom get_link import get_link
def get_proxies(link='https://free-proxy-list.net/', country='United States'):
    """Scrape a proxy-list page and return the usable proxies for one country.

    The page at ``link`` is fetched with ``requests`` and the element with
    id ``proxylisttable`` is parsed as a table whose header row names (at
    least) the columns ``ip address``, ``port``, ``https`` and ``country``.

    Args:
        link: URL of the proxy-list page to scrape.
        country: Exact text of the ``country`` column a row must have to be
            kept.

    Returns:
        A list of ``"ip:port"`` strings, one per row that has a syntactically
        valid IP address, a port in ``0..65535``, ``"yes"`` in the ``https``
        column and the requested country.

    Raises:
        ValueError: If one of the expected column headers is missing
            (raised by ``list.index``).
    """
    r = requests.get(link)
    # A plain requests.Response has no ``.html`` attribute (that belongs to
    # requests_html); the raw body bytes are available as ``.content``.
    # NOTE(review): BS is imported from price_finder and is presumably
    # BeautifulSoup, which accepts bytes -- confirm.
    page = BS(r.content, 'lxml')
    table = page.find(id='proxylisttable')
    header_row, *rows = table.find_all('tr')
    headers = [tag.text.lower() for tag in header_row.find_all('th')]
    ip, port = headers.index('ip address'), headers.index('port')
    https_support = headers.index('https')
    country_id = headers.index('country')

    proxies = []
    for row in rows:
        # Rows without any <td> (e.g. a footer made of <th>) carry no data.
        if not row.find('td'):
            continue
        tr = [tag.text for tag in row.find_all('td')]
        try:
            # ip_address() raises ValueError on a malformed address.
            ip_address(tr[ip])
            # Validate the row's port *value*, not the column index.
            if not 0 <= int(tr[port]) < 2**16:
                continue
            if tr[https_support] == "yes" and tr[country_id] == country:
                proxies.append('{}:{}'.format(tr[ip], tr[port]))
        except ValueError:
            # Malformed IP or non-numeric port: skip this row.
            continue
        except Exception:
            # Anything else is unexpected; show the offending row, then
            # propagate the error unchanged.
            print(row)
            raise
    return proxies
# def render_page(link,proxies,ses):
#     print(link)
#     bad_proxies = set()
#     page = None
#     render_attempts = 0
#     for proxy in proxies:
#         print(proxy)
#         try:
#             r = ses.get(link,proxies={'http':proxy,'https':proxy})
#             print('got')
#         except (requests.exceptions.ProxyError,requests.exceptions.SSLError):
#             print('!g!'+proxy)
#             bad_proxies.add(proxy)
#             continue
#         if render_attempts < 3:
#             render_attempts += 1
#             try:
#                 r.html.render(timeout=10, sleep=10)
#                 print('rendered')
#             except requests_html.MaxRetries:
#                 print('!r!'+proxy)
#                 bad_proxies.add(proxy)
#                 continue
#             page = r.html.raw_html
#             break
#     if page:
#         return page,{proxy},bad_proxies
#     else:
#         raise Exception("All proxies used up")


def get_prices(links, no_reuse=True, use_proxies=True):
    """Placeholder for the price fetcher; not implemented yet.

    The original body was an ``if use_proxies: ... else: ...`` skeleton with
    both branches empty, which is a syntax error and prevented the module
    from being imported at all. Until the real implementation is written the
    function fails loudly instead.

    Args:
        links: Iterable of product links to price (currently unused).
        no_reuse: Currently unused.
        use_proxies: Currently unused.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError(
        "get_prices is not implemented yet; see get_prices_old"
    )
def get_prices_old(links, no_reuse=True, use_proxies=True):
    """Render each link in a browser-backed session and extract its price.

    NOTE(review): this function references ``requests_html`` and
    ``render_page``, both of which are commented out in this file; it cannot
    run until they are restored.

    Args:
        links: Iterable of page URLs to process.
        no_reuse: When proxies are used, exclude the proxy set returned by
            the previous ``render_page`` call from the candidates for the
            next link.
        use_proxies: Fetch through scraped proxies (via ``render_page``)
            instead of a direct ``ses.get``.

    Returns:
        A list with one ``price_finder(link, bs=...)`` result per link, in
        input order.
    """
    if use_proxies:
        # Pool of candidates merged from both proxy-list sites.
        proxies = set(get_proxies() + get_proxies('https://www.us-proxy.org/'))
        prev = set()             # proxies used for the previous link
        bad_proxies_set = set()  # every proxy that failed, for the final report

    ses = requests_html.HTMLSession()
    ret = []
    try:
        for link in links:
            if use_proxies:
                working_set = proxies - prev if no_reuse else proxies
                # render_page is expected to return
                # (page, proxies_used, proxies_that_failed).
                page, prev, bad_proxies = render_page(link, working_set, ses)
            else:
                r = ses.get(link)
                r.html.render()
                page = r.html.raw_html

            ret.append(price_finder(link, bs=BS(page, 'lxml')))

            if use_proxies:
                bad_proxies_set |= bad_proxies
                # Never retry a proxy that has already failed.
                proxies -= bad_proxies

        if use_proxies:
            print(bad_proxies_set)
    finally:
        # Always release the session, even when a fetch or parse step raises.
        ses.close()
    return ret
# if __name__ == "__main__": # ses = requests_html.HTMLSession() # proxies = get_proxies('https://www.us-proxy.org/') # page = render_page('https://www.banggood.com/Aomway-Commander-Goggles-V1-2D-3D-40CH-5_8G-FPV-Video-Headset-Support-HDMI-DVR-Headtracker-p-1107684.html?cur_warehouse=CN', # proxies, # ses) # import saveto # import random # ql = saveto.load('quad_links') # random.shuffle(ql) # products = get_prices(ql) # pass
|