|
|
from price_finder import price_finder,BSfrom itertools import cycleimport requests# import requests_htmlimport sysfrom ipaddress import ip_addressfrom get_link import get_link
def get_proxies(link='https://free-proxy-list.net/', country='United States'):
    """Scrape a free-proxy listing page and return usable HTTPS proxies.

    Parameters
    ----------
    link : str
        URL of a proxy-list site whose proxy table has id ``proxylisttable``.
    country : str
        Only rows whose 'country' column equals this value are kept.

    Returns
    -------
    list[str]
        Proxies formatted as ``"ip:port"``.

    Raises
    ------
    Exception
        Re-raises anything unexpected while parsing a row (the offending
        row is printed first to aid debugging).
    """
    r = requests.get(link)
    page = BS(r.content, 'lxml')
    table = page.find(id='proxylisttable')
    # First <tr> is the header row; the rest are proxy entries.
    headers, *rows = table.find_all('tr')
    headers = [tag.text.lower() for tag in headers.find_all('th')]
    ip = headers.index('ip address')
    port = headers.index('port')
    https_support = headers.index('https')
    country_id = headers.index('country')
    proxies = []
    for row in rows:
        if not row.find('td'):
            continue  # skip non-data rows (e.g. table footer)
        tr = [tag.text for tag in row.find_all('td')]
        try:
            try:
                ip_address(tr[ip])  # raises ValueError on a malformed address
                # BUG FIX: the original asserted on `port` (the *column
                # index*) instead of the scraped value `tr[port]`, so bogus
                # ports were never rejected.  Also replaced `assert` with an
                # explicit check so validation survives `python -O`.
                if not 0 <= int(tr[port]) < 2 ** 16:
                    raise ValueError('port out of range: ' + tr[port])
                if tr[https_support] == "yes" and tr[country_id] == country:
                    proxies.append('{}:{}'.format(tr[ip], tr[port]))
            except (ValueError, AssertionError):
                pass  # malformed ip/port: silently skip this row
        except Exception as e:
            print(row)
            raise e
    return proxies
class proxy_iter:
    """Endless rotating iterator over a proxy pool with blacklisting.

    Each proxy is yielded once per "round"; when a round is exhausted the
    working set is refilled from the original pool, so iteration only
    stops (StopIteration) once every proxy has been blacklisted.
    """

    def __init__(self, proxies):
        self._proxies = set(proxies)          # pristine master pool
        self.proxies = self._proxies.copy()   # remainder of the current round
        self.bad_proxies = set()              # proxies reported dead

    def __next__(self):
        # Drop anything blacklisted since the previous call.
        self.proxies -= self.bad_proxies
        if not self.proxies:
            # Every proxy has been blacklisted: nothing left to serve.
            raise StopIteration
        elem = self.proxies.pop()
        if not self.proxies:
            # Round finished -- start a new one.  Blacklisted entries are
            # filtered out again at the top of the next call.
            self.proxies = self._proxies.copy()
        return elem

    def __iter__(self):
        return self

    def blacklist(self, proxy):
        """Mark *proxy* as dead so it is never yielded again."""
        self.bad_proxies.add(proxy)


def get_prices(links, use_proxies=True):
    """Fetch each link's page and run ``price_finder`` over it.

    Parameters
    ----------
    links : list[str]
        Product page URLs.
    use_proxies : bool
        When True, rotate through scraped free proxies, blacklisting any
        that fail; when False, fetch all links directly via ``get_link``.

    Returns
    -------
    list
        One ``price_finder`` result per link, in input order.

    Raises
    ------
    Exception
        If proxies are enabled and at least one link could not be fetched
        through any proxy.
    """
    pages = {}
    if use_proxies:
        proxies = proxy_iter(get_proxies() + get_proxies('https://www.us-proxy.org/'))
        for link in links:
            # Try proxies until one succeeds; failures are blacklisted.
            for proxy in proxies:
                print(link, proxy)
                try:
                    pages[link] = get_link(link, proxy=proxy)
                    break
                except Exception as e:
                    # NOTE(review): file=sys.stdout is already the default;
                    # this was probably meant to be sys.stderr -- confirm
                    # before changing.
                    print(type(e), e, file=sys.stdout)
                    proxies.blacklist(proxy)
        if len(links) != len(pages):  # was len(pages.keys()) -- same value, simpler
            raise Exception('all proxies suck')
    else:
        # NOTE(review): relies on get_link accepting a list and returning a
        # link->page mapping when called without a proxy -- verify in get_link.
        pages = get_link(links)
    return [price_finder(link, bs=BS(pages[link], 'lxml')) for link in links]
def get_prices_old(links,no_reuse = True,use_proxies=True):
    # Legacy price fetcher superseded by get_prices().
    #
    # NOTE(review): dead code as the file stands -- `requests_html` is only
    # imported in a commented-out line at the top of the file, and
    # `render_page` exists only inside a commented-out block, so calling
    # this with any arguments raises NameError.  Kept for reference only.
    #
    # links: iterable of product-page URLs.
    # no_reuse: when True, avoid re-using the proxy that served the
    #   previous link (the `prev` set returned by render_page).
    # use_proxies: when False, fetch directly through the HTML session.
    # Returns a list of price_finder results, one per link.
    if use_proxies:
        proxies = set(get_proxies() + get_proxies('https://www.us-proxy.org/'))
    ses = requests_html.HTMLSession()
    ret = []
    if use_proxies:
        prev = set()
    if use_proxies:
        bad_proxies_set= set()
    for link in links:
        if use_proxies:
            if no_reuse:
                # Exclude the proxy used for the previous link.
                working_set = proxies-prev
            # if use_proxies:
            else:
                working_set = proxies
            page,prev,bad_proxies = render_page(link,working_set,ses)
        else:
            r=ses.get(link)
            r.html.render()
            page = r.html.raw_html
        ret.append(price_finder(link,bs=BS(page,'lxml')))
        if use_proxies:
            # Accumulate and drop proxies that render_page reported bad.
            bad_proxies_set |= bad_proxies
            proxies -= bad_proxies
    if use_proxies:
        print(bad_proxies_set)
    ses.close()
    return ret
if __name__ == "__main__":
    # Load the saved list of product links, randomize the fetch order,
    # and extract prices without going through proxies.
    import random

    import saveto

    quad_links = saveto.load('quad_links')
    random.shuffle(quad_links)
    products = get_prices(quad_links, use_proxies=False)
|