"""Fetch product prices through a rotating pool of free HTTPS proxies."""
from ipaddress import ip_address
from itertools import cycle

import requests
import requests_html

from price_finder import price_finder, BS


def _scrape_proxies(country='United States'):
    """Return a list of ``'ip:port'`` strings scraped from free-proxy-list.net.

    Only rows that advertise HTTPS support, whose country column equals
    *country*, and whose address and port are well-formed are kept.

    Any unexpected error while processing a row prints that row and then
    re-raises, so the offending markup is visible in the output.
    """
    # The session is only needed for this single request; close it promptly.
    with requests_html.HTMLSession() as ses:
        r = ses.get('https://free-proxy-list.net/')
        page = BS(r.html.raw_html, 'lxml')

    table = page.find(id='proxylisttable')
    headers, *rows = table.find_all('tr')
    headers = [tag.text.lower() for tag in headers.find_all('th')]

    # Column positions are looked up by header text rather than hard-coded.
    ip, port = headers.index('ip address'), headers.index('port')
    https_support = headers.index('https')
    country_id = headers.index('country')

    proxies = []
    for row in rows:
        if not row.find('td'):
            continue
        tr = [tag.text for tag in row.find_all('td')]
        try:
            try:
                ip_address(tr[ip])
                # Validate the row's port *value*; ``port`` itself is only
                # the column index and says nothing about the data.
                port_number = int(tr[port])
            except ValueError:
                # Malformed address or non-numeric port: skip this row.
                continue
            if not 0 <= port_number < 2 ** 16:
                continue
            if tr[https_support] == "yes" and tr[country_id] == country:
                proxies.append('{}:{}'.format(tr[ip], tr[port]))
        except Exception:
            print(row)
            raise
    return proxies


def get_proxies(country='United States'):
    """Return an endless iterator over HTTPS proxies located in *country*.

    The result is an ``itertools.cycle`` over ``'ip:port'`` strings. If no
    proxy matched, the cycle is empty and ``next()`` on it raises
    ``StopIteration``.
    """
    return cycle(_scrape_proxies(country))


def get_prices(links):
    """Fetch every URL in *links* through a proxy and extract its price.

    Proxies are tried in rotation. A proxy that fails with a proxy/SSL
    error, or on which JavaScript rendering fails during the first two
    render attempts for a link, is blacklisted for the rest of the call.
    From the third render attempt on, the un-rendered page is used instead.

    Returns:
        A list with the ``price_finder(link, bs=...)`` result for each page
        that was fetched.

    Raises:
        RuntimeError: if a proxy is needed but none is available, either
            because none were scraped or because all have been blacklisted.
    """
    candidates = _scrape_proxies()
    # Blacklisted proxies always come from ``candidates``, so comparing
    # against the number of distinct candidates detects exhaustion.
    pool_size = len(set(candidates))
    proxies = cycle(candidates)

    ret = []
    bad_proxies = set()
    with requests_html.HTMLSession() as s:
        for link in links:
            page = None
            render_tries = 0
            print(link)
            while not page:
                if len(bad_proxies) >= pool_size:
                    # Without this guard the skip loop below would spin
                    # forever (or ``next`` would raise StopIteration on an
                    # empty pool).
                    raise RuntimeError(
                        'no usable proxies left ({} tried)'.format(pool_size))
                proxy = next(proxies)
                while proxy in bad_proxies:
                    proxy = next(proxies)
                print(proxy)
                try:
                    r = s.get(link, proxies={'http': proxy, 'https': proxy})
                    print('got')
                    try:
                        render_tries += 1
                        r.html.render()
                        print('rendered')
                    except requests_html.MaxRetries:
                        if render_tries <= 2:
                            # Early render failures are blamed on the proxy.
                            print('!' + proxy)
                            bad_proxies.add(proxy)
                            continue
                        # Third or later attempt: fall through and use the
                        # page as fetched, without rendering.
                    page = r.html.raw_html
                    ret.append(price_finder(link, bs=BS(page, 'lxml')))
                except (requests.exceptions.ProxyError,
                        requests.exceptions.SSLError):
                    print('!' + proxy)
                    bad_proxies.add(proxy)
    print(bad_proxies)
    return ret


if __name__ == "__main__":
    import saveto
    import random

    ql = saveto.load('quad_links')
    random.shuffle(ql)
    products = get_prices(ql)