from price_finder import ParseResult
from lxml import etree
from bs4 import BeautifulSoup as BS
import requests
from urllib.parse import urlparse
# import requests_html
import sys
from ipaddress import ip_address
from get_link import get_link
import json

# Site-specific XPath expressions, keyed by domain (netloc).
with open('xpaths.json') as file:
    xpaths_data = json.load(file)

parser = etree.HTMLParser()


def text2tree(text):
    """Parse an HTML string into an lxml element tree."""
    return etree.fromstring(text, parser)


def get_proxies(link='https://free-proxy-list.net/', country='United States'):
    """Scrape a free-proxy listing page and return 'ip:port' strings for
    HTTPS-capable proxies located in the given country."""
    ## ses = requests_html.HTMLSession()
    r = requests.get(link)
    page = BS(r.content, 'lxml')
    table = page.find(id='proxylisttable')
    headers, *rows = table.find_all('tr')
    headers = [tag.text.lower() for tag in headers.find_all('th')]
    ip, port = headers.index('ip address'), headers.index('port')
    https_support = headers.index('https')
    country_id = headers.index('country')
    proxies = []
    for row in rows:
        if row.find('td'):
            cells = [tag.text for tag in row.find_all('td')]
            try:
                # Validate the IP address and port before accepting the row.
                ip_address(cells[ip])
                assert 0 <= int(cells[port]) < 2 ** 16
                if cells[https_support] == 'yes' and cells[country_id] == country:
                    proxies.append('{}:{}'.format(cells[ip], cells[port]))
            except (ValueError, AssertionError):
                # Malformed IP or port: skip this row.
                pass
            except Exception:
                print(row)
                raise
    return proxies


class proxy_iter:
    """Endless iterator over a pool of proxies that skips blacklisted ones.

    Iteration only stops (StopIteration) once every proxy has been blacklisted.
    """

    def __init__(self, proxies):
        self._proxies = set(proxies)
        self.proxies = self._proxies.copy()
        self.bad_proxies = set()
        # self.used_proxies = {}

    def __next__(self):
        self.proxies -= self.bad_proxies
        if len(self.proxies) == 0:
            raise StopIteration
        elem = self.proxies.pop()
        if len(self.proxies) == 0:
            # Pool exhausted: refill it so iteration keeps cycling.
            self.proxies = self._proxies.copy()
        return elem

    def __iter__(self):
        return self

    def blacklist(self, proxy):
        self.bad_proxies.add(proxy)


def get_prices(links, use_proxies=True):
    """Fetch each link (optionally through rotating proxies) and return a list
    of ParseResult objects built from the parsed pages."""
    pages = {}
    xpaths = {link: xpaths_data[urlparse(link).netloc] for link in links}
    # print(xpaths)
    if use_proxies:
        proxies = proxy_iter(get_proxies() + get_proxies('https://www.us-proxy.org/'))
        for link in links:
            for proxy in proxies:
                print(link, proxy)
                try:
                    page = get_link(link, xpaths, proxy=proxy)
                    pages[link] = page
                    break
                except Exception as e:
                    # This proxy failed for the current link; blacklist it and try the next one.
                    print(type(e), e, file=sys.stderr)
                    proxies.blacklist(proxy)
        if len(pages) != len(links):
            raise Exception('all proxies suck')
    else:
        pages = get_link(links, xpaths)
    ret = []
    for link in links:
        tree = text2tree(pages[link])
        ret.append(ParseResult(link, tree))
    return ret
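

if __name__ == '__main__':
    # Hypothetical usage sketch (an assumption, not part of the original module):
    # the URL below is a placeholder and only works if xpaths.json contains an
    # entry for its netloc and the target site is reachable.
    example_links = ['https://www.example.com/product/123']
    for result in get_prices(example_links, use_proxies=False):
        print(result)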