from price_finder import ParseResult
from lxml import etree
from bs4 import BeautifulSoup as BS
import requests
from urllib.parse import urlparse
# import requests_html
import sys
from ipaddress import ip_address
from get_link import get_link
import json
# Map of site hostname -> XPath expressions used to locate price elements.
with open('xpaths.json') as file:
    xpaths_data = json.load(file)

parser = etree.HTMLParser()

def text2tree(text):
    """Parse an HTML string into an lxml element tree."""
    return etree.fromstring(text, parser)
def get_proxies(link='https://free-proxy-list.net/', country='United States'):
    """Scrape a free proxy list and return 'ip:port' strings for
    HTTPS-capable proxies in the given country."""
##    ses = requests_html.HTMLSession()
    r = requests.get(link)
    page = BS(r.content, 'lxml')
    table = page.find(id='proxylisttable')
    headers, *rows = table.find_all('tr')
    headers = list(tag.text.lower() for tag in headers.find_all('th'))
    # Column indices, looked up by header text so column order doesn't matter.
    ip, port = headers.index('ip address'), headers.index('port')
    https_support = headers.index('https')
    country_id = headers.index('country')
    proxies = []
    for row in rows:
        if row.find('td'):
            tr = list(tag.text for tag in row.find_all('td'))
            try:
                try:
                    # Validate the address and port before accepting the row.
                    ip_address(tr[ip])
                    assert 0 <= int(tr[port]) < 2**16
                    if tr[https_support] == 'yes' and tr[country_id] == country:
                        proxies.append('{}:{}'.format(tr[ip], tr[port]))
                except (ValueError, AssertionError):
                    pass  # malformed row; skip it
            except Exception as e:
                print(row)
                raise e
    return proxies
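# Usage sketch for get_proxies (results depend on the live proxy list at
# request time; 'Germany' below is just a hypothetical filter value):
#
#   us_proxies = get_proxies()
#   de_proxies = get_proxies(country='Germany')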
class proxy_iter:
    """Endless iterator over a pool of proxies that skips blacklisted ones.

    The working set refills from the original pool once it is exhausted,
    so iteration only stops when every proxy has been blacklisted.
    """

    def __init__(self, proxies):
        self._proxies = set(proxies)
        self.proxies = self._proxies.copy()
        self.bad_proxies = set()
        # self.used_proxies = {}

    def __next__(self):
        self.proxies -= self.bad_proxies
        if len(self.proxies) == 0:
            raise StopIteration
        elem = self.proxies.pop()
        if len(self.proxies) == 0:
            # Working set exhausted: refill from the original pool so the
            # next pass can retry proxies that weren't blacklisted.
            self.proxies = self._proxies.copy()
        return elem

    def __iter__(self):
        return self

    def blacklist(self, proxy):
        """Permanently exclude a proxy from future iteration."""
        self.bad_proxies.add(proxy)
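# Rotation behavior in brief (hypothetical addresses):
#
#   pool = proxy_iter(['1.2.3.4:8080', '5.6.7.8:3128'])
#   p = next(pool)          # hands out one proxy from the working set
#   pool.blacklist(p)       # p is never yielded again
#   next(pool)              # yields the other proxy; StopIteration is
#                           # raised only once everything is blacklisted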
def get_prices(links, use_proxies=True):
    """Fetch each link (optionally through rotating proxies) and return a
    list of ParseResult objects, one per link."""
    pages = {}
    # Pick the XPath set for each link by its hostname.
    xpaths = {link: xpaths_data[urlparse(link).netloc] for link in links}
    # print(xpaths)
    if use_proxies:
        proxies = proxy_iter(get_proxies() + get_proxies('https://www.us-proxy.org/'))
        for link in links:
            # Try proxies until one succeeds; blacklist the ones that fail.
            for proxy in proxies:
                print(link, proxy)
                try:
                    page = get_link(link, xpaths, proxy=proxy)
                    pages[link] = page
                    break
                except Exception as e:
                    print(type(e), e, file=sys.stderr)
                    proxies.blacklist(proxy)
        if len(links) != len(pages):
            raise Exception('all proxies suck')
    else:
        pages = get_link(links, xpaths)
    ret = []
    for link in links:
        tree = text2tree(pages[link])
        ret.append(ParseResult(link, tree))
    return ret
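# A minimal usage sketch. The URL below is hypothetical: it assumes
# xpaths.json has an entry for its hostname and that get_link/ParseResult
# behave as used above.
if __name__ == '__main__':
    results = get_prices(
        ['https://example.com/product/123'],  # hypothetical product page
        use_proxies=False,  # skip the proxy pool for a quick local test
    )
    for result in results:
        print(result)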