from price_finder import ParseResult
from lxml import etree
from bs4 import BeautifulSoup as BS
from itertools import cycle
import requests
from urllib.parse import urlparse
# import requests_html
import sys
from ipaddress import ip_address
from get_link import get_link
import json

with open('xpaths.json') as file:
    xpaths_data = json.load(file)

parser = etree.HTMLParser()


def text2tree(text):
    return etree.fromstring(text, parser)

def get_proxies(link='https://free-proxy-list.net/', country='United States'):
    # Scrape a free proxy listing page and return "ip:port" strings for
    # HTTPS-capable proxies located in the requested country.
    ## ses = requests_html.HTMLSession()
    r = requests.get(link)
    page = BS(r.content, 'lxml')
    table = page.find(id='proxylisttable')
    headers, *rows = table.find_all('tr')
    headers = list(tag.text.lower() for tag in headers.find_all('th'))
    ip, port = headers.index('ip address'), headers.index('port')
    https_support = headers.index('https')
    country_id = headers.index('country')
    proxies = []
    for row in rows:
        if row.find('td'):
            tr = list(tag.text for tag in row.find_all('td'))
            try:
                try:
                    # Validate the IP and port columns before accepting the row.
                    ip_address(tr[ip])
                    assert 0 <= int(tr[port]) < 2**16
                    if tr[https_support] == "yes" and tr[country_id] == country:
                        proxies.append('{}:{}'.format(tr[ip], tr[port]))
                except (ValueError, AssertionError):
                    # Malformed IP or port: skip this row.
                    pass
            except Exception as e:
                # Unexpected failure: show the offending row, then re-raise.
                print(row)
                raise e
    return proxies

class proxy_iter:
    """Iterator that cycles over a pool of proxies, skipping blacklisted ones."""

    def __init__(self, proxies):
        self._proxies = set(proxies)
        self.proxies = self._proxies.copy()
        self.bad_proxies = set()
        # self.used_proxies = {}

    def __next__(self):
        # Drop blacklisted proxies; stop only when none are left to try.
        self.proxies -= self.bad_proxies
        if len(self.proxies) == 0:
            raise StopIteration
        elem = self.proxies.pop()
        if len(self.proxies) == 0:
            # Refill the pool so iteration keeps cycling over the proxies.
            self.proxies = self._proxies.copy()
        return elem

    def __iter__(self):
        return self

    def blacklist(self, proxy):
        self.bad_proxies.add(proxy)

def get_prices(links, use_proxies=True):
    pages = {}
    # Look up the XPath configuration for each link's host in xpaths.json.
    xpaths = {link: xpaths_data[urlparse(link).netloc] for link in links}
    # print(xpaths)
    if use_proxies:
        proxies = proxy_iter(get_proxies() + get_proxies('https://www.us-proxy.org/'))
        for link in links:
            for proxy in proxies:
                print(link, proxy)
                try:
                    page = get_link(link, xpaths, proxy=proxy)
                    pages[link] = page
                    break
                except Exception as e:
                    # Report the failure and rotate to the next proxy.
                    print(type(e), e, file=sys.stderr)
                    proxies.blacklist(proxy)
        if len(links) != len(pages):
            raise Exception('all proxies failed; could not fetch every link')
    else:
        pages = get_link(links, xpaths)
    ret = []
    for link in links:
        tree = text2tree(pages[link])
        ret.append(ParseResult(link, tree))
    return ret
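

# Usage sketch (illustrative only): assumes xpaths.json contains an entry for
# the example host below and that get_link/ParseResult behave as used above;
# the URL is hypothetical, not part of this module.
# if __name__ == '__main__':
#     for result in get_prices(['https://www.example.com/item/123'], use_proxies=False):
#         print(result)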