from price_finder import ParseResult
from lxml import etree
from bs4 import BeautifulSoup as BS
from itertools import cycle
import requests
from urllib.parse import urlparse
# import requests_html
import sys
from ipaddress import ip_address
from get_link import get_link
import json

with open('xpaths.json') as file:
    xpaths_data = json.load(file)

parser = etree.HTMLParser()


def text2tree(text):
    return etree.fromstring(text, parser)

def get_proxies(link='https://free-proxy-list.net/', country='United States'):
    # Scrape a free proxy listing page and return "ip:port" strings for
    # HTTPS-capable proxies located in the requested country.
    ## ses = requests_html.HTMLSession()
    r = requests.get(link)
    page = BS(r.content, 'lxml')
    table = page.find(id='proxylisttable')
    headers, *rows = table.find_all('tr')
    headers = list(tag.text.lower() for tag in headers.find_all('th'))
    ip, port = headers.index('ip address'), headers.index('port')
    https_support = headers.index('https')
    country_id = headers.index('country')
    proxies = []
    for row in rows:
        if row.find('td'):
            tr = list(tag.text for tag in row.find_all('td'))
            try:
                try:
                    # Validate the IP and port columns before accepting the row.
                    ip_address(tr[ip])
                    assert 0 <= int(tr[port]) < 2**16
                    if tr[https_support] == "yes" and tr[country_id] == country:
                        proxies.append('{}:{}'.format(tr[ip], tr[port]))
                except (ValueError, AssertionError):
                    # Malformed IP or port: skip this row.
                    pass
            except Exception as e:
                # Unexpected failure: show the offending row, then re-raise.
                print(row)
                raise e
    return proxies

class proxy_iter:
    """Iterator that cycles over a pool of proxies, skipping blacklisted ones."""

    def __init__(self, proxies):
        self._proxies = set(proxies)
        self.proxies = self._proxies.copy()
        self.bad_proxies = set()
        # self.used_proxies = {}

    def __next__(self):
        # Drop blacklisted proxies; stop only when none are left to try.
        self.proxies -= self.bad_proxies
        if len(self.proxies) == 0:
            raise StopIteration
        elem = self.proxies.pop()
        if len(self.proxies) == 0:
            # Refill the pool so iteration keeps cycling over the proxies.
            self.proxies = self._proxies.copy()
        return elem

    def __iter__(self):
        return self

    def blacklist(self, proxy):
        self.bad_proxies.add(proxy)

def get_prices(links, use_proxies=True):
    pages = {}
    # Look up the XPath configuration for each link's host in xpaths.json.
    xpaths = {link: xpaths_data[urlparse(link).netloc] for link in links}
    # print(xpaths)
    if use_proxies:
        proxies = proxy_iter(get_proxies() + get_proxies('https://www.us-proxy.org/'))
        for link in links:
            for proxy in proxies:
                print(link, proxy)
                try:
                    page = get_link(link, xpaths, proxy=proxy)
                    pages[link] = page
                    break
                except Exception as e:
                    # Report the failure and rotate to the next proxy.
                    print(type(e), e, file=sys.stderr)
                    proxies.blacklist(proxy)
        if len(links) != len(pages):
            raise Exception('all proxies failed; could not fetch every link')
    else:
        pages = get_link(links, xpaths)
    ret = []
    for link in links:
        tree = text2tree(pages[link])
        ret.append(ParseResult(link, tree))
    return ret
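

# Usage sketch (illustrative only): assumes xpaths.json contains an entry for
# the example host below and that get_link/ParseResult behave as used above;
# the URL is hypothetical, not part of this module.
# if __name__ == '__main__':
#     for result in get_prices(['https://www.example.com/item/123'], use_proxies=False):
#         print(result)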