You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

75 lines
2.6 KiB

from price_finder import price_finder,BS
from itertools import cycle
import requests
import requests_html
from ipaddress import ip_address
def get_proxies(country='United States'):
    """Scrape free-proxy-list.net and return an endless iterator of proxies.

    A table row is kept only when its IP address parses, its port is an
    integer in the range 0..65535, its "https" column reads "yes" and its
    "country" column equals *country*.

    Args:
        country: Exact text the row's "country" column must match.

    Returns:
        An ``itertools.cycle`` over ``'ip:port'`` strings.  When no row
        qualifies the cycle is empty and ``next()`` on it raises
        ``StopIteration``.

    Raises:
        Exception: Any unexpected error while reading a row (for example a
            row with too few cells) is re-raised after the row is printed.
    """
    ses = requests_html.HTMLSession()
    try:
        r = ses.get('https://free-proxy-list.net/')
        page = BS(r.html.raw_html, 'lxml')
    finally:
        # Release the session's connections even if the request fails.
        ses.close()

    # find() returns None when no element carries this id; the next line
    # then fails with AttributeError.
    table = page.find(id='proxylisttable')
    header_row, *rows = table.find_all('tr')
    headers = [tag.text.lower() for tag in header_row.find_all('th')]

    # Column positions are looked up by header text, not hard-coded.
    ip_col = headers.index('ip address')
    port_col = headers.index('port')
    https_col = headers.index('https')
    country_col = headers.index('country')

    proxies = []
    for row in rows:
        if not row.find('td'):
            continue
        cells = [tag.text for tag in row.find_all('td')]
        try:
            try:
                ip_address(cells[ip_col])
                # Validate the row's port text, not the column index.
                port_ok = 0 <= int(cells[port_col]) < 2 ** 16
            except ValueError:
                # Unparseable address or port: skip this row.
                continue
            if (port_ok
                    and cells[https_col] == "yes"
                    and cells[country_col] == country):
                proxies.append(
                    '{}:{}'.format(cells[ip_col], cells[port_col]))
        except Exception:
            # Show the offending row before propagating the error.
            print(row)
            raise
    return cycle(proxies)
def _next_usable_proxy(proxies, bad_proxies):
    """Return the next proxy from *proxies* that is not in *bad_proxies*.

    *proxies* is expected to be an endless cycle in which each proxy
    appears once per lap, so meeting an already-skipped proxy again means
    a whole lap produced nothing usable.

    Raises:
        RuntimeError: If *proxies* yields nothing at all, or if every proxy
            it yields is already in *bad_proxies*.
    """
    skipped = set()
    for proxy in proxies:
        if proxy not in bad_proxies:
            return proxy
        if proxy in skipped:
            raise RuntimeError('all proxies have been marked bad')
        skipped.add(proxy)
    raise RuntimeError('no proxies available')


def get_prices(links):
    """Fetch every link through rotating proxies and collect the results.

    Each link is requested through the next usable proxy from
    ``get_proxies()``.  The page is then rendered and the resulting HTML is
    parsed with ``BS`` and passed to ``price_finder(link, bs=...)``.  A proxy
    that raises ``ProxyError`` or ``SSLError`` is blacklisted for the rest of
    the call.  The first two render failures for a link also blacklist the
    proxy in use; from the third failure on, the page is used as-is.

    Args:
        links: Iterable of URLs to fetch.

    Returns:
        A list with the ``price_finder`` result of each fetched page, in the
        order the pages were fetched.

    Raises:
        RuntimeError: If there is no proxy to use, or all of them have been
            blacklisted (previously this looped forever).
    """
    proxies = get_proxies()
    s = requests_html.HTMLSession()
    ret = []
    bad_proxies = set()
    try:
        for link in links:
            page = None
            render_tries = 0
            print(link)
            while not page:
                proxy = _next_usable_proxy(proxies, bad_proxies)
                print(proxy)
                try:
                    r = s.get(link, proxies={'http': proxy, 'https': proxy})
                    print('got')
                    try:
                        render_tries += 1
                        r.html.render()
                        print('rendered')
                    except requests_html.MaxRetries:
                        if render_tries <= 2:
                            # Early render failures are blamed on the proxy.
                            print('!' + proxy)
                            bad_proxies.add(proxy)
                            continue
                        # After the third failed render, give up rendering
                        # and fall through with the response as it stands.
                    page = r.html.raw_html
                    ret.append(price_finder(link, bs=BS(page, 'lxml')))
                except (requests.exceptions.ProxyError,
                        requests.exceptions.SSLError):
                    print('!' + proxy)
                    bad_proxies.add(proxy)
        print(bad_proxies)
    finally:
        # Always release the session, even when a request raises.
        s.close()
    return ret
if __name__ == "__main__":
    # Script entry point: these imports are only needed when run directly.
    import random
    import saveto

    # Load the value stored under 'quad_links' (presumably a list of URLs;
    # confirm against saveto) and shuffle it in place before fetching.
    ql = saveto.load('quad_links')
    random.shuffle(ql)

    # Fetch every link through the proxy pool and keep the results.
    products = get_prices(ql)