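"""Scrape free proxy lists and fetch product pages through them to extract prices.

get_proxies() pulls US HTTPS proxies from free-proxy-list.net / us-proxy.org,
proxy_iter rotates through them while blacklisting dead ones, and get_prices()
uses get_link() plus price_finder() to pull a price out of each page.
"""
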
from price_finder import price_finder, BS
from itertools import cycle
import requests
# import requests_html
import sys
from ipaddress import ip_address
from get_link import get_link


def get_proxies(link='https://free-proxy-list.net/', country='United States'):
    """Scrape a free-proxy-list style table and return 'ip:port' strings for
    HTTPS-capable proxies located in the given country."""
    ## ses = requests_html.HTMLSession()
    r = requests.get(link)
    page = BS(r.content, 'lxml')
    table = page.find(id='proxylisttable')
    headers, *rows = table.find_all('tr')
    headers = list(tag.text.lower() for tag in headers.find_all('th'))
    ip, port = headers.index('ip address'), headers.index('port')
    https_support = headers.index('https')
    country_id = headers.index('country')
    proxies = []
    for row in rows:
        if row.find('td'):
            tr = list(tag.text for tag in row.find_all('td'))
            try:
                try:
                    # Validate the IP and port before accepting the row.
                    ip_address(tr[ip])
                    assert 0 <= int(tr[port]) < 2**16
                    if tr[https_support] == "yes" and tr[country_id] == country:
                        proxies.append('{}:{}'.format(tr[ip], tr[port]))
                except (ValueError, AssertionError):
                    pass
            except Exception as e:
                print(row)
                raise e
    return proxies


class proxy_iter:
    """Iterator over a pool of proxies: skips blacklisted ones, refills from
    the original pool once every proxy has been handed out, and stops once all
    remaining proxies have been blacklisted."""

    def __init__(self, proxies):
        self._proxies = set(proxies)
        self.proxies = self._proxies.copy()
        self.bad_proxies = set()
        # self.used_proxies = {}

    def __next__(self):
        self.proxies -= self.bad_proxies
        if len(self.proxies) == 0:
            raise StopIteration

        elem = self.proxies.pop()
        if len(self.proxies) == 0:
            # Refill from the original pool so iteration can keep cycling.
            self.proxies = self._proxies.copy()
        return elem

    def __iter__(self):
        return self

    def blacklist(self, proxy):
        self.bad_proxies.add(proxy)


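# Legacy requests_html-based page fetcher, kept for reference. It is still
# referenced by get_prices_old() below but is disabled along with the
# requests_html import.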
# def render_page(link, proxies, ses):
#     print(link)
#     bad_proxies = set()
#     page = None
#     render_attempts = 0
#     for proxy in proxies:
#         print(proxy)
#         try:
#             r = ses.get(link, proxies={'http': proxy, 'https': proxy})
#             print('got')
#         except (requests.exceptions.ProxyError, requests.exceptions.SSLError):
#             print('!g!' + proxy)
#             bad_proxies.add(proxy)
#             continue
#         if render_attempts < 3:
#             render_attempts += 1
#             try:
#                 r.html.render(timeout=10, sleep=10)
#                 print('rendered')
#             except requests_html.MaxRetries:
#                 print('!r!' + proxy)
#                 bad_proxies.add(proxy)
#                 continue
#         page = r.html.raw_html
#         break
#     if page:
#         return page, {proxy}, bad_proxies
#     else:
#         raise Exception("All proxies used up")


def get_prices(links, use_proxies=True):
    """Fetch each link (optionally through rotating proxies) and return a list
    of price_finder results."""
    pages = {}
    if use_proxies:
        proxies = proxy_iter(get_proxies() + get_proxies('https://www.us-proxy.org/'))
        for link in links:
            for proxy in proxies:
                print(link, proxy)
                try:
                    page = get_link(link, proxy=proxy)
                    pages[link] = page
                    break
                except Exception as e:
                    # Report the failure and rotate to the next proxy.
                    print(type(e), e, file=sys.stderr)
                    proxies.blacklist(proxy)
        if len(links) != len(pages.keys()):
            raise Exception('all proxies suck')
    else:
        pages = get_link(links)
    ret = []
    for link in links:
        ret.append(price_finder(
            link, bs=BS(pages[link], 'lxml')
        ))
    return ret


def get_prices_old(links, no_reuse=True, use_proxies=True):
    """Older implementation built on requests_html and render_page(); both are
    commented out above, so this is kept only for reference."""
    if use_proxies:
        proxies = set(get_proxies() + get_proxies('https://www.us-proxy.org/'))
    ses = requests_html.HTMLSession()
    ret = []
    if use_proxies:
        prev = set()
    if use_proxies:
        bad_proxies_set = set()
    for link in links:
        if use_proxies:
            if no_reuse:
                working_set = proxies - prev
                # if use_proxies:
            else:
                working_set = proxies
            page, prev, bad_proxies = render_page(link, working_set, ses)
        else:
            r = ses.get(link)
            r.html.render()
            page = r.html.raw_html

        ret.append(price_finder(link, bs=BS(page, 'lxml')))
        if use_proxies:
            bad_proxies_set |= bad_proxies
            proxies -= bad_proxies
    if use_proxies:
        print(bad_proxies_set)
    ses.close()
    return ret


if __name__ == "__main__":
    # ses = requests_html.HTMLSession()
    # proxies = get_proxies('https://www.us-proxy.org/')
    # page = render_page('https://www.banggood.com/Aomway-Commander-Goggles-V1-2D-3D-40CH-5_8G-FPV-Video-Headset-Support-HDMI-DVR-Headtracker-p-1107684.html?cur_warehouse=CN',
    #                    proxies,
    #                    ses)

    import saveto
    import random

    ql = saveto.load('quad_links')
    random.shuffle(ql)
    products = get_prices(ql, use_proxies=False)
    # pass