
initial commit

xpath
Raphael Roberts, 7 years ago
parent commit 891684741a
  1. .gitignore (4 lines changed)
  2. batch_process.py (167 lines changed)
  3. get_link.py (50 lines changed)
  4. price_finder.py (6 lines changed)
  5. proxy_class.py (21 lines changed)
  6. xpaths.json (23 lines changed)

.gitignore (4 lines changed)

@@ -1,2 +1,4 @@
__pycache__
.gitignore
.gitignore
/uBlock0.chromium
/bg.html

batch_process.py (167 lines changed)

@@ -1,12 +1,15 @@
from price_finder import price_finder,BS
from itertools import cycle
import requests
import requests_html
# import requests_html
import sys
from ipaddress import ip_address
def get_proxies(country = 'United States'):
ses = requests_html.HTMLSession()
r = ses.get('https://free-proxy-list.net/')
page = BS(r.html.raw_html,'lxml')
from get_link import get_link
def get_proxies(link='https://free-proxy-list.net/',country = 'United States'):
## ses = requests_html.HTMLSession()
r = requests.get(link)
page = BS(r.content,'lxml')
table = page.find(id='proxylisttable')
headers,*rows = table.find_all('tr')
headers = list(tag.text.lower() for tag in headers.find_all('th'))
@@ -21,55 +24,133 @@ def get_proxies(country = 'United States'):
try:
ip_address(tr[ip])
assert int(tr[port]) >= 0 and int(tr[port]) < 2**16
if tr[https_support] == "yes" and tr[country_id] == country:
if (tr[https_support] == "yes" or False) and tr[country_id] == country:
proxies.append('{}:{}'.format(tr[ip],tr[port]))
except (ValueError,AssertionError):
pass
except Exception as e:
print(row)
raise e
return cycle(proxies)
def get_prices(links):
proxies = get_proxies()
s = requests_html.HTMLSession()
return proxies
class proxy_iter:
def __init__(self,proxies):
self._proxies = set(proxies)
self.proxies = self._proxies.copy()
self.bad_proxies = set()
# self.used_proxies = {}
def __next__(self):
self.proxies -= self.bad_proxies
if len(self.proxies) == 0:
raise StopIteration
elem = self.proxies.pop()
if len(self.proxies) == 0:
self.proxies = self._proxies.copy()
return elem
def __iter__(self):
return self
def blacklist(self,proxy):
self.bad_proxies.add(proxy)
# def render_page(link,proxies,ses):
# print(link)
# bad_proxies = set()
# page = None
# render_attempts = 0
# for proxy in proxies:
# print(proxy)
# try:
# r = ses.get(link,proxies={'http':proxy,'https':proxy})
# print('got')
# except (requests.exceptions.ProxyError,requests.exceptions.SSLError):
# print('!g!'+proxy)
# bad_proxies.add(proxy)
# continue
# if render_attempts < 3:
# render_attempts += 1
# try:
# r.html.render(timeout=10, sleep=10)
# print('rendered')
# except requests_html.MaxRetries:
# print('!r!'+proxy)
# bad_proxies.add(proxy)
# continue
# page = r.html.raw_html
# break
# if page:
# return page,{proxy},bad_proxies
# else:
# raise Exception("All proxies used up")
def get_prices(links,use_proxies = True):
pages = {}
if use_proxies:
proxies = proxy_iter(get_proxies() + get_proxies('https://www.us-proxy.org/'))
for link in links:
for proxy in proxies:
print(link,proxy)
try:
page = get_link(link,proxy=proxy)
pages[link] = page
break
except Exception as e:
print(type(e),e,file=sys.stderr)
proxies.blacklist(proxy)
if len(links) != len(pages.keys()):
raise Exception('all proxies suck')
else:
pages = get_link(links)
ret = []
bad_proxies= set()
for link in links:
page = None
render_tries = 0
print(link)
while not page:
proxy = next(proxies)
while proxy in bad_proxies:
proxy = next(proxies)
print(proxy)
try:
r = s.get(link,proxies={'http':proxy,'https':proxy})
print('got')
try:
render_tries += 1
r.html.render()
print('rendered')
except requests_html.MaxRetries:
if render_tries > 2:
pass
else:
print('!'+proxy)
bad_proxies.update([proxy])
continue
page = r.html.raw_html
ret.append(price_finder(link,bs=BS(page,'lxml')))
except (requests.exceptions.ProxyError,requests.exceptions.SSLError):
print('!'+proxy)
bad_proxies.update([proxy])
print(bad_proxies)
s.close()
ret.append(price_finder(
link,bs=BS(pages[link],'lxml')
))
return ret
def get_prices_old(links,no_reuse = True,use_proxies=True):
if use_proxies:
proxies = set(get_proxies() + get_proxies('https://www.us-proxy.org/'))
ses = requests_html.HTMLSession()
ret = []
if use_proxies:
prev = set()
if use_proxies:
bad_proxies_set= set()
for link in links:
if use_proxies:
if no_reuse:
working_set = proxies-prev
# if use_proxies:
else:
working_set = proxies
page,prev,bad_proxies = render_page(link,working_set,ses)
else:
r=ses.get(link)
r.html.render()
page = r.html.raw_html
ret.append(price_finder(link,bs=BS(page,'lxml')))
if use_proxies:
bad_proxies_set |= bad_proxies
proxies -= bad_proxies
if use_proxies:
print(bad_proxies_set)
ses.close()
return ret
if __name__ == "__main__":
# ses = requests_html.HTMLSession()
# proxies = get_proxies('https://www.us-proxy.org/')
# page = render_page('https://www.banggood.com/Aomway-Commander-Goggles-V1-2D-3D-40CH-5_8G-FPV-Video-Headset-Support-HDMI-DVR-Headtracker-p-1107684.html?cur_warehouse=CN',
# proxies,
# ses)
import saveto
import random
ql = saveto.load('quad_links')
random.shuffle(ql)
products = get_prices(ql)
products = get_prices(ql,use_proxies=False)
# pass
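
Taken together, the new flow in this file is: get_proxies scrapes the proxylisttable rows into 'ip:port' strings, proxy_iter rotates over them, and get_link fetches each page through the first proxy that works. A minimal sketch of that loop under those assumptions (fetch_all and the example link are illustrative, not names from this commit):

from get_link import get_link
from proxy_class import proxy_iter
from batch_process import get_proxies

def fetch_all(links):
    # combine both proxy sources, as get_prices does above
    rotation = proxy_iter(get_proxies() + get_proxies('https://www.us-proxy.org/'))
    pages = {}
    for link in links:
        for proxy in rotation:
            try:
                pages[link] = get_link(link, proxy=proxy)
                break
            except Exception:
                rotation.blacklist(proxy)  # skip this proxy for all later links
    return pages

pages = fetch_all(['https://www.banggood.com/example-product.html'])  # hypothetical URL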

get_link.py (50 lines changed)

@@ -0,0 +1,50 @@
import pyppeteer
import asyncio
import os
async def _get_link(browser,link):
pages = await browser.pages()
page = pages[0]
await page.goto(link,timeout=60_000)
webpage = None
for i in range(20):
try:
webpage = await page.content()
break
except Exception:
await asyncio.sleep(1)  # content not ready yet; non-blocking sleep (the original time.sleep(1) had no import time)
return webpage
async def _single_link(browser,link):
webpage = await _get_link(browser,link)
await browser.close()
return webpage
async def _multi_link(browser,links):
results = {}
for link in links:
webpage = await _get_link(browser,link)
results[link] = webpage
await browser.close()
return results
def get_link(links,headless = True,proxy = None):
ext = os.path.join(os.path.dirname(__file__),'uBlock0.chromium')
loop = asyncio.get_event_loop()
run = loop.run_until_complete
opts = {
'headless':headless,
}
opts['args'] = [f'--disable-extensions-except={ext}', f'--load-extension={ext}']
if proxy:
opts['args'] += [f'--proxy-server={proxy}']
# print(opts)
browser = run(pyppeteer.launch(**opts))
try:
if isinstance(links,list):
result = run(_multi_link(browser,links))
else:
result = run(_single_link(browser,links))
return result
except Exception as e:
run(browser.close())
raise e
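
Usage of get_link as defined above: a single URL returns one HTML string (or None if page.content() never succeeded), while a list returns a dict keyed by URL; each call launches a fresh Chromium with the bundled uBlock0.chromium extension. The URLs and proxy address below are placeholders:

from get_link import get_link

# single URL -> HTML string
html = get_link('https://example.com/product')

# list of URLs -> {url: html}, all fetched in one browser session
pages = get_link(['https://example.com/a', 'https://example.com/b'],
                 headless=True,
                 proxy='203.0.113.5:8080')  # placeholder proxy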

price_finder.py (6 lines changed)

@@ -4,6 +4,8 @@ from bs4 import BeautifulSoup as BS
from requests_html import HTMLSession
import re
import datetime
# import pytz
import copy
user_agent = UserAgent().chrome
debug = None
@@ -74,4 +76,6 @@ class price_finder:
"product_name":get_words(funcs["name"](self.bs),self.word_len),
"price":funcs["price"](self.bs).replace("$",""),
}
# def to_json(self):
# ret = copy.deepcopy(self.__dict__)
# ret['time'] = ret['time'].
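
The commented-out to_json above breaks off at the time field. One plausible completion, assuming self.time holds a datetime.datetime and self.bs the BeautifulSoup tree (neither serializes to JSON directly), is a guess at intent rather than the author's code:

import copy
import json

def to_json(self):
    ret = copy.deepcopy(self.__dict__)
    ret.pop('bs', None)                    # BeautifulSoup trees are not JSON-serializable (assumption)
    ret['time'] = ret['time'].isoformat()  # datetime -> ISO 8601 string
    return json.dumps(ret)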

proxy_class.py (21 lines changed)

@@ -0,0 +1,21 @@
class proxy_iter:
def __init__(self,proxies):
self._proxies = set(proxies)
self.proxies = self._proxies.copy()
self.bad_proxies = set()
# self.used_proxies = {}
def __next__(self):
self.proxies -= self.bad_proxies
if len(self.proxies) == 0:
raise StopIteration
elem = self.proxies.pop()
if len(self.proxies) == 0:
self.proxies = self._proxies.copy()
return elem
def __iter__(self):
return self
def blacklist(self,proxy):
self.bad_proxies.add(proxy)
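
proxy_iter semantics in brief: it yields proxies indefinitely, refilling the working set from the original set whenever it empties, and raises StopIteration only once every proxy has been blacklisted. A self-contained illustration (the addresses are made up):

import requests
from proxy_class import proxy_iter

rotation = proxy_iter(['203.0.113.5:8080', '198.51.100.7:3128'])  # made-up addresses
working = None
for proxy in rotation:
    try:
        requests.get('https://example.com',
                     proxies={'http': proxy, 'https': proxy}, timeout=5)
        working = proxy
        break
    except requests.exceptions.RequestException:
        rotation.blacklist(proxy)  # never yielded again; the loop ends once all are blacklisted
print(working)  # None if every proxy failed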

xpaths.json (23 lines changed)

@@ -0,0 +1,23 @@
{
"www.banggood.com": {
"name": "//h1[@itemprop='name']",
"price": "//div[@class='now']"
},
"www.gearbest.com": {
"name": "//h1[@class='goodsIntro_title']",
"price": "//span[contains(@class,'goodsIntro_price')]",
"other": "//div[@class='goodsIntro_noticeSubmit']"
},
"www.amazon.com": {
"name": "//span[@id='priceblock_dealprice' or @id='priceblock_ourprice']",
"price": "//span[@id='productTitle']"
},
"www.getfpv.com": {
"name": "//div[@class='product-name']/span",
"price": "//div[@class='price-box']/p[@class='special-price']/span[@class='price'] | //div[@class='price-box']/span[@class='regular-price']/span"
},
"www.dalprops.com": {
"name": "//h1[@itemprop='name']",
"price": "//*[@id='product-price']"
}
}
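
The keys above are hostnames and the values XPath selectors per shop. The hunk for price_finder.py does not show how this file is read, but a minimal lookup sketch using lxml and the URL's hostname might look like this (scrape_fields is a hypothetical helper, not part of this commit):

import json
from urllib.parse import urlparse
from lxml import html

def scrape_fields(url, page_source):
    with open('xpaths.json') as f:
        xpaths = json.load(f)
    site = xpaths[urlparse(url).netloc]  # e.g. 'www.banggood.com'
    tree = html.fromstring(page_source)
    # each value is a list of matching elements; an empty list means no match
    return {field: tree.xpath(xp) for field, xp in site.items()}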