
Not done yet; more changes will be made, but these are big first steps

xpath
Raphael Roberts committed 7 years ago
commit b0dbd9fa03
  1. .gitignore (4 changed lines)
  2. batch_process.py (102 changed lines)
  3. get_link.py (44 changed lines)
  4. price_finder.py (105 changed lines)
  5. xpaths.json (6 changed lines)

.gitignore (4 changed lines)

@@ -1,4 +1,6 @@
__pycache__
.gitignore
/uBlock0.chromium
/bg.html
/bg.html
/test_this_bullshit.py
/output

batch_process.py (102 changed lines)

@@ -1,10 +1,20 @@
from price_finder import price_finder,BS
from price_finder import ParseResult
from lxml import etree
from bs4 import BeautifulSoup as BS
from itertools import cycle
import requests
from urllib.parse import urlparse
# import requests_html
import sys
from ipaddress import ip_address
from get_link import get_link
import json
with open('xpaths.json') as file:
xpaths_data = json.load(file)
parser = etree.HTMLParser()
def text2tree(text):
return etree.fromstring(text,parser)
def get_proxies(link='https://free-proxy-list.net/',country = 'United States'):
## ses = requests_html.HTMLSession()
@@ -54,44 +64,18 @@ class proxy_iter:
return self
def blacklist(self,proxy):
self.bad_proxies.add(proxy)
# def render_page(link,proxies,ses):
# print(link)
# bad_proxies = set()
# page = None
# render_attempts = 0
# for proxy in proxies:
# print(proxy)
# try:
# r = ses.get(link,proxies={'http':proxy,'https':proxy})
# print('got')
# except (requests.exceptions.ProxyError,requests.exceptions.SSLError):
# print('!g!'+proxy)
# bad_proxies.add(proxy)
# continue
# if render_attempts < 3:
# render_attempts += 1
# try:
# r.html.render(timeout=10, sleep=10)
# print('rendered')
# except requests_html.MaxRetries:
# print('!r!'+proxy)
# bad_proxies.add(proxy)
# continue
# page = r.html.raw_html
# break
# if page:
# return page,{proxy},bad_proxies
# else:
# raise Exception("All proxies used up")
def get_prices(links,use_proxies = True):
pages = {}
xpaths = {link:xpaths_data[urlparse(link).netloc] for link in links}
# print(xpaths)
if use_proxies:
proxies = proxy_iter(get_proxies() + get_proxies('https://www.us-proxy.org/'))
for link in links:
for proxy in proxies:
print(link,proxy)
try:
page = get_link(link,proxy=proxy)
page = get_link(link,xpaths,proxy=proxy)
pages[link] = page
break
except Exception as e:
@@ -100,57 +84,11 @@ def get_prices(links,use_proxies = True):
if len(links) != len(pages.keys()):
raise Exception('all proxies suck')
else:
pages = get_link(links)
pages = get_link(links,xpaths)
ret = []
for link in links:
ret.append(price_finder(
link,bs=BS(pages[link],'lxml')
))
tree = text2tree(pages[link])
ret.append(
ParseResult(link,tree)
)
return ret
def get_prices_old(links,no_reuse = True,use_proxies=True):
if use_proxies:
proxies = set(get_proxies() + get_proxies('https://www.us-proxy.org/'))
ses = requests_html.HTMLSession()
ret = []
if use_proxies:
prev = set()
if use_proxies:
bad_proxies_set= set()
for link in links:
if use_proxies:
if no_reuse:
working_set = proxies-prev
# if use_proxies:
else:
working_set = proxies
page,prev,bad_proxies = render_page(link,working_set,ses)
else:
r=ses.get(link)
r.html.render()
page = r.html.raw_html
ret.append(price_finder(link,bs=BS(page,'lxml')))
if use_proxies:
bad_proxies_set |= bad_proxies
proxies -= bad_proxies
if use_proxies:
print(bad_proxies_set)
ses.close()
return ret
if __name__ == "__main__":
# ses = requests_html.HTMLSession()
# proxies = get_proxies('https://www.us-proxy.org/')
# page = render_page('https://www.banggood.com/Aomway-Commander-Goggles-V1-2D-3D-40CH-5_8G-FPV-Video-Headset-Support-HDMI-DVR-Headtracker-p-1107684.html?cur_warehouse=CN',
# proxies,
# ses)
import saveto
import random
ql = saveto.load('quad_links')
random.shuffle(ql)
products = get_prices(ql,use_proxies=False)
# pass
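
The rewritten get_prices above drops BeautifulSoup in favor of lxml trees and per-site XPaths looked up by hostname. A minimal sketch of the new no-proxy path, assuming an illustrative URL whose host has an entry in xpaths.json:

import json
from urllib.parse import urlparse
from lxml import etree
from get_link import get_link
from price_finder import ParseResult

with open('xpaths.json') as f:
    xpaths_data = json.load(f)

# Illustrative link only; substitute any product URL whose host is in xpaths.json.
links = ['https://hobbyking.com/en_us/example-product.html']
# get_link expects the per-site XPaths keyed by the full link, not the host.
xpaths = {link: xpaths_data[urlparse(link).netloc] for link in links}
pages = get_link(links, xpaths)  # renders each page with pyppeteer, returns {link: html}
parser = etree.HTMLParser()
results = [ParseResult(link, etree.fromstring(pages[link], parser)) for link in links]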

get_link.py (44 changed lines)

@@ -1,50 +1,66 @@
import pyppeteer
import pyppeteer.errors
import asyncio
import os
async def _get_link(browser,link):
async def _get_link(browser,link,xpath):
pages = await browser.pages()
page = pages[0]
await page.goto(link,timeout=60_000)
await page.goto(link,waitUntil='domcontentloaded')
xpath = [xpath['name'],xpath['price']]
for _xpath in xpath:
print(repr(_xpath))
try:
await page.waitForXPath(_xpath)
except pyppeteer.errors.TimeoutError:
pass
await asyncio.sleep(1)
webpage = None
for i in range(20):
try:
webpage = await page.content()
break
except:
time.sleep(1)
await asyncio.sleep(1)
return webpage
async def _single_link(browser,link):
webpage = await _get_link(browser,link)
async def _single_link(browser,link,xpath):
webpage = await _get_link(browser,link,xpath)
await browser.close()
return webpage
async def _multi_link(browser,links):
async def _multi_link(browser,links,xpaths):
results = {}
for link in links:
webpage = await _get_link(browser,link)
xpath = xpaths[link]
webpage = await _get_link(browser,link,xpath)
results[link] = webpage
await browser.close()
return results
def get_link(links,headless = True,proxy = None):
ext = os.path.join(os.path.dirname(__file__),'uBlock0.chromium')
def get_link(links,xpaths,headless = False,proxy = None):
loop = asyncio.get_event_loop()
run = loop.run_until_complete
opts = {
'headless':headless,
}
opts['args'] = [f'--disable-extensions-except={ext}', f'--load-extension={ext}']
if proxy:
opts['args'] += [f'--proxy-server={proxy}']
opts['args'] = [f'--proxy-server={proxy}']
else:
opts['args'] = []
ext = os.path.join(os.path.dirname(__file__),'uBlock0.chromium')
opts['args'] += [f'--disable-extensions-except={ext}', f'--load-extension={ext}']
# print(opts)
browser = run(pyppeteer.launch(**opts))
try:
if isinstance(links,list):
result = run(_multi_link(browser,links))
result = run(_multi_link(browser,links,xpaths))
else:
result = run(_single_link(browser,links))
result = run(_single_link(browser,links,xpaths[links]))
return result
except Exception as e:
run(browser.close())
raise e
raise e
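
get_link now takes the XPath mapping as a required second argument and waits for the name and price XPaths before reading the page content. A hedged usage sketch for the single-link case; the URL and proxy are placeholders, and the XPaths are the hobbyking.com entries from xpaths.json:

from get_link import get_link

link = 'https://hobbyking.com/en_us/example-product.html'  # placeholder URL
xpaths = {link: {  # keyed by the full link, matching what get_prices builds
    'name': "//h1[contains(@class,'product-name')]",
    'price': ("//p[@class='special-price']/span[@class='price']"
              " | //span[@class='regular-price']/span[@class='price']"),
}}
html = get_link(link, xpaths)  # launches Chromium with the bundled uBlock extension
# Optional proxy: forwarded to Chromium as a --proxy-server flag.
html_via_proxy = get_link(link, xpaths, proxy='203.0.113.10:8080')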

price_finder.py (105 changed lines)

@@ -1,81 +1,62 @@
import urllib
from fake_useragent import UserAgent
from bs4 import BeautifulSoup as BS
from requests_html import HTMLSession
import re
import datetime
# import pytz
import copy
import json
with open('xpaths.json') as file:
xpaths = json.load(file)
user_agent = UserAgent().chrome
debug = None
def get_words(string,n):
words = re.finditer(r"(\b[^ \n]+\b)",string)
def get_words(raw,n):
words = re.finditer(r"(\b[^ \n]+\b)",raw)
word_list = list(match.group(0) for match in words)
if len(word_list) > n:
word_list = word_list[:n]
return ' '.join(word_list)
def get_page(url):
page = None
while not page:
page = urllib.request.Request(url,headers = {"User-Agent":user_agent})
page = str(urllib.request.urlopen(page).read())
return page
def get_BS(url):
return BS(get_page(url),"lxml")
def format_price(raw):
return re.search(r'\d+(\.\d+)?',raw).group(0)
class ParseResult:
class price_finder:
page_funcs = {
"www.amazon.com":{
"name":lambda page: re.sub(r"( {2,}|\n|\\n)","",page.find("span",id="productTitle").text),
"price":lambda page: page.find(name = "span",id = re.compile("priceblock.*")).text
},
"www.banggood.com":{
"name":lambda page: page.find("h1",attrs = {"itemprop":"name"}).text,
"price":lambda page: page.find("div",attrs = {"class":"now"}).get("oriprice")
},
"www.dalprops.com":{
"name":lambda page: page.find("h1",attrs = {"class":"product_title"}).text,
"price":lambda page: page.find("meta",attrs = {"itemprop":"price"}).get("content")
},
"www.gearbest.com":{
"name":lambda page:re.sub(" {2,}|\n","",page.find("div",attrs = {"class":"goodsIntro_titleWrap"}).find("h1").text),
"price":lambda page: page.find("span",attrs={"class":"goodsIntro_price"}).text
},
"hobbyking.com":{
"name":lambda page: page.find("h1",attrs={"class":"product-name"}).text,
"price":lambda page: page.find("span",id = re.compile(r"product-price.*")).find("span",attrs={"class":"price"}).text
},
"www.getfpv.com":{
"name": lambda page: re.sub(r"\\n|\n","", page.find("div",attrs={"class":"product-name"}).text),
"price": lambda page: re.sub(r"\\n|\n","", page.find("span",attrs={"id":re.compile("product-price.*")}).text)
}
}
def __init__(self,url,space_seperated_categories = 7,bs=None):
def __init__(self,url,tree,space_seperated_categories = 7,):
self.url=url
self.info_url = urllib.parse.urlparse(url)
self.word_len = space_seperated_categories
if self.info_url.netloc not in price_finder.page_funcs.keys():
raise NotImplementedError("Not implemented for {}".format(self.info_url.netloc))
if bs:
self.bs= bs
else:
self.bs = get_BS(url)
# self.words = re_words(space_seperated_categories)
self.tree = tree
self.time = datetime.datetime.today()
self.info_product = self._get_product_info_()
def _get_product_info_(self):
funcs = price_finder.page_funcs[self.info_url.netloc]
# print(self.url)
host = self.info_url.netloc
product_name = get_words(
self.tree.xpath(xpaths[host]['name'])[0].text, self.word_len
)
other_raw = None
try:
other_raw = self.tree.xpath(xpaths[host]['other'])[0].text
except KeyError:
pass
if host in ['www.gearbest.com']:
if other_raw:
price = '0.00'
else:
price = format_price(
self.tree.xpath(xpaths[host]['price'])[0].text
)
else:
price = format_price(
self.tree.xpath(xpaths[host]['price'])[0].text
)
return {
"product_name":get_words(funcs["name"](self.bs),self.word_len),
"price":funcs["price"](self.bs).replace("$",""),
}
# def to_json(self):
# ret = copy.deepcopy(self.__dict__)
# ret['time'] = ret['time'].
"product_name":product_name,
"price":price,
}
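
Inside the renamed ParseResult, _get_product_info_ now resolves the per-host XPaths from xpaths.json against the lxml tree and strips the currency symbol with format_price. A self-contained sketch of that pattern against a made-up HTML snippet, using the www.dalprops.com entry from xpaths.json:

import json
import re
from lxml import etree

with open('xpaths.json') as f:
    xpaths = json.load(f)

# Fabricated snippet shaped like a dalprops product page, for illustration only.
html = ("<html><body>"
        "<h1 itemprop='name'>Example 5040 Propeller Set</h1>"
        "<span id='product-price'>$4.99</span>"
        "</body></html>")
tree = etree.fromstring(html, etree.HTMLParser())
host = 'www.dalprops.com'
name = tree.xpath(xpaths[host]['name'])[0].text
# Same idea as format_price: keep only the numeric part of the price text.
price = re.search(r'\d+(\.\d+)?', tree.xpath(xpaths[host]['price'])[0].text).group(0)
print(name, price)  # Example 5040 Propeller Set 4.99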

xpaths.json (6 changed lines)

@@ -19,5 +19,9 @@
"www.dalprops.com": {
"name": "//h1[@itemprop='name']",
"price": "//*[@id='product-price']"
}
},
"hobbyking.com": {
"name": "//h1[contains(@class,'product-name')]",
"price": "//p[@class='special-price']/span[@class='price'] | //span[@class='regular-price']/span[@class='price']"
}
}
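
The new hobbyking.com price XPath uses the union operator, so whichever price layout the page happens to use (sale or regular) is matched. A quick lxml check with two fabricated snippets:

from lxml import etree

union_xpath = ("//p[@class='special-price']/span[@class='price']"
               " | //span[@class='regular-price']/span[@class='price']")
on_sale = "<p class='special-price'><span class='price'>$9.99</span></p>"
regular = "<span class='regular-price'><span class='price'>$12.50</span></span>"
for snippet in (on_sale, regular):
    tree = etree.fromstring(snippet, etree.HTMLParser())
    print(tree.xpath(union_xpath)[0].text)  # $9.99, then $12.50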