Browse Source

added get_words func to make finding the words more robust than just re, modifications to get_prices

master
Raphael Roberts 7 years ago
parent
commit
08a01f4a33
  1. 46
      batch_process.py
  2. 22
      price_finder.py

46
batch_process.py

@ -1,9 +1,10 @@
from price_finder import price_finder,BS
from itertools import cycle
from requests_html import HTMLSession
import requests
import requests_html
from ipaddress import ip_address
def get_proxies(country = 'United States'):
ses = HTMLSession()
ses = requests_html.HTMLSession()
r = ses.get('https://free-proxy-list.net/')
page = BS(r.html.raw_html,'lxml')
table = page.find(id='proxylisttable')
@ -28,15 +29,16 @@ def get_proxies(country = 'United States'):
print(row)
raise e
return cycle(proxies)
proxies = get_proxies()
def get_prices(links):
ret = []
s = HTMLSession()
proxies = get_proxies()
s = requests_html.HTMLSession()
ret = []
bad_proxies= set()
for link in links:
page = None
render_tries = 0
print(link)
while True:
while not page:
proxy = next(proxies)
while proxy in bad_proxies:
proxy = next(proxies)
@ -44,14 +46,30 @@ def get_prices(links):
try:
r = s.get(link,proxies={'http':proxy,'https':proxy})
print('got')
r.html.render()
print('rendered')
ret.append(price_finder(link,bs=BS(r.html.raw_html,'lxml')))
break
except Exception as e:
print(e)
try:
render_tries += 1
r.html.render()
print('rendered')
except requests_html.MaxRetries:
if render_tries > 2:
pass
else:
print('!'+proxy)
bad_proxies.update([proxy])
continue
page = r.html.raw_html
ret.append(price_finder(link,bs=BS(page,'lxml')))
except (requests.exceptions.ProxyError,requests.exceptions.SSLError):
print('!'+proxy)
bad_proxies.update([proxy])
pass
print(bad_proxies)
s.close()
return ret
return ret
if __name__ == "__main__":
import saveto
import random
ql = saveto.load('quad_links')
random.shuffle(ql)
products = get_prices(ql)

22
price_finder.py

@ -6,8 +6,13 @@ import re
import datetime
user_agent = UserAgent().chrome
def re_words(n):
    """Compile a regex that matches up to ``n`` space-separated tokens.

    The pattern is anchored only by how it is used (``.match`` starts at the
    beginning of the string): up to ``n - 1`` tokens, each optionally padded
    by a single space on either side, followed by one final token made of
    non-space characters.
    """
    leading_tokens = r"( ?[^ ]+ ?)" + "{0," + str(n - 1) + "}"
    final_token = r"[^ ]+"
    return re.compile(leading_tokens + final_token)
debug = None
# A word is a run of non-whitespace characters that starts and ends on a
# word boundary, so leading/trailing punctuation is trimmed from each token.
# ``\S`` (rather than ``[^ \n]``) makes tabs, carriage returns and other
# Unicode whitespace act as separators too.
_WORD_RE = re.compile(r"\b\S+\b")


def get_words(string, n):
    """Return the first ``n`` words of ``string`` joined by single spaces.

    Words are separated by any whitespace; punctuation at the very start or
    end of a token is dropped, while punctuation inside a token (``X-220``)
    is kept.

    Args:
        string: Text to extract words from.
        n: Maximum number of words to keep. Values of zero or below yield
            an empty string rather than slicing words off the end.

    Returns:
        The selected words joined with a single space; an empty string if
        ``string`` contains no words.
    """
    if n <= 0:
        return ""
    selected = []
    for match in _WORD_RE.finditer(string):
        selected.append(match.group(0))
        # Stop scanning as soon as enough words have been collected.
        if len(selected) == n:
            break
    return " ".join(selected)
def get_page(url):
page = None
while not page:
@ -34,28 +39,29 @@ class price_finder:
"price":lambda page: page.find("meta",attrs = {"itemprop":"price"}).get("content")
},
"www.gearbest.com":{
"name":lambda page:re.sub(' {2,}|\n','',page.find("div",attrs = {"class":"goodsIntro_titleWrap"}).find("h1").text),
"name":lambda page:re.sub(" {2,}|\n","",page.find("div",attrs = {"class":"goodsIntro_titleWrap"}).find("h1").text),
"price":lambda page: page.find("span",attrs={"class":"goodsIntro_price"}).text
},
"hobbyking.com":{
"name":lambda page: page.find("h1",attrs={"class":"product-name"}).text,
"price":lambda page: page.find("span",id = re.compile(r"product-price.*")).find("span",attrs={"class":"price"}).text
},
'www.getfpv.com':{
'name': lambda page: re.sub(r'\\n|\n','', page.find("div",attrs={"class":"product-name"}).text),
'price': lambda page: re.sub(r'\\n|\n','', page.find('span',attrs={'id':re.compile('product-price.*')}).text)
"www.getfpv.com":{
"name": lambda page: re.sub(r"\\n|\n","", page.find("div",attrs={"class":"product-name"}).text),
"price": lambda page: re.sub(r"\\n|\n","", page.find("span",attrs={"id":re.compile("product-price.*")}).text)
}
}
def __init__(self,url,space_seperated_categories = 7,bs=None):
self.url=url
self.info_url = urllib.parse.urlparse(url)
self.word_len = space_seperated_categories
if self.info_url.netloc not in price_finder.page_funcs.keys():
raise NotImplementedError("Not implemented for {}".format(self.info_url.netloc))
if bs:
self.bs= bs
else:
self.bs = get_BS(url)
self.words = re_words(space_seperated_categories)
# self.words = re_words(space_seperated_categories)
self.time = datetime.datetime.today()
self.info_product = self._get_product_info_()
@ -65,9 +71,7 @@ class price_finder:
# print(self.url)
return {
"product_name":self.words.match(
funcs["name"](self.bs)
).group(0),
"product_name":get_words(funcs["name"](self.bs),self.word_len),
"price":funcs["price"](self.bs).replace("$",""),
}
Loading…
Cancel
Save