Browse Source

Added a get_words() function to make word extraction more robust than a bare regex match; also modified get_prices().

master
Raphael Roberts 7 years ago
parent
commit
08a01f4a33
  1. 46
      batch_process.py
  2. 22
      price_finder.py

46
batch_process.py

@ -1,9 +1,10 @@
from price_finder import price_finder,BS from price_finder import price_finder,BS
from itertools import cycle from itertools import cycle
from requests_html import HTMLSession
import requests
import requests_html
from ipaddress import ip_address from ipaddress import ip_address
def get_proxies(country = 'United States'): def get_proxies(country = 'United States'):
ses = HTMLSession()
ses = requests_html.HTMLSession()
r = ses.get('https://free-proxy-list.net/') r = ses.get('https://free-proxy-list.net/')
page = BS(r.html.raw_html,'lxml') page = BS(r.html.raw_html,'lxml')
table = page.find(id='proxylisttable') table = page.find(id='proxylisttable')
@ -28,15 +29,16 @@ def get_proxies(country = 'United States'):
print(row) print(row)
raise e raise e
return cycle(proxies) return cycle(proxies)
proxies = get_proxies()
def get_prices(links): def get_prices(links):
ret = []
s = HTMLSession()
proxies = get_proxies()
s = requests_html.HTMLSession()
ret = [] ret = []
bad_proxies= set() bad_proxies= set()
for link in links: for link in links:
page = None
render_tries = 0
print(link) print(link)
while True:
while not page:
proxy = next(proxies) proxy = next(proxies)
while proxy in bad_proxies: while proxy in bad_proxies:
proxy = next(proxies) proxy = next(proxies)
@ -44,14 +46,30 @@ def get_prices(links):
try: try:
r = s.get(link,proxies={'http':proxy,'https':proxy}) r = s.get(link,proxies={'http':proxy,'https':proxy})
print('got') print('got')
r.html.render()
print('rendered')
ret.append(price_finder(link,bs=BS(r.html.raw_html,'lxml')))
break
except Exception as e:
print(e)
try:
render_tries += 1
r.html.render()
print('rendered')
except requests_html.MaxRetries:
if render_tries > 2:
pass
else:
print('!'+proxy)
bad_proxies.update([proxy])
continue
page = r.html.raw_html
ret.append(price_finder(link,bs=BS(page,'lxml')))
except (requests.exceptions.ProxyError,requests.exceptions.SSLError):
print('!'+proxy) print('!'+proxy)
bad_proxies.update([proxy]) bad_proxies.update([proxy])
pass
print(bad_proxies)
s.close() s.close()
return ret
return ret
if __name__ == "__main__":
    # Script entry point: scrape prices for a saved list of product links.
    # Local imports so library users of this module don't pay for them.
    import saveto
    import random
    # Load the previously saved link list (presumably quadcopter product
    # URLs; key name suggests so — confirm against saveto's store).
    ql = saveto.load('quad_links')
    # Shuffle so repeated runs don't always hammer the same sites first.
    random.shuffle(ql)
    products = get_prices(ql)

22
price_finder.py

@ -6,8 +6,13 @@ import re
import datetime import datetime
user_agent = UserAgent().chrome user_agent = UserAgent().chrome
re_words = lambda n: re.compile(r"( ?[^ ]+ ?)"+"{0,"+str(n-1)+"}"+r"[^ ]+")
debug = None debug = None
def get_words(string, n):
    """Return the first *n* whitespace-delimited words of *string*, joined by spaces.

    Words are extracted with a word-boundary regex, so trailing punctuation
    that falls outside the final word boundary (e.g. the ':' in "price:") is
    not included, and tokens with no word characters are skipped entirely.

    Parameters:
        string: text to extract words from.
        n: maximum number of words to keep.

    Returns:
        A single space-joined string of at most *n* words ("" for no matches).
    """
    # findall with a single group returns the matched words directly;
    # slicing handles len <= n on its own, so no length check is needed.
    return ' '.join(re.findall(r"(\b[^ \n]+\b)", string)[:n])
def get_page(url): def get_page(url):
page = None page = None
while not page: while not page:
@ -34,28 +39,29 @@ class price_finder:
"price":lambda page: page.find("meta",attrs = {"itemprop":"price"}).get("content") "price":lambda page: page.find("meta",attrs = {"itemprop":"price"}).get("content")
}, },
"www.gearbest.com":{ "www.gearbest.com":{
"name":lambda page:re.sub(' {2,}|\n','',page.find("div",attrs = {"class":"goodsIntro_titleWrap"}).find("h1").text),
"name":lambda page:re.sub(" {2,}|\n","",page.find("div",attrs = {"class":"goodsIntro_titleWrap"}).find("h1").text),
"price":lambda page: page.find("span",attrs={"class":"goodsIntro_price"}).text "price":lambda page: page.find("span",attrs={"class":"goodsIntro_price"}).text
}, },
"hobbyking.com":{ "hobbyking.com":{
"name":lambda page: page.find("h1",attrs={"class":"product-name"}).text, "name":lambda page: page.find("h1",attrs={"class":"product-name"}).text,
"price":lambda page: page.find("span",id = re.compile(r"product-price.*")).find("span",attrs={"class":"price"}).text "price":lambda page: page.find("span",id = re.compile(r"product-price.*")).find("span",attrs={"class":"price"}).text
}, },
'www.getfpv.com':{
'name': lambda page: re.sub(r'\\n|\n','', page.find("div",attrs={"class":"product-name"}).text),
'price': lambda page: re.sub(r'\\n|\n','', page.find('span',attrs={'id':re.compile('product-price.*')}).text)
"www.getfpv.com":{
"name": lambda page: re.sub(r"\\n|\n","", page.find("div",attrs={"class":"product-name"}).text),
"price": lambda page: re.sub(r"\\n|\n","", page.find("span",attrs={"id":re.compile("product-price.*")}).text)
} }
} }
def __init__(self,url,space_seperated_categories = 7,bs=None): def __init__(self,url,space_seperated_categories = 7,bs=None):
self.url=url self.url=url
self.info_url = urllib.parse.urlparse(url) self.info_url = urllib.parse.urlparse(url)
self.word_len = space_seperated_categories
if self.info_url.netloc not in price_finder.page_funcs.keys(): if self.info_url.netloc not in price_finder.page_funcs.keys():
raise NotImplementedError("Not implemented for {}".format(self.info_url.netloc)) raise NotImplementedError("Not implemented for {}".format(self.info_url.netloc))
if bs: if bs:
self.bs= bs self.bs= bs
else: else:
self.bs = get_BS(url) self.bs = get_BS(url)
self.words = re_words(space_seperated_categories)
# self.words = re_words(space_seperated_categories)
self.time = datetime.datetime.today() self.time = datetime.datetime.today()
self.info_product = self._get_product_info_() self.info_product = self._get_product_info_()
@ -65,9 +71,7 @@ class price_finder:
# print(self.url) # print(self.url)
return { return {
"product_name":self.words.match(
funcs["name"](self.bs)
).group(0),
"product_name":get_words(funcs["name"](self.bs),self.word_len),
"price":funcs["price"](self.bs).replace("$",""), "price":funcs["price"](self.bs).replace("$",""),
} }
Loading…
Cancel
Save