From 08a01f4a334741cfdefa52e861ece168a47bf3d0 Mon Sep 17 00:00:00 2001
From: rlbr
Date: Wed, 25 Jul 2018 13:39:35 -0500
Subject: [PATCH] added get_words func to make finding the words more robust
 than just re, modifications to get_prices

---
 batch_process.py | 46 ++++++++++++++++++++++++++++++++--------------
 price_finder.py  | 22 +++++++++++++---------
 2 files changed, 45 insertions(+), 23 deletions(-)

diff --git a/batch_process.py b/batch_process.py
index f166c4b..01fa96f 100644
--- a/batch_process.py
+++ b/batch_process.py
@@ -1,9 +1,10 @@
 from price_finder import price_finder,BS
 from itertools import cycle
-from requests_html import HTMLSession
+import requests
+import requests_html
 from ipaddress import ip_address
 def get_proxies(country = 'United States'):
-    ses = HTMLSession()
+    ses = requests_html.HTMLSession()
     r = ses.get('https://free-proxy-list.net/')
     page = BS(r.html.raw_html,'lxml')
     table = page.find(id='proxylisttable')
@@ -28,15 +29,16 @@ def get_proxies(country = 'United States'):
             print(row)
             raise e
     return cycle(proxies)
-proxies = get_proxies()
 def get_prices(links):
-    ret = []
-    s = HTMLSession()
+    proxies = get_proxies()
+    s = requests_html.HTMLSession()
+    ret = []
     bad_proxies= set()
     for link in links:
+        page = None
+        render_tries = 0
         print(link)
-        while True:
+        while not page:
             proxy = next(proxies)
             while proxy in bad_proxies:
                 proxy = next(proxies)
@@ -44,14 +46,30 @@ def get_prices(links):
             try:
                 r = s.get(link,proxies={'http':proxy,'https':proxy})
                 print('got')
-                r.html.render()
-                print('rendered')
-                ret.append(price_finder(link,bs=BS(r.html.raw_html,'lxml')))
-                break
-            except Exception as e:
-                print(e)
+                try:
+                    render_tries += 1
+                    r.html.render()
+                    print('rendered')
+                except requests_html.MaxRetries:
+                    if render_tries > 2:
+                        pass
+                    else:
+                        print('!'+proxy)
+                        bad_proxies.update([proxy])
+                        continue
+                page = r.html.raw_html
+                ret.append(price_finder(link,bs=BS(page,'lxml')))
+
+            except (requests.exceptions.ProxyError,requests.exceptions.SSLError):
                 print('!'+proxy)
                 bad_proxies.update([proxy])
-                pass
+
+    print(bad_proxies)
     s.close()
-    return ret
\ No newline at end of file
+    return ret
+if __name__ == "__main__":
+    import saveto
+    import random
+    ql = saveto.load('quad_links')
+    random.shuffle(ql)
+    products = get_prices(ql)
\ No newline at end of file
diff --git a/price_finder.py b/price_finder.py
index ed313e2..7eca6bc 100644
--- a/price_finder.py
+++ b/price_finder.py
@@ -6,8 +6,13 @@
 import re
 import datetime
 user_agent = UserAgent().chrome
-re_words = lambda n: re.compile(r"( ?[^ ]+ ?)"+"{0,"+str(n-1)+"}"+r"[^ ]+")
 debug = None
+def get_words(string,n):
+    words = re.finditer(r"(\b[^ \n]+\b)",string)
+    word_list = list(match.group(0) for match in words)
+    if len(word_list) > n:
+        word_list = word_list[:n]
+    return ' '.join(word_list)
 def get_page(url):
     page = None
     while not page:
@@ -34,28 +39,29 @@ class price_finder:
             "price":lambda page: page.find("meta",attrs = {"itemprop":"price"}).get("content")
         },
         "www.gearbest.com":{
-            "name":lambda page:re.sub(' {2,}|\n','',page.find("div",attrs = {"class":"goodsIntro_titleWrap"}).find("h1").text),
+            "name":lambda page:re.sub(" {2,}|\n","",page.find("div",attrs = {"class":"goodsIntro_titleWrap"}).find("h1").text),
             "price":lambda page: page.find("span",attrs={"class":"goodsIntro_price"}).text
         },
         "hobbyking.com":{
             "name":lambda page: page.find("h1",attrs={"class":"product-name"}).text,
             "price":lambda page: page.find("span",id = re.compile(r"product-price.*")).find("span",attrs={"class":"price"}).text
         },
-        'www.getfpv.com':{
-            'name': lambda page: re.sub(r'\\n|\n','', page.find("div",attrs={"class":"product-name"}).text),
-            'price': lambda page: re.sub(r'\\n|\n','', page.find('span',attrs={'id':re.compile('product-price.*')}).text)
+        "www.getfpv.com":{
+            "name": lambda page: re.sub(r"\\n|\n","", page.find("div",attrs={"class":"product-name"}).text),
+            "price": lambda page: re.sub(r"\\n|\n","", page.find("span",attrs={"id":re.compile("product-price.*")}).text)
         }
     }
     def __init__(self,url,space_seperated_categories = 7,bs=None):
         self.url=url
         self.info_url = urllib.parse.urlparse(url)
+        self.word_len = space_seperated_categories
         if self.info_url.netloc not in price_finder.page_funcs.keys():
             raise NotImplementedError("Not implemented for {}".format(self.info_url.netloc))
         if bs:
             self.bs= bs
         else:
             self.bs = get_BS(url)
-        self.words = re_words(space_seperated_categories)
+        # self.words = re_words(space_seperated_categories)
         self.time = datetime.datetime.today()
         self.info_product = self._get_product_info_()
@@ -65,9 +71,7 @@ class price_finder:
 #        print(self.url)
         return {
-            "product_name":self.words.match(
-                funcs["name"](self.bs)
-            ).group(0),
+            "product_name":get_words(funcs["name"](self.bs),self.word_len),
             "price":funcs["price"](self.bs).replace("$",""),
         }
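
Usage sketch (illustration only, not part of the patch): the get_words helper added above keeps at most the first n whitespace-separated words of a scraped product name, and _get_product_info_ now calls it with word_len (default 7) instead of matching against the old re_words regex. The sample title below is hypothetical:

    import re

    # get_words as added to price_finder.py in this patch:
    # keep at most the first n whitespace-separated words of a string.
    def get_words(string,n):
        words = re.finditer(r"(\b[^ \n]+\b)",string)
        word_list = list(match.group(0) for match in words)
        if len(word_list) > n:
            word_list = word_list[:n]
        return ' '.join(word_list)

    # Hypothetical scraped product title (ten words).
    title = "Eachine Wizard X220 FPV Racing Drone BNF with F3 6DOF"
    print(get_words(title, 7))
    # prints: Eachine Wizard X220 FPV Racing Drone BNF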