From bf5171e6608afef0571d3b058ca9ad8e19b3331f Mon Sep 17 00:00:00 2001
From: Raphael Roberts
Date: Sun, 18 Feb 2018 00:54:30 -0600
Subject: [PATCH] initial commit

---
 .gitignore       |  1 +
 __init__.py      |  0
 batch_process.py | 28 ++++++++++++++++++++++
 price_finder.py  | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 100 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 __init__.py
 create mode 100644 batch_process.py
 create mode 100644 price_finder.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ed8ebf5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
\ No newline at end of file
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/batch_process.py b/batch_process.py
new file mode 100644
index 0000000..eccf025
--- /dev/null
+++ b/batch_process.py
@@ -0,0 +1,28 @@
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+
+import saveto
+from price_finder import price_finder, BS
+
+
+def get_prices(links):
+    """Load each link in a headless Chrome session and parse it with price_finder."""
+    opts = Options()
+    opts.add_argument('--headless')
+    driver = webdriver.Chrome(chrome_options=opts)
+    try:
+        results = []
+        for link in links:
+            driver.get(link)
+            results.append(
+                price_finder(url=link, bs=BS(driver.page_source, 'lxml'))
+            )
+        return results
+    finally:
+        # Always shut the browser down, even if a page fails to load or parse.
+        driver.quit()
+
+
+if __name__ == '__main__':
+    links = saveto.load('quad_links')
+    products = get_prices(links)
diff --git a/price_finder.py b/price_finder.py
new file mode 100644
index 0000000..e1698aa
--- /dev/null
+++ b/price_finder.py
@@ -0,0 +1,71 @@
+import datetime
+import re
+import urllib.parse
+import urllib.request
+
+from bs4 import BeautifulSoup as BS
+from fake_useragent import UserAgent
+
+user_agent = UserAgent().chrome
+
+
+def re_words(n):
+    """Compile a regex matching at most n space-separated words at the start of a string."""
+    return re.compile(r"( ?[^ ]+ ?){0,%d}[^ ]+" % (n - 1))
+
+
+def get_page(url):
+    """Fetch url with a browser-like User-Agent and return the decoded HTML."""
+    request = urllib.request.Request(url, headers={"User-Agent": user_agent})
+    with urllib.request.urlopen(request) as response:
+        return response.read().decode("utf-8", errors="replace")
+
+
+def get_BS(url):
+    return BS(get_page(url), "lxml")
+
+
+class price_finder:
+    # Per-site extraction rules: each netloc maps to callables that pull the
+    # product name and price out of a parsed page.
+    page_funcs = {
+        "www.amazon.com": {
+            "name": lambda page: re.sub(r"( {2,}|\n|\\n)", "", page.find("span", id="productTitle").text),
+            "price": lambda page: page.find(name="span", id=re.compile("priceblock.*")).text,
+        },
+        "www.banggood.com": {
+            "name": lambda page: page.find("h1", attrs={"itemprop": "name"}).text,
+            "price": lambda page: page.find("div", attrs={"class": "now"}).get("oriprice"),
+        },
+        "www.dalprops.com": {
+            "name": lambda page: page.find("h1", attrs={"class": "product_title"}).text,
+            "price": lambda page: page.find("meta", attrs={"itemprop": "price"}).get("content"),
+        },
+        "www.gearbest.com": {
+            "name": lambda page: page.find("div", attrs={"class": "goods-info-top"}).find("h1").text,
+            "price": lambda page: page.find(id="unit_price").get("data-orgp"),
+        },
+        "hobbyking.com": {
+            "name": lambda page: page.find("h1", attrs={"class": "product-name"}).text,
+            "price": lambda page: page.find("span", id=re.compile(r"product-price.*")).find("span", attrs={"class": "price"}).text,
+        },
+    }
+
+    def __init__(self, url, space_separated_categories=7, bs=None):
+        self.url = url
+        self.info_url = urllib.parse.urlparse(url)
+        if self.info_url.netloc not in price_finder.page_funcs:
+            raise NotImplementedError("Not implemented for {}".format(self.info_url.netloc))
+        # Reuse an already-parsed page (e.g. from Selenium) if one was supplied.
+        self.bs = bs if bs else get_BS(url)
+        self.words = re_words(space_separated_categories)
+        self.time = datetime.datetime.today()
+        self.info_product = self._get_product_info_()
+
+    def _get_product_info_(self):
+        funcs = price_finder.page_funcs[self.info_url.netloc]
+        return {
+            # Keep only the first few words of the product title.
+            "product_name": self.words.match(funcs["name"](self.bs)).group(0),
+            "price": funcs["price"](self.bs).replace("$", ""),
+        }
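
A minimal usage sketch for the price_finder class above, assuming fake_useragent, beautifulsoup4, and lxml are installed; the product URL is a placeholder, not a real listing:

    # Sketch only: parse one supported product page directly, without Selenium.
    from price_finder import price_finder

    item = price_finder("https://www.amazon.com/dp/XXXXXXXXXX")  # placeholder URL
    print(item.info_product)  # {'product_name': '<first few words of title>', 'price': '<price>'}
    print(item.time)          # datetime recorded when the page was parsed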