Browse Source

initial commit

master
Raphael Roberts 8 years ago
commit
bf5171e660
  1. 1
      .gitignore
  2. 0
      __init__.py
  3. 23
      batch_process.py
  4. 64
      price_finder.py

1
.gitignore

@ -0,0 +1 @@
__pycache__

0
__init__.py

23
batch_process.py

@ -0,0 +1,23 @@
from selenium import webdriver
from price_finder import price_finder,BS
def get_prices(links):
    """Scrape every URL in *links* with one headless Chrome session.

    Each page is loaded in the browser, its rendered source is parsed with
    the lxml parser, and the parsed page is handed to ``price_finder`` so no
    second HTTP request is made for it.

    Args:
        links: iterable of product page URLs.

    Returns:
        A list of ``price_finder`` objects, in the same order as *links*.

    Raises:
        Any exception raised while starting the browser, loading a page or
        extracting product data propagates unchanged.  Once the browser has
        started it is always shut down before the exception leaves.
    """
    opts = webdriver.chrome.options.Options()
    opts.add_argument('--headless')
    # Headless mode is requested through the options object; the driver
    # constructor itself has no ``headless`` keyword.
    driver = webdriver.Chrome(options=opts)
    # The driver is created *before* the try block so the cleanup below can
    # never reference an unbound name if start-up itself fails.
    try:
        results = []
        for link in links:
            driver.get(link)
            results.append(
                price_finder(url=link, bs=BS(driver.page_source, 'lxml'))
            )
        return results
    finally:
        driver.quit()
import saveto
# Module-level script: these statements run as soon as the module is executed
# or imported.
# NOTE(review): `saveto` is not visible here; presumably load('quad_links')
# returns the stored collection of product URLs -- confirm against saveto.
links = saveto.load('quad_links')
# Scrape all of the loaded links; the result of get_prices is kept in
# `products` for later use.
products = get_prices(links)

64
price_finder.py

@ -0,0 +1,64 @@
import urllib
from fake_useragent import UserAgent
from bs4 import BeautifulSoup as BS
import re
import datetime
# Value sent as the "User-Agent" request header by get_page.  It is read once,
# at import time, from fake_useragent's UserAgent().chrome.
# NOTE(review): presumably a Chrome browser user-agent string -- confirm
# against the fake_useragent documentation.
user_agent = UserAgent().chrome
def re_words(n):
    """Compile a pattern covering up to *n* space-separated words.

    The pattern allows at most ``n - 1`` repetitions of "word with an
    optional single space on either side", followed by one final word.  Used
    with ``.match`` it yields the leading words of a string.  *n* is expected
    to be at least 1.
    """
    max_leading = str(n - 1)
    pattern = r"( ?[^ ]+ ?)" + "{0," + max_leading + "}" + r"[^ ]+"
    return re.compile(pattern)
def get_page(url):
    """Download *url* and return the response body as decoded text.

    The request carries the module-level ``user_agent`` value as its
    ``User-Agent`` header.  The body is decoded with the charset announced in
    the response headers, falling back to UTF-8; undecodable bytes are
    replaced rather than raising.

    Args:
        url: address of the page to fetch.

    Returns:
        The page source as a ``str``.

    Raises:
        urllib.error.URLError: if the request fails (``HTTPError`` for HTTP
            error statuses).
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly instead of relying on another module having done so.
    import urllib.request

    request = urllib.request.Request(url, headers={"User-Agent": user_agent})
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request) as response:
        raw = response.read()
        charset = response.headers.get_content_charset() or "utf-8"
    # Decode the bytes properly: str(raw) would return the "b'...'" repr,
    # with newlines and non-ASCII characters left as escape sequences.
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The server announced a charset name Python does not know.
        return raw.decode("utf-8", errors="replace")
def get_BS(url):
    """Fetch *url* with ``get_page`` and parse the result using the lxml parser."""
    markup = get_page(url)
    soup = BS(markup, "lxml")
    return soup
class price_finder:
    """Extract a product's name and price from a supported shop page.

    ``page_funcs`` maps a host name to a pair of extractors, ``"name"`` and
    ``"price"``.  Each extractor receives the parsed page and returns the raw
    text found there.  A URL whose host has no entry is rejected with
    ``NotImplementedError``.
    """

    page_funcs = {
        "www.amazon.com":{
            "name":lambda page: re.sub(r"( {2,}|\n|\\n)","",page.find("span",id="productTitle").text),
            "price":lambda page: page.find(name = "span",id = re.compile("priceblock.*")).text
        },
        "www.banggood.com":{
            "name":lambda page: page.find("h1",attrs = {"itemprop":"name"}).text,
            "price":lambda page: page.find("div",attrs = {"class":"now"}).get("oriprice")
        },
        "www.dalprops.com":{
            "name":lambda page: page.find("h1",attrs = {"class":"product_title"}).text,
            "price":lambda page: page.find("meta",attrs = {"itemprop":"price"}).get("content")
        },
        "www.gearbest.com":{
            "name":lambda page: page.find("div",attrs = {"class":"goods-info-top"}).find("h1").text,
            "price":lambda page: page.find(id="unit_price").get("data-orgp")
        },
        "hobbyking.com":{
            "name":lambda page: page.find("h1",attrs={"class":"product-name"}).text,
            "price":lambda page: page.find("span",id = re.compile(r"product-price.*")).find("span",attrs={"class":"price"}).text
        }
    }

    def __init__(self, url, space_seperated_categories=7, bs=None):
        """Parse *url* and extract the product information immediately.

        Args:
            url: product page address; its host must be a ``page_funcs`` key
                (compared case-insensitively).
            space_seperated_categories: maximum number of words kept from the
                product name.
            bs: an already parsed page.  When omitted (or falsy) the page is
                downloaded and parsed with ``get_BS``.

        Raises:
            NotImplementedError: if the host of *url* is not supported.
            ValueError: if the extracted product name is empty.
        """
        self.url = url
        self.info_url = urllib.parse.urlparse(url)
        if self._site_key() not in price_finder.page_funcs:
            raise NotImplementedError("Not implemented for {}".format(self.info_url.netloc))
        # Reuse the caller's parsed page when given, otherwise fetch it.
        self.bs = bs or get_BS(url)
        # Compiled pattern that limits the product name to the first N words.
        self.words = re_words(space_seperated_categories)
        # Local (naive) timestamp of when this lookup was made.
        self.time = datetime.datetime.today()
        self.info_product = self._get_product_info_()

    def _site_key(self):
        """Return the lower-cased host used to index ``page_funcs``.

        Host names are case-insensitive, so ``WWW.Amazon.com`` must select the
        same extractors as ``www.amazon.com``.
        """
        return self.info_url.netloc.lower()

    def _get_product_info_(self):
        """Run the site's extractors on the parsed page.

        Returns:
            A dict with ``"product_name"`` (the leading words of the name,
            whitespace-normalised) and ``"price"`` (the price text without
            ``$`` signs or surrounding whitespace).

        Raises:
            ValueError: if the page yields an empty product name.
        """
        funcs = price_finder.page_funcs[self._site_key()]
        raw_name = funcs["name"](self.bs)
        # Element text often carries newlines and indentation.  Without this
        # normalisation the word pattern would match only the leading
        # whitespace, or nothing at all.
        name = " ".join(raw_name.split())
        found = self.words.match(name)
        if found is None:
            raise ValueError("No product name found at {}".format(self.url))
        return {
            "product_name": found.group(0),
            "price": funcs["price"](self.bs).replace("$", "").strip(),
        }
Loading…
Cancel
Save