|
|
@ -1,9 +1,14 @@ |
|
|
from lxml.etree import etree |
|
|
|
|
|
|
|
|
from bs4 import UnicodeDammit |
|
|
|
|
|
from lxml import html |
|
|
|
|
|
import json |
|
|
|
|
|
# Load the per-site XPath tables once at import time.  The scraper base class
# indexes this mapping by each scraper's ``base_url``.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale encoding.
# NOTE(review): the path is relative to the current working directory, not to
# this module - confirm that is intended.
with open('xpaths.json', encoding='utf-8') as file:
    XPATHS = json.load(file)
|
|
class __price_scraper__:
    """Base class for site-specific price scrapers.

    Concrete scrapers must define a ``base_url`` class attribute; it is the
    key used to select this site's XPath expressions from the module-level
    ``XPATHS`` mapping.
    """

    def __init__(self, url, page_source):
        """Parse *page_source* and pick the XPath table for this site.

        Args:
            url: Address of the page; stored unchanged on ``self.url``.
            page_source: HTML markup passed to ``lxml.html.fromstring``.

        Raises:
            AttributeError: The concrete class does not define ``base_url``.
            KeyError: ``XPATHS`` has no entry for ``base_url``.
        """
        self.url = url
        self.page_source = page_source
        self.xpaths = XPATHS[self.base_url]
        self.etree = html.fromstring(self.page_source)

    def scrape(self):
        """Evaluate the 'price' and 'name' XPaths and store the results.

        Each result is stored on the instance under the same name
        (``self.price``, ``self.name``).
        """
        for cat in ('price', 'name'):
            # Must go through ``self``: ``eval_xpath`` is a method, not a
            # module-level function, so a bare call raises NameError.
            setattr(self, cat, self.eval_xpath(cat))

    def eval_xpath(self, name):
        """Return the result of evaluating the XPath registered as *name*.

        Raises:
            KeyError: No XPath is registered under *name* for this site.
        """
        return self.etree.xpath(self.xpaths[name])

    def __get_stock__(self):
        """Return the raw result of this site's 'stock' XPath."""
        return self.eval_xpath('stock')
|
|
|
|
|
|
|
|
class amazon_scraper(__price_scraper__):
    """Scraper for product pages served from www.amazon.com."""

    # Key used by the base class to select this site's XPath table.
    base_url = "www.amazon.com"

    def in_stock(self):
        """Return True unless the stock lookup equals the sentinel string.

        NOTE(review): 'poop' looks like a placeholder sentinel - confirm the
        real out-of-stock marker for this site before relying on the result.
        """
        return self.__get_stock__() != 'poop'
|
|
|
|
|
|
|
|
|
|
|
class banggood_scraper(__price_scraper__):
    """Scraper for product pages served from www.banggood.com."""

    # Key used by the base class to select this site's XPath table.
    base_url = "www.banggood.com"

    def in_stock(self):
        """Return False only when the stock text reads 'out of stock'.

        The comparison is case-insensitive and ignores surrounding
        whitespace.  A lookup that matches nothing, or an element without
        text, is treated as "not marked out of stock" and yields True.
        """
        stock_val = self.__get_stock__()
        # lxml's ``xpath()`` returns a list for node-set expressions, so
        # calling ``.text`` on the raw result fails; use the first match.
        if isinstance(stock_val, (list, tuple)):
            stock_val = stock_val[0] if stock_val else None
        # Elements expose ``.text``; string results are used as they are.
        text = getattr(stock_val, 'text', stock_val)
        if text is None:
            text = ''
        return str(text).strip().lower() != 'out of stock'
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: build a Banggood scraper from a saved HTML fixture.
    import os

    # Join the path components so the fixture is found on POSIX as well as on
    # Windows (a literal backslash is only a separator on Windows).
    test = os.path.join('test', 'bg.html')
    with open(test, 'rb') as file:
        # Read raw bytes and let UnicodeDammit detect the encoding and decode
        # the markup to text.
        source = UnicodeDammit(file.read()).unicode_markup

    res = banggood_scraper('idek', source)