|
|
import datetime
import re
import urllib
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup as BS
from fake_useragent import UserAgent
from requests_html import HTMLSession
# User-Agent header value sent with every request, taken from
# fake_useragent's ``chrome`` attribute.
user_agent = UserAgent().chrome
# Module-level flag; not referenced by the code visible in this part of the file.
debug = None


def get_words(string, n):
    """Return the first *n* words of *string*, joined by single spaces.

    A word is a run of characters that are neither a space nor a newline,
    bounded by regex word boundaries (pattern ``\\b[^ \\n]+\\b``).

    Args:
        string: Text to extract words from.
        n: Maximum number of words to keep. Slice semantics apply, so a
            negative value drops that many words from the end.

    Returns:
        The selected words joined with ``' '``; an empty string when no
        word is found.
    """
    word_list = [match.group(0) for match in re.finditer(r"(\b[^ \n]+\b)", string)]
    # Slicing already handles "fewer than n words", so no length check is needed.
    return ' '.join(word_list[:n])


def get_page(url):
    """Download *url* and return the response body as text.

    The request carries the module-level ``user_agent`` string as its
    ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The body decoded with the charset declared in the response headers,
        falling back to UTF-8. Undecodable bytes are replaced rather than
        raising.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    request = urllib.request.Request(url, headers={"User-Agent": user_agent})
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request) as response:
        raw = response.read()
        charset = response.headers.get_content_charset() or "utf-8"
    # Decode the bytes instead of calling str() on them: str(bytes) yields the
    # "b'...'" repr, with newlines and non-ASCII characters as escape sequences.
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The server declared a charset Python does not know.
        return raw.decode("utf-8", errors="replace")
def get_BS(url):
    """Fetch *url* with ``get_page`` and parse the result with the "lxml" parser.

    Args:
        url: Address of the page to download.

    Returns:
        The ``BS`` (BeautifulSoup) object built from the downloaded markup.
    """
    markup = get_page(url)
    soup = BS(markup, "lxml")
    return soup
class price_finder:
    """Extract a product's name and price from a supported shop page.

    ``page_funcs`` maps a URL host (``netloc``) to two extractor callables,
    ``"name"`` and ``"price"``. Each takes the parsed page and returns a
    string pulled from it.
    """

    page_funcs = {
        "www.amazon.com": {
            "name": lambda page: re.sub(
                r"( {2,}|\n|\\n)",
                "",
                page.find("span", id="productTitle").text,
            ),
            "price": lambda page: page.find(
                name="span", id=re.compile("priceblock.*")
            ).text,
        },
        "www.banggood.com": {
            "name": lambda page: page.find(
                "h1", attrs={"itemprop": "name"}
            ).text,
            "price": lambda page: page.find(
                "div", attrs={"class": "now"}
            ).get("oriprice"),
        },
        "www.dalprops.com": {
            "name": lambda page: page.find(
                "h1", attrs={"class": "product_title"}
            ).text,
            "price": lambda page: page.find(
                "meta", attrs={"itemprop": "price"}
            ).get("content"),
        },
        "www.gearbest.com": {
            "name": lambda page: re.sub(
                " {2,}|\n",
                "",
                page.find("div", attrs={"class": "goodsIntro_titleWrap"})
                .find("h1")
                .text,
            ),
            "price": lambda page: page.find(
                "span", attrs={"class": "goodsIntro_price"}
            ).text,
        },
        "hobbyking.com": {
            "name": lambda page: page.find(
                "h1", attrs={"class": "product-name"}
            ).text,
            "price": lambda page: page.find(
                "span", id=re.compile(r"product-price.*")
            )
            .find("span", attrs={"class": "price"})
            .text,
        },
        "www.getfpv.com": {
            "name": lambda page: re.sub(
                r"\\n|\n",
                "",
                page.find("div", attrs={"class": "product-name"}).text,
            ),
            "price": lambda page: re.sub(
                r"\\n|\n",
                "",
                page.find(
                    "span", attrs={"id": re.compile("product-price.*")}
                ).text,
            ),
        },
    }

    def __init__(self, url, space_seperated_categories=7, bs=None):
        """Parse *url*, obtain the page and extract the product info.

        Args:
            url: Product page address; its host must be a key of
                ``page_funcs``.
            space_seperated_categories: Maximum number of words kept in the
                product name.
            bs: Optional already-parsed page. When falsy, the page is
                fetched with ``get_BS(url)``.

        Raises:
            NotImplementedError: If the URL's host has no entry in
                ``page_funcs``.
        """
        self.url = url
        self.info_url = urllib.parse.urlparse(url)
        self.word_len = space_seperated_categories

        host = self.info_url.netloc
        if host not in price_finder.page_funcs:
            raise NotImplementedError("Not implemented for {}".format(host))

        # Only hit the network when the caller did not supply a parsed page.
        self.bs = bs if bs else get_BS(url)
        self.time = datetime.datetime.today()
        self.info_product = self._get_product_info_()

    def _get_product_info_(self):
        """Run the host's extractors on the stored page.

        Returns:
            A dict with ``"product_name"`` (the extracted name truncated by
            ``get_words`` to ``self.word_len`` words) and ``"price"`` (the
            extracted price with every ``"$"`` removed).
        """
        extractors = price_finder.page_funcs[self.info_url.netloc]
        raw_name = extractors["name"](self.bs)
        product_name = get_words(raw_name, self.word_len)
        price = extractors["price"](self.bs).replace("$", "")
        return {
            "product_name": product_name,
            "price": price,
        }
|