Compare commits
merge into: rlbr:master
pull from: rlbr:xpath
2 Commits

| SHA1 | Message | Date |
|---|---|---|
| b0dbd9fa03 | Not done more changes will be made but big first steps | 7 years ago |
| 891684741a | initial commit | 7 years ago |
6 changed files with 231 additions and 109 deletions

- .gitignore (4)
- batch_process.py (117)
- get_link.py (66)
- price_finder.py (97)
- proxy_class.py (21)
- xpaths.json (27)
.gitignore

```diff
@@ -1,2 +1,6 @@
 __pycache__
 .gitignore
+/uBlock0.chromium
+/bg.html
+/test_this_bullshit.py
+/output
```
get_link.py

```diff
@@ -0,0 +1,66 @@
+import pyppeteer
+import pyppeteer.errors
+import asyncio
+import os
+
+async def _get_link(browser,link,xpath):
+    pages = await browser.pages()
+    page = pages[0]
+    await page.goto(link,waitUntil='domcontentloaded')
+
+    xpath = [xpath['name'],xpath['price']]
+    for _xpath in xpath:
+        print(repr(_xpath))
+        try:
+            await page.waitForXPath(_xpath)
+        except pyppeteer.errors.TimeoutError:
+            pass
+    await asyncio.sleep(1)
+    webpage = None
+    for i in range(20):
+        try:
+            webpage = await page.content()
+            break
+        except:
+            await asyncio.sleep(1)
+    return webpage
+
+async def _single_link(browser,link,xpath):
+    webpage = await _get_link(browser,link,xpath)
+    await browser.close()
+    return webpage
+
+async def _multi_link(browser,links,xpaths):
+    results = {}
+    for link in links:
+        xpath = xpaths[link]
+        webpage = await _get_link(browser,link,xpath)
+        results[link] = webpage
+    await browser.close()
+    return results
+
+def get_link(links,xpaths,headless = False,proxy = None):
+    loop = asyncio.get_event_loop()
+    run = loop.run_until_complete
+    opts = {
+        'headless':headless,
+    }
+    if proxy:
+        opts['args'] = [f'--proxy-server={proxy}']
+
+    else:
+        opts['args'] = []
+    ext = os.path.join(os.path.dirname(__file__),'uBlock0.chromium')
+    opts['args'] += [f'--disable-extensions-except={ext}', f'--load-extension={ext}']
+    # print(opts)
+    browser = run(pyppeteer.launch(**opts))
+    try:
+        if isinstance(links,list):
+            result = run(_multi_link(browser,links,xpaths))
+        else:
+            result = run(_single_link(browser,links,xpaths[links]))
+        return result
+    except Exception as e:
+        run(browser.close())
+        raise e
```
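For orientation, here is a minimal usage sketch of the new `get_link` entry point. This caller is assumed, not part of the diff: the placeholder URL and the per-link re-keying of the xpaths mapping are illustrative (`_single_link`/`_multi_link` look the xpath dict up by the link itself, not by host).

```python
# Assumed caller for get_link.py -- not part of this diff.
# The URL is a placeholder; xpaths.json is the file added in this PR.
import json

from get_link import get_link

with open('xpaths.json') as f:
    xpaths_by_host = json.load(f)

url = 'https://www.banggood.com/example-product.html'  # placeholder
# get_link indexes the mapping by the link passed in, so re-key by URL here.
html = get_link(url, {url: xpaths_by_host['www.banggood.com']}, headless=True)
print(html is not None)
```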
price_finder.py

```diff
@@ -1,77 +1,62 @@
-import urllib
-from fake_useragent import UserAgent
-from bs4 import BeautifulSoup as BS
-from requests_html import HTMLSession
 import re
 import datetime
+import copy
+import json
+
+with open('xpaths.json') as file:
+    xpaths = json.load(file)
+
-user_agent = UserAgent().chrome
-debug = None
-def get_words(string,n):
-    words = re.finditer(r"(\b[^ \n]+\b)",string)
+def get_words(raw,n):
+    words = re.finditer(r"(\b[^ \n]+\b)",raw)
     word_list = list(match.group(0) for match in words)
     if len(word_list) > n:
         word_list = word_list[:n]
     return ' '.join(word_list)
-def get_page(url):
-    page = None
-    while not page:
-        page = urllib.request.Request(url,headers = {"User-Agent":user_agent})
-        page = str(urllib.request.urlopen(page).read())
-
-    return page
+def format_price(raw):
+    return re.search(r'\d+(\.\d)?',raw).group(0)
+
-def get_BS(url):
-    return BS(get_page(url),"lxml")
+class ParseResult:
+
-class price_finder:
-    page_funcs = {
-        "www.amazon.com":{
-            "name":lambda page: re.sub(r"( {2,}|\n|\\n)","",page.find("span",id="productTitle").text),
-            "price":lambda page: page.find(name = "span",id = re.compile("priceblock.*")).text
-        },
-        "www.banggood.com":{
-            "name":lambda page: page.find("h1",attrs = {"itemprop":"name"}).text,
-            "price":lambda page: page.find("div",attrs = {"class":"now"}).get("oriprice")
-        },
-        "www.dalprops.com":{
-            "name":lambda page: page.find("h1",attrs = {"class":"product_title"}).text,
-            "price":lambda page: page.find("meta",attrs = {"itemprop":"price"}).get("content")
-        },
-        "www.gearbest.com":{
-            "name":lambda page:re.sub(" {2,}|\n","",page.find("div",attrs = {"class":"goodsIntro_titleWrap"}).find("h1").text),
-            "price":lambda page: page.find("span",attrs={"class":"goodsIntro_price"}).text
-        },
-        "hobbyking.com":{
-            "name":lambda page: page.find("h1",attrs={"class":"product-name"}).text,
-            "price":lambda page: page.find("span",id = re.compile(r"product-price.*")).find("span",attrs={"class":"price"}).text
-        },
-        "www.getfpv.com":{
-            "name": lambda page: re.sub(r"\\n|\n","", page.find("div",attrs={"class":"product-name"}).text),
-            "price": lambda page: re.sub(r"\\n|\n","", page.find("span",attrs={"id":re.compile("product-price.*")}).text)
-        }
-    }
-    def __init__(self,url,space_seperated_categories = 7,bs=None):
+    def __init__(self,url,tree,space_seperated_categories = 7):
         self.url=url
         self.info_url = urllib.parse.urlparse(url)
         self.word_len = space_seperated_categories
-        if self.info_url.netloc not in price_finder.page_funcs.keys():
-            raise NotImplementedError("Not implemented for {}".format(self.info_url.netloc))
-        if bs:
-            self.bs= bs
-        else:
-            self.bs = get_BS(url)
-        # self.words = re_words(space_seperated_categories)
+        self.tree = tree
+
         self.time = datetime.datetime.today()
         self.info_product = self._get_product_info_()
+
     def _get_product_info_(self):
-        funcs = price_finder.page_funcs[self.info_url.netloc]
-        # print(self.url)
+        host = self.info_url.netloc
+        product_name = get_words(
+            self.tree.xpath(xpaths[host]['name'])[0].text,
+            self.word_len
+        )
+        other_raw = None
+        try:
+            other_raw = self.tree.xpath(xpaths[host]['other'])[0].text
+        except KeyError:
+            pass
+
+        if host in ['www.gearbest.com']:
+            if other_raw:
+                price = '0.00'
+            else:
+                price = format_price(
+                    self.tree.xpath(xpaths[host]['price'])[0].text
+                )
+        else:
+            price = format_price(
+                self.tree.xpath(xpaths[host]['price'])[0].text
+            )
         return {
-            "product_name":get_words(funcs["name"](self.bs),self.word_len),
-            "price":funcs["price"](self.bs).replace("$",""),
+            "product_name":product_name,
+            "price":price,
         }
```
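The rewrite drops the BeautifulSoup/`page_funcs` path: `ParseResult` now expects a pre-parsed element tree and reads its XPath strings from `xpaths.json`. The `tree.xpath(...)` calls match `lxml.html`'s API, so a plausible (assumed, not shown in this PR) glue layer looks like this; the URL is a placeholder.

```python
# Assumed pipeline tying the pieces in this PR together: render with
# get_link, parse with lxml, extract name/price with ParseResult.
import lxml.html

from get_link import get_link
from price_finder import ParseResult, xpaths

url = 'https://hobbyking.com/example-product.html'  # placeholder
html = get_link(url, {url: xpaths['hobbyking.com']}, headless=True)
tree = lxml.html.fromstring(html)
result = ParseResult(url, tree)
print(result.info_product)  # {'product_name': ..., 'price': ...}
```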
proxy_class.py

```diff
@@ -0,0 +1,21 @@
+class proxy_iter:
+    def __init__(self,proxies):
+        self._proxies = set(proxies)
+        self.proxies = self._proxies.copy()
+        self.bad_proxies = set()
+        # self.used_proxies = {}
+
+    def __next__(self):
+        self.proxies -= self.bad_proxies
+        if len(self.proxies) == 0:
+            raise StopIteration
+
+        elem = self.proxies.pop()
+        if len(self.proxies) == 0:
+            self.proxies = self._proxies.copy()
+        return elem
+
+    def __iter__(self):
+        return self
+    def blacklist(self,proxy):
+        self.bad_proxies.add(proxy)
```
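`proxy_iter` refills the working pool from the original set whenever it empties, and `__next__` raises `StopIteration` only once every proxy has been blacklisted. A small rotation sketch with a hypothetical `fetch_ok` helper and placeholder addresses:

```python
# Rotation sketch; fetch_ok stands in for a real request attempt.
from proxy_class import proxy_iter

def fetch_ok(proxy):
    return False  # pretend every proxy fails, to show exhaustion

pool = proxy_iter(['10.0.0.1:8080', '10.0.0.2:3128'])
for proxy in pool:
    if fetch_ok(proxy):
        break                # found a working proxy
    pool.blacklist(proxy)    # dropped from rotation on the next pass
# With every proxy blacklisted, __next__ raises StopIteration and the loop ends.
```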
xpaths.json

```diff
@@ -0,0 +1,27 @@
+{
+    "www.banggood.com": {
+        "name": "//h1[@itemprop='name']",
+        "price": "//div[@class='now']"
+    },
+    "www.gearbest.com": {
+        "name": "//h1[@class='goodsIntro_title']",
+        "price": "//span[contains(@class,'goodsIntro_price')]",
+        "other": "//div[@class='goodsIntro_noticeSubmit']"
+    },
+    "www.amazon.com": {
+        "name": "//span[@id='productTitle']",
+        "price": "//span[@id='priceblock_dealprice' or @id='priceblock_ourprice']"
+    },
+    "www.getfpv.com": {
+        "name": "//div[@class='product-name']/span",
+        "price": "//div[@class='price-box']/p[@class='special-price']/span[@class='price'] | //div[@class='price-box']/span[@class='regular-price']/span"
+    },
+    "www.dalprops.com": {
+        "name": "//h1[@itemprop='name']",
+        "price": "//*[@id='product-price']"
+    },
+    "hobbyking.com": {
+        "name": "//h1[contains(@class,'product-name')]",
+        "price": "//p[@class='special-price']/span[@class='price'] | //span[@class='regular-price']/span[@class='price']"
+    }
+}
```
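Individual entries can be sanity-checked against static markup without launching a browser. An illustrative check with `lxml` (the HTML snippet is made up):

```python
# Illustrative check of one xpaths.json entry against made-up markup.
import lxml.html

snippet = "<html><body><h1 itemprop='name'>Example propeller set</h1></body></html>"
tree = lxml.html.fromstring(snippet)
print(tree.xpath("//h1[@itemprop='name']")[0].text)  # Example propeller set
```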