
stopped using Selenium and switched to rotating proxies + updated the tags sought after

master
Raphael Roberts 8 years ago
parent commit 00fb6d11d2
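In outline, the commit swaps headless-Chrome page loads for requests_html sessions routed through free HTTPS proxies, cycling through the proxy list with itertools.cycle and blacklisting proxies that fail. A minimal sketch of that rotation pattern, using placeholder TEST-NET addresses rather than anything scraped from free-proxy-list.net:

    from itertools import cycle

    proxies = cycle(['203.0.113.10:3128', '203.0.113.11:8080'])  # placeholders
    bad_proxies = set()

    def next_good_proxy():
        # Walk the cycle, skipping proxies that failed earlier. Note that this
        # loops forever once every proxy has been blacklisted.
        proxy = next(proxies)
        while proxy in bad_proxies:
            proxy = next(proxies)
        return proxy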
  1. batch_process.py (79 lines changed)
  2. price_finder.py (11 lines changed)

batch_process.py (79 lines changed)

@@ -1,28 +1,57 @@
-from selenium import webdriver
 from price_finder import price_finder,BS
+from itertools import cycle
+from requests_html import HTMLSession
+from ipaddress import ip_address
+
+def get_proxies(country='United States'):
+    # Scrape free-proxy-list.net and return an endless cycle of "ip:port"
+    # strings for HTTPS-capable proxies in the requested country.
+    ses = HTMLSession()
+    r = ses.get('https://free-proxy-list.net/')
+    page = BS(r.html.raw_html,'lxml')
+    table = page.find(id='proxylisttable')
+    headers,*rows = table.find_all('tr')
+    headers = list(tag.text.lower() for tag in headers.find_all('th'))
+    ip,port = headers.index('ip address'),headers.index('port')
+    https_support = headers.index('https')
+    country_id = headers.index('country')
+    proxies = []
+    for row in rows:
+        if row.find('td'):
+            tr = list(tag.text for tag in row.find_all('td'))
+            try:
+                try:
+                    ip_address(tr[ip])                  # validate the address
+                    assert 0 <= int(tr[port]) < 2**16   # validate the port
+                    if tr[https_support] == "yes" and tr[country_id] == country:
+                        proxies.append('{}:{}'.format(tr[ip],tr[port]))
+                except (ValueError,AssertionError):
+                    pass                                # skip malformed rows
+            except Exception as e:
+                print(row)                              # show the offending row
+                raise e
+    return cycle(proxies)
+
+proxies = get_proxies()
+
 def get_prices(links):
-    try:
-        opts = webdriver.chrome.options.Options()
-        opts.add_argument('--headless')
-        driver = webdriver.Chrome(chrome_options=opts, headless=True)
-        results = []
-        for link in links:
-            driver.get(link)
-            try:
-                results.append(
-                    price_finder(
-                        url=link, bs=BS(driver.page_source,'lxml')
-                    )
-                )
-            except AttributeError:
-                results.append(price_finder(link))
-        driver.quit()
-        return results
-    except Exception as excpt:
-        driver.quit()
-        raise excpt
+    s = HTMLSession()
+    ret = []
+    bad_proxies = set()
+    for link in links:
+        print(link)
+        while True:
+            proxy = next(proxies)
+            while proxy in bad_proxies:   # skip proxies that failed earlier
+                proxy = next(proxies)
+            print(proxy)
+            try:
+                r = s.get(link,proxies={'http':proxy,'https':proxy})
+                print('got')
+                r.html.render()           # execute the page's JavaScript
+                print('rendered')
+                ret.append(price_finder(link,bs=BS(r.html.raw_html,'lxml')))
+                break
+            except Exception as e:
+                print(e)
+                print('!'+proxy)
+                bad_proxies.add(proxy)    # blacklist the failing proxy
+    s.close()
+    return ret
+
 if __name__ == "__main__":
     import saveto
     links = saveto.load('quad_links')
     products = get_prices(links)
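As a point of reference, requests (which requests_html wraps) takes proxies as a mapping from URL scheme to proxy address, and a bare 'host:port' value is treated as an HTTP proxy URL, which is exactly the shape the loop above builds. A quick standalone check of that format, not part of the commit:

    import requests

    proxy = '203.0.113.10:3128'  # placeholder address
    proxies = {'http': proxy, 'https': proxy}
    # httpbin echoes the egress IP, so a working proxy reports its own address.
    r = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=10)
    print(r.json())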

price_finder.py (11 lines changed)

@@ -1,6 +1,7 @@
 import urllib
 from fake_useragent import UserAgent
 from bs4 import BeautifulSoup as BS
+from requests_html import HTMLSession
 import re
 import datetime
@@ -33,12 +34,16 @@ class price_finder:
             "price":lambda page: page.find("meta",attrs = {"itemprop":"price"}).get("content")
         },
         "www.gearbest.com":{
-            "name":lambda page: page.find("div",attrs = {"class":"goods-info-top"}).find("h1").text,
-            "price":lambda page: page.find(id="unit_price").get("data-orgp")
+            "name":lambda page: re.sub(' {2,}|\n','',page.find("div",attrs = {"class":"goodsIntro_titleWrap"}).find("h1").text),
+            "price":lambda page: page.find("span",attrs={"class":"goodsIntro_price"}).text
         },
         "hobbyking.com":{
             "name":lambda page: page.find("h1",attrs={"class":"product-name"}).text,
             "price":lambda page: page.find("span",id = re.compile(r"product-price.*")).find("span",attrs={"class":"price"}).text
         },
+        'www.getfpv.com':{
+            'name': lambda page: re.sub(r'\\n|\n','', page.find("div",attrs={"class":"product-name"}).text),
+            'price': lambda page: re.sub(r'\\n|\n','', page.find('span',attrs={'id':re.compile('product-price.*')}).text)
+        }
     }
     def __init__(self,url,space_seperated_categories = 7,bs=None):
@@ -57,7 +62,7 @@ class price_finder:
     def _get_product_info_(self):
         funcs = price_finder.page_funcs[self.info_url.netloc]
-        print(self.url)
+        # print(self.url)
         return {
             "product_name":self.words.match(
