
stopped using Selenium and switched to rotating proxies + updated the tags sought after

master
Raphael Roberts 8 years ago
parent commit 00fb6d11d2
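In outline, the commit swaps headless-Chrome page loads for requests_html sessions routed through free HTTPS proxies, cycling through the proxy list with itertools.cycle and blacklisting proxies that fail. A minimal sketch of that rotation pattern, using placeholder TEST-NET addresses rather than anything scraped from free-proxy-list.net:

    from itertools import cycle

    proxies = cycle(['203.0.113.10:3128', '203.0.113.11:8080'])  # placeholders
    bad_proxies = set()

    def next_good_proxy():
        # Walk the cycle, skipping proxies that failed earlier. Note that this
        # loops forever once every proxy has been blacklisted.
        proxy = next(proxies)
        while proxy in bad_proxies:
            proxy = next(proxies)
        return proxy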
  1. batch_process.py (79 lines changed)
  2. price_finder.py (11 lines changed)

batch_process.py (79 lines changed)

@@ -1,28 +1,57 @@
-from selenium import webdriver
 from price_finder import price_finder,BS
+from itertools import cycle
+from requests_html import HTMLSession
+from ipaddress import ip_address
+
+def get_proxies(country='United States'):
+    # Scrape free-proxy-list.net and return an endless cycle of "ip:port"
+    # strings for HTTPS-capable proxies in the requested country.
+    ses = HTMLSession()
+    r = ses.get('https://free-proxy-list.net/')
+    page = BS(r.html.raw_html,'lxml')
+    table = page.find(id='proxylisttable')
+    headers,*rows = table.find_all('tr')
+    headers = list(tag.text.lower() for tag in headers.find_all('th'))
+    ip,port = headers.index('ip address'),headers.index('port')
+    https_support = headers.index('https')
+    country_id = headers.index('country')
+    proxies = []
+    for row in rows:
+        if row.find('td'):
+            tr = list(tag.text for tag in row.find_all('td'))
+            try:
+                try:
+                    ip_address(tr[ip])                  # validate the address
+                    assert 0 <= int(tr[port]) < 2**16   # validate the port
+                    if tr[https_support] == "yes" and tr[country_id] == country:
+                        proxies.append('{}:{}'.format(tr[ip],tr[port]))
+                except (ValueError,AssertionError):
+                    pass                                # skip malformed rows
+            except Exception as e:
+                print(row)                              # show the offending row
+                raise e
+    return cycle(proxies)
+
+proxies = get_proxies()
+
 def get_prices(links):
-    try:
-        opts = webdriver.chrome.options.Options()
-        opts.add_argument('--headless')
-        driver = webdriver.Chrome(chrome_options=opts, headless=True)
-        results = []
-        for link in links:
-            driver.get(link)
-            try:
-                results.append(
-                    price_finder(
-                        url=link, bs=BS(driver.page_source,'lxml')
-                    )
-                )
-            except AttributeError:
-                results.append(price_finder(link))
-        driver.quit()
-        return results
-    except Exception as excpt:
-        driver.quit()
-        raise excpt
+    s = HTMLSession()
+    ret = []
+    bad_proxies = set()
+    for link in links:
+        print(link)
+        while True:
+            proxy = next(proxies)
+            while proxy in bad_proxies:   # skip proxies that failed earlier
+                proxy = next(proxies)
+            print(proxy)
+            try:
+                r = s.get(link,proxies={'http':proxy,'https':proxy})
+                print('got')
+                r.html.render()           # execute the page's JavaScript
+                print('rendered')
+                ret.append(price_finder(link,bs=BS(r.html.raw_html,'lxml')))
+                break
+            except Exception as e:
+                print(e)
+                print('!'+proxy)
+                bad_proxies.add(proxy)    # blacklist the failing proxy
+    s.close()
+    return ret
+
 if __name__ == "__main__":
     import saveto
     links = saveto.load('quad_links')
     products = get_prices(links)
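As a point of reference, requests (which requests_html wraps) takes proxies as a mapping from URL scheme to proxy address, and a bare 'host:port' value is treated as an HTTP proxy URL, which is exactly the shape the loop above builds. A quick standalone check of that format, not part of the commit:

    import requests

    proxy = '203.0.113.10:3128'  # placeholder address
    proxies = {'http': proxy, 'https': proxy}
    # httpbin echoes the egress IP, so a working proxy reports its own address.
    r = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=10)
    print(r.json())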

price_finder.py (11 lines changed)

@@ -1,6 +1,7 @@
 import urllib
 from fake_useragent import UserAgent
 from bs4 import BeautifulSoup as BS
+from requests_html import HTMLSession
 import re
 import datetime
@@ -33,12 +34,16 @@ class price_finder:
             "price":lambda page: page.find("meta",attrs = {"itemprop":"price"}).get("content")
         },
         "www.gearbest.com":{
-            "name":lambda page: page.find("div",attrs = {"class":"goods-info-top"}).find("h1").text,
-            "price":lambda page: page.find(id="unit_price").get("data-orgp")
+            "name":lambda page: re.sub(' {2,}|\n','',page.find("div",attrs = {"class":"goodsIntro_titleWrap"}).find("h1").text),
+            "price":lambda page: page.find("span",attrs={"class":"goodsIntro_price"}).text
         },
         "hobbyking.com":{
             "name":lambda page: page.find("h1",attrs={"class":"product-name"}).text,
             "price":lambda page: page.find("span",id = re.compile(r"product-price.*")).find("span",attrs={"class":"price"}).text
         },
+        'www.getfpv.com':{
+            'name': lambda page: re.sub(r'\\n|\n','', page.find("div",attrs={"class":"product-name"}).text),
+            'price': lambda page: re.sub(r'\\n|\n','', page.find('span',attrs={'id':re.compile('product-price.*')}).text)
+        }
     }
     def __init__(self,url,space_seperated_categories = 7,bs=None):
@@ -57,7 +62,7 @@ class price_finder:
     def _get_product_info_(self):
         funcs = price_finder.page_funcs[self.info_url.netloc]
-        print(self.url)
+        # print(self.url)
         return {
             "product_name":self.words.match(
