Browse Source

Temporary fix while restoring uBlock

master
Raphael Roberts 7 years ago
parent
commit
6a7061d169
  1. 40
      restscrape/__init__.py
  2. 3
      restscrape/browser.py
  3. 2
      restscrape/proxy.py
  4. 11
      restscrape/scraper.py

40
restscrape/__init__.py

@ -0,0 +1,40 @@
# Allow running this package module directly as a script: put the parent
# directory on sys.path so the absolute `restscrape.*` imports below resolve.
if __name__ == "__main__":
    import sys
    sys.path.insert(0,'..')
from restscrape.browser import browser as browser_class
from restscrape.scraper import scraper
from restscrape.proxy import create_proxy_iter
import time
# NOTE(review): this performs a network request (fetching the proxy list) at
# import time — presumably intended as a shared default iterator; confirm the
# import-time side effect is deliberate.
US_PROXY_ITER = create_proxy_iter()
def scrape(url, labels, max_tries=4, proxy_iter=None, wait_for=0, raw_tags=True):
    """Load *url* in a fresh browser and extract elements matching *labels*.

    Parameters:
        url: page to load.
        labels: dict mapping label -> XPath expression, forwarded to
            ``scraper.label_convert``.
        max_tries: number of fetch attempts before giving up.
        proxy_iter: optional proxy iterator yielding proxy addresses; a proxy
            that raises during the fetch is blacklisted on that iterator.
        wait_for: seconds to sleep after the page loads (lets dynamic
            content settle).
        raw_tags: when True, labels map to serialized element markup;
            otherwise to each element's text.

    Returns:
        dict mapping each label to a list of extracted values.

    Raises:
        RuntimeError: if no page source could be obtained in *max_tries*
            attempts.
    """
    browser = browser_class(headless=False)
    source = None
    if proxy_iter is not None:
        for _ in range(max_tries):
            proxy_ip = next(proxy_iter)
            try:
                browser.restart_browser(start_page=url, proxy=proxy_ip)
                if wait_for:
                    time.sleep(wait_for)
                source = browser.get_source()
                break
            except Exception as e:
                print(e)
                print(proxy_ip)
                # BUG FIX: original called blacklist(proxy) with an undefined
                # name; the failing address is proxy_ip.
                proxy_iter.blacklist(proxy_ip)
    else:
        for trial in range(max_tries):
            try:
                if trial == 0:
                    browser.restart_browser(start_page=url)
                else:
                    browser.open(url)
                if wait_for:
                    time.sleep(wait_for)
                # BUG FIX: original never captured the page source on this
                # path, so the scraper construction below raised NameError.
                source = browser.get_source()
                break
            except Exception as e:
                print(e)
    if source is None:
        raise RuntimeError('could not fetch %s in %d tries' % (url, max_tries))
    # Use a distinct local name so the imported scraper class is not shadowed.
    page = scraper(source)
    # BUG FIX: original omitted the required labels argument and hard-coded
    # raw_tags=True, ignoring the caller's raw_tags parameter.
    return page.label_convert(labels, raw_tags=raw_tags)
if __name__ == "__main__":
    # Manual smoke test: collect every <img> element from Google's front
    # page, waiting 10s for the page to settle.
    ret = scrape('https://www.google.com',{'imgs':'//img'},wait_for = 10)

3
restscrape/browser.py

@ -22,7 +22,8 @@ class browser:
if __name__ == "__main__":
ext = os.path.join(os.getcwd(),'uBlock')
else:
ext = os.path.join(os.path.dirname(__file__),'uBlock')
ext = os.path.abspath(os.path.join(os.path.dirname(__file__),'uBlock'))
print(ext)
opts.setdefault('args',[]).extend(['--disable-extensions-except='+ext, '--load-extension='+ext])
if proxy is not None:
opts.setdefault('args',[]).extend(['--proxy-server='+proxy])

2
restscrape/proxy.py

@ -26,7 +26,7 @@ class proxy_iter:
def __len__(self):
return len(self.proxy_set - self.bad)
def create_proxy_iter(url):
def create_proxy_iter(url = US_PROXY_URL):
'''Create a proxy_iter from proxy_webpage'''
resp = requests.get(url)
resp.raise_for_status()

11
restscrape/scraper.py

@ -17,6 +17,17 @@ class scraper:
for row in table.xpath(rows_xpath)[0].findall('tr'):
yield dict(zip(headers,(data.text for data in row.findall('td'))))
def label_convert(self,labels,raw_tags = False):
ret = {}
for label,xpath in labels.items():
res = self.xpath(xpath)
if raw_tags:
ret[label] = list(etree.tostring(element, pretty_print=True) for element in res)
else:
ret[label] = list(element.text for element in res)
return ret
def proxy_scraper(page_source):
    """Yield one dict per proxy row found in *page_source*.

    Parses the proxy-list table (id ``proxylisttable``) using the
    scraper.extract_table helper and yields its rows unchanged.
    """
    yield from scraper(page_source).extract_table(
        table="//table[@id='proxylisttable']",
        header_xpath="./thead/tr",
        rows_xpath="./tbody",
    )

Loading…
Cancel
Save