From 6a7061d1692670e0ff0d5c5eb66155297fbb4562 Mon Sep 17 00:00:00 2001
From: Raphael Roberts
Date: Thu, 24 Jan 2019 01:41:05 -0600
Subject: [PATCH] temporary fix while restoring uBlock

---
 restscrape/__init__.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
 restscrape/browser.py  |  3 ++-
 restscrape/proxy.py    |  2 +-
 restscrape/scraper.py  | 11 +++++++++++
 4 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/restscrape/__init__.py b/restscrape/__init__.py
index e69de29..78051f3 100644
--- a/restscrape/__init__.py
+++ b/restscrape/__init__.py
@@ -0,0 +1,45 @@
+if __name__ == "__main__":
+    import sys
+    sys.path.insert(0, '..')
+from restscrape.browser import browser as browser_class
+from restscrape.scraper import scraper
+from restscrape.proxy import create_proxy_iter
+import time
+US_PROXY_ITER = create_proxy_iter()
+
+def scrape(url, labels, max_tries=4, proxy_iter=None, wait_for=0, raw_tags=True):
+    '''Load url in a fresh browser and return the elements matched by each labeled XPath.'''
+    browser = browser_class(headless=False)
+    source = None
+    if proxy_iter is not None:
+        for trial in range(max_tries):
+            proxy_ip = next(proxy_iter)
+            try:
+                browser.restart_browser(start_page=url, proxy=proxy_ip)
+                if wait_for:
+                    time.sleep(wait_for)
+                source = browser.get_source()
+                break
+            except Exception as e:
+                print(e)
+                print(proxy_ip)
+                proxy_iter.blacklist(proxy_ip)
+    else:
+        for trial in range(max_tries):
+            try:
+                if trial == 0:
+                    browser.restart_browser(start_page=url)
+                else:
+                    browser.open(url)
+                if wait_for:
+                    time.sleep(wait_for)
+                source = browser.get_source()
+                break
+            except Exception as e:
+                print(e)
+
+    page = scraper(source)
+    return page.label_convert(labels, raw_tags=raw_tags)
+
+if __name__ == "__main__":
+    ret = scrape('https://www.google.com', {'imgs': '//img'}, wait_for=10)
\ No newline at end of file
diff --git a/restscrape/browser.py b/restscrape/browser.py
index d51881c..4fe59fa 100644
--- a/restscrape/browser.py
+++ b/restscrape/browser.py
@@ -22,7 +22,8 @@ class browser:
         if __name__ == "__main__":
             ext = os.path.join(os.getcwd(),'uBlock')
         else:
-            ext = os.path.join(os.path.dirname(__file__),'uBlock')
+            ext = os.path.abspath(os.path.join(os.path.dirname(__file__),'uBlock'))
+            print(ext)
         opts.setdefault('args',[]).extend(['--disable-extensions-except='+ext, '--load-extension='+ext])
         if proxy is not None:
             opts.setdefault('args',[]).extend(['--proxy-server='+proxy])
diff --git a/restscrape/proxy.py b/restscrape/proxy.py
index 1073775..0da65dc 100644
--- a/restscrape/proxy.py
+++ b/restscrape/proxy.py
@@ -26,7 +26,7 @@ class proxy_iter:
     def __len__(self):
         return len(self.proxy_set - self.bad)
 
-def create_proxy_iter(url):
+def create_proxy_iter(url=US_PROXY_URL):
     '''Create a proxy_iter from proxy_webpage'''
     resp = requests.get(url)
     resp.raise_for_status()
diff --git a/restscrape/scraper.py b/restscrape/scraper.py
index af63fa6..0503be7 100644
--- a/restscrape/scraper.py
+++ b/restscrape/scraper.py
@@ -17,6 +17,17 @@ class scraper:
         for row in table.xpath(rows_xpath)[0].findall('tr'):
             yield dict(zip(headers,(data.text for data in row.findall('td'))))
 
+    def label_convert(self, labels, raw_tags=False):
+        '''Map each label to the elements its XPath matches, serialized if raw_tags is set.'''
+        ret = {}
+        for label, xpath in labels.items():
+            res = self.xpath(xpath)
+            if raw_tags:
+                ret[label] = [etree.tostring(element, pretty_print=True) for element in res]
+            else:
+                ret[label] = [element.text for element in res]
+        return ret
+
 def proxy_scraper(page_source):
     page = scraper(page_source)
     yield from page.extract_table(table="//table[@id='proxylisttable']",header_xpath="./thead/tr",rows_xpath="./tbody")
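
Usage sketch for the new scrape() entry point, assuming the patch above is applied; the target URL and label set here are illustrative, not part of the patch. Note that importing restscrape fetches the proxy list immediately, since US_PROXY_ITER is built by create_proxy_iter() when __init__.py runs, and scrape() currently hard-codes headless=False, so each call spawns a visible browser window with the uBlock extension loaded.

    from restscrape import scrape, US_PROXY_ITER

    # Each label maps to an XPath; raw_tags=True returns the matched elements
    # serialized to HTML, raw_tags=False returns just their .text content.
    labels = {'imgs': '//img', 'headlines': '//h1'}

    # Rotate through the shared US proxy pool, blacklisting proxies that fail,
    # and wait 5 seconds for the page to render before grabbing the source.
    result = scrape('https://example.com', labels,
                    proxy_iter=US_PROXY_ITER, wait_for=5, raw_tags=False)

    for label, values in result.items():
        print(label, values)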