From 32f94037dd76389ede74d809af01f60e4f2fde97 Mon Sep 17 00:00:00 2001 From: Raphael Roberts Date: Thu, 24 Jan 2019 02:08:37 -0600 Subject: [PATCH 1/2] fixed problem with import --- restscrape/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/restscrape/scraper.py b/restscrape/scraper.py index 0503be7..c75b98b 100644 --- a/restscrape/scraper.py +++ b/restscrape/scraper.py @@ -22,7 +22,7 @@ class scraper: for label,xpath in labels.items(): res = self.xpath(xpath) if raw_tags: - ret[label] = list(etree.tostring(element, pretty_print=True) for element in res) + ret[label] = list(lxml.etree.tostring(element, pretty_print=True) for element in res) else: ret[label] = list(element.text for element in res) From 260384397ac629b8578cf5eb63416384207fe417 Mon Sep 17 00:00:00 2001 From: Raphael Roberts Date: Thu, 24 Jan 2019 02:09:34 -0600 Subject: [PATCH 2/2] added a function scrape, which will hopefully be the entry point for everything --- restscrape/__init__.py | 10 ++++++---- restscrape/browser.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/restscrape/__init__.py b/restscrape/__init__.py index 78051f3..71d3433 100644 --- a/restscrape/__init__.py +++ b/restscrape/__init__.py @@ -2,7 +2,7 @@ if __name__ == "__main__": import sys sys.path.insert(0,'..') from restscrape.browser import browser as browser_class -from restscrape.scraper import scraper +from restscrape.scraper import scraper as scraper_class from restscrape.proxy import create_proxy_iter import time US_PROXY_ITER = create_proxy_iter() @@ -30,11 +30,13 @@ def scrape(url,labels,max_tries=4,proxy_iter = None,wait_for = 0,raw_tags = True browser.restart_browser(start_page = url) else: browser.open(url) + source = browser.get_source() + break except Exception as e: print(e) - scraper = scraper(source) - return scraper.label_convert(raw_tags=True) + scraper = scraper_class(source) + return scraper.label_convert(labels,raw_tags=raw_tags),browser if __name__ == "__main__": - ret = scrape('https://www.google.com',{'imgs':'//img'},wait_for = 10) \ No newline at end of file + ret,browser = scrape('http://rlbrhost.ddns.net/',{'links':'//a'},wait_for = 10,raw_tags=False) \ No newline at end of file diff --git a/restscrape/browser.py b/restscrape/browser.py index 4fe59fa..8699e62 100644 --- a/restscrape/browser.py +++ b/restscrape/browser.py @@ -23,7 +23,7 @@ class browser: ext = os.path.join(os.getcwd(),'uBlock') else: ext = os.path.abspath(os.path.join(os.path.dirname(__file__),'uBlock')) - print(ext) + # print(ext) opts.setdefault('args',[]).extend(['--disable-extensions-except='+ext, '--load-extension='+ext]) if proxy is not None: opts.setdefault('args',[]).extend(['--proxy-server='+proxy])