diff --git a/restscrape/__init__.py b/restscrape/__init__.py index 78051f3..71d3433 100644 --- a/restscrape/__init__.py +++ b/restscrape/__init__.py @@ -2,7 +2,7 @@ if __name__ == "__main__": import sys sys.path.insert(0,'..') from restscrape.browser import browser as browser_class -from restscrape.scraper import scraper +from restscrape.scraper import scraper as scraper_class from restscrape.proxy import create_proxy_iter import time US_PROXY_ITER = create_proxy_iter() @@ -30,11 +30,13 @@ def scrape(url,labels,max_tries=4,proxy_iter = None,wait_for = 0,raw_tags = True browser.restart_browser(start_page = url) else: browser.open(url) + source = browser.get_source() + break except Exception as e: print(e) - scraper = scraper(source) - return scraper.label_convert(raw_tags=True) + scraper = scraper_class(source) + return scraper.label_convert(labels,raw_tags=raw_tags),browser if __name__ == "__main__": - ret = scrape('https://www.google.com',{'imgs':'//img'},wait_for = 10) \ No newline at end of file + ret,browser = scrape('http://rlbrhost.ddns.net/',{'links':'//a'},wait_for = 10,raw_tags=False) \ No newline at end of file diff --git a/restscrape/browser.py b/restscrape/browser.py index 4fe59fa..8699e62 100644 --- a/restscrape/browser.py +++ b/restscrape/browser.py @@ -23,7 +23,7 @@ class browser: ext = os.path.join(os.getcwd(),'uBlock') else: ext = os.path.abspath(os.path.join(os.path.dirname(__file__),'uBlock')) - print(ext) + # print(ext) opts.setdefault('args',[]).extend(['--disable-extensions-except='+ext, '--load-extension='+ext]) if proxy is not None: opts.setdefault('args',[]).extend(['--proxy-server='+proxy]) diff --git a/restscrape/scraper.py b/restscrape/scraper.py index 0503be7..c75b98b 100644 --- a/restscrape/scraper.py +++ b/restscrape/scraper.py @@ -22,7 +22,7 @@ class scraper: for label,xpath in labels.items(): res = self.xpath(xpath) if raw_tags: - ret[label] = list(etree.tostring(element, pretty_print=True) for element in res) + ret[label] = list(lxml.etree.tostring(element, pretty_print=True) for element in res) else: ret[label] = list(element.text for element in res)