From cfd20b9035290d7be28f1e727c57a28a0179eaea Mon Sep 17 00:00:00 2001
From: Raphael Roberts
Date: Mon, 8 Jul 2019 13:03:25 -0500
Subject: [PATCH] Removed unnecessary stuff from __init__.py in scraping sub
 package.

---
 restscrape/scraping/__init__.py | 45 ---------------------------------
 1 file changed, 45 deletions(-)

diff --git a/restscrape/scraping/__init__.py b/restscrape/scraping/__init__.py
index 868aeeb..e69de29 100644
--- a/restscrape/scraping/__init__.py
+++ b/restscrape/scraping/__init__.py
@@ -1,45 +0,0 @@
-import time
-
-from restscrape.scraping.browser import Browser
-from restscrape.scraping.scraper import Scraper, proxy_scraper
-from restscrape.scraping.proxy import ProxyIter, create_proxy_iter
-
-US_PROXY_ITER = create_proxy_iter()
-
-
-def scrape(url, labels, proxy_iter=None, wait_for=0, max_tries=4, raw_tags=True):
-    browser = Browser(headless=False)
-    if proxy_iter is not None:
-        for trial in range(max_tries):
-            proxy_ip = next(proxy_iter)
-            try:
-                browser.restart_browser(start_page=url, proxy=proxy_ip)
-                if wait_for:
-                    time.sleep(wait_for)
-                source = browser.get_source()
-                break
-            except Exception as e:
-                print(e)
-                print(proxy_ip)
-                proxy_iter.blacklist(proxy_ip)
-
-    else:
-        for trial in range(max_tries):
-            try:
-                if trial == 0:
-                    browser.restart_browser(start_page=url)
-                else:
-                    browser.open(url)
-                source = browser.get_source()
-                break
-            except Exception as e:
-                print(e)
-
-    scraper = Scraper(source)
-    return scraper.label_convert(labels, raw_tags=raw_tags), browser
-
-
-if __name__ == "__main__":
-    ret, browser = scrape(
-        "http://rlbrhost.ddns.net/", {"links": "//a"}, wait_for=10, raw_tags=False
-    )
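
For reference, the one-shot fetch that the removed scrape() helper provided can presumably
still be composed from the sub-modules directly. A minimal sketch, assuming the Browser and
Scraper methods exercised by the deleted code (restart_browser, get_source, label_convert)
are unchanged; scrape_once is a hypothetical name, not part of this patch:

    import time

    from restscrape.scraping.browser import Browser
    from restscrape.scraping.scraper import Scraper


    def scrape_once(url, labels, wait_for=0, raw_tags=True):
        # Start a fresh (non-headless) browser session on the target page,
        # mirroring what the deleted helper did in its no-proxy branch.
        browser = Browser(headless=False)
        browser.restart_browser(start_page=url)
        if wait_for:
            time.sleep(wait_for)  # give dynamically rendered content time to load
        source = browser.get_source()
        # Convert the labelled XPath expressions against the fetched page source.
        scraper = Scraper(source)
        return scraper.label_convert(labels, raw_tags=raw_tags), browser

Usage would mirror the deleted __main__ block, e.g.:

    ret, browser = scrape_once(
        "http://rlbrhost.ddns.net/", {"links": "//a"}, wait_for=10, raw_tags=False
    )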