@@ -1,45 +0,0 @@
import time

from restscrape.scraping.browser import Browser
from restscrape.scraping.scraper import Scraper, proxy_scraper
from restscrape.scraping.proxy import ProxyIter, create_proxy_iter

# Shared proxy iterator, created once at import time.
US_PROXY_ITER = create_proxy_iter()


def scrape(url, labels, proxy_iter=None, wait_for=0, max_tries=4, raw_tags=True):
    """Open ``url`` in a browser, optionally through rotating proxies, and
    return the elements matching ``labels`` together with the live browser."""
    browser = Browser(headless=False)
    source = None  # set once a page load succeeds

    if proxy_iter is not None:
        # Proxied path: try up to max_tries proxies, blacklisting each one that fails.
        for trial in range(max_tries):
            proxy_ip = next(proxy_iter)
            try:
                browser.restart_browser(start_page=url, proxy=proxy_ip)
                if wait_for:
                    # Give dynamic content time to render before grabbing the source.
                    time.sleep(wait_for)
                source = browser.get_source()
                break
            except Exception as e:
                print(e)
                print(proxy_ip)
                proxy_iter.blacklist(proxy_ip)
    else:
        # Direct path: start the browser on the first attempt, then retry by reopening.
        for trial in range(max_tries):
            try:
                if trial == 0:
                    browser.restart_browser(start_page=url)
                else:
                    browser.open(url)
                if wait_for:
                    time.sleep(wait_for)
                source = browser.get_source()
                break
            except Exception as e:
                print(e)

    if source is None:
        raise RuntimeError(f"Could not load {url} after {max_tries} attempts")

    scraper = Scraper(source)
    return scraper.label_convert(labels, raw_tags=raw_tags), browser


if __name__ == "__main__":
    ret, browser = scrape(
        "http://rlbrhost.ddns.net/", {"links": "//a"}, wait_for=10, raw_tags=False
    )
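
    # Sketch of the proxied path (commented out): assumes create_proxy_iter()
    # actually yields reachable proxy addresses and that ProxyIter supports
    # next() and .blacklist(), as scrape() uses above.
    # proxied_ret, proxied_browser = scrape(
    #     "http://rlbrhost.ddns.net/",
    #     {"links": "//a"},
    #     proxy_iter=US_PROXY_ITER,
    #     wait_for=10,
    #     raw_tags=False,
    # )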