|
|
|
# NOTE: a stray diff hunk header ("@ -0,0 +1,40 @@") was pasted here by
# mistake — patch residue, removed so the file is valid Python.
|
|
|
if __name__ == "__main__":
    # When run directly (rather than imported as part of the package),
    # put the parent directory on the import path so the absolute
    # `restscrape.*` imports below resolve.
    import sys
    sys.path.insert(0,'..')
|
|
|
from restscrape.browser import browser as browser_class |
|
|
|
from restscrape.scraper import scraper |
|
|
|
from restscrape.proxy import create_proxy_iter |
|
|
|
import time |
|
|
|
# Module-level proxy-rotation iterator available to callers of scrape().
# NOTE(review): built at import time — if create_proxy_iter() does network
# or file I/O, that side effect happens on every import; confirm intended.
US_PROXY_ITER = create_proxy_iter()
|
|
|
|
|
|
|
def scrape(url, labels, max_tries=4, proxy_iter=None, wait_for=0, raw_tags=True):
    """Load `url` in a fresh browser, retrying on failure, and return the
    label-converted scrape result.

    Args:
        url: Page to load.
        labels: Mapping of label name -> selector (e.g. {'imgs': '//img'}).
            NOTE(review): this argument is currently never forwarded to the
            scraper — confirm whether `scraper` should receive it.
        max_tries: Maximum number of fetch attempts.
        proxy_iter: Optional proxy iterator. When given, each attempt uses
            the next proxy and a failing proxy is blacklisted.
        wait_for: Seconds to sleep after navigation before reading the page
            source (lets JS-rendered content settle).
        raw_tags: Forwarded to scraper.label_convert().

    Returns:
        The value of scraper.label_convert(raw_tags=raw_tags) on the
        fetched page source.

    Raises:
        RuntimeError: if no page source could be obtained after
            `max_tries` attempts.
    """
    browser = browser_class(headless=False)
    source = None  # sentinel: set only on a successful fetch

    if proxy_iter is not None:
        for _ in range(max_tries):
            proxy_ip = next(proxy_iter)
            try:
                browser.restart_browser(start_page=url, proxy=proxy_ip)
                if wait_for:
                    time.sleep(wait_for)
                source = browser.get_source()
                break
            except Exception as e:
                print(e)
                print(proxy_ip)
                # BUG FIX: original passed the undefined name `proxy`,
                # which raised NameError instead of blacklisting.
                proxy_iter.blacklist(proxy_ip)
    else:
        for trial in range(max_tries):
            try:
                # First attempt starts the browser; retries reuse it.
                if trial == 0:
                    browser.restart_browser(start_page=url)
                else:
                    browser.open(url)
                if wait_for:
                    time.sleep(wait_for)
                # BUG FIX: the original never read the source (and never
                # broke out of the retry loop) in this branch, leaving
                # `source` unbound for the code below.
                source = browser.get_source()
                break
            except Exception as e:
                print(e)

    if source is None:
        # BUG FIX: previously fell through to an UnboundLocalError when
        # every attempt failed; raise a clear, catchable error instead.
        raise RuntimeError(f"failed to fetch {url} after {max_tries} tries")

    # Use a distinct name so the imported `scraper` factory isn't shadowed.
    page_scraper = scraper(source)
    # BUG FIX: forward the caller's raw_tags instead of hard-coding True.
    return page_scraper.label_convert(raw_tags=raw_tags)
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: grab all <img> elements from Google, giving the
    # page 10 seconds to render before reading the source.
    ret = scrape('https://www.google.com',{'imgs':'//img'},wait_for = 10)