|
|
|
@ -2,7 +2,7 @@ if __name__ == "__main__": |
|
|
|
import sys |
|
|
|
sys.path.insert(0,'..') |
|
|
|
from restscrape.browser import browser as browser_class |
|
|
|
from restscrape.scraper import scraper |
|
|
|
from restscrape.scraper import scraper as scraper_class |
|
|
|
from restscrape.proxy import create_proxy_iter |
|
|
|
import time |
|
|
|
US_PROXY_ITER = create_proxy_iter() |
|
|
|
@ -30,11 +30,13 @@ def scrape(url,labels,max_tries=4,proxy_iter = None,wait_for = 0,raw_tags = True |
|
|
|
browser.restart_browser(start_page = url) |
|
|
|
else: |
|
|
|
browser.open(url) |
|
|
|
source = browser.get_source() |
|
|
|
break |
|
|
|
except Exception as e: |
|
|
|
print(e) |
|
|
|
|
|
|
|
scraper = scraper(source) |
|
|
|
return scraper.label_convert(raw_tags=True) |
|
|
|
scraper = scraper_class(source) |
|
|
|
return scraper.label_convert(labels,raw_tags=raw_tags),browser |
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
ret = scrape('https://www.google.com',{'imgs':'//img'},wait_for = 10) |
|
|
|
ret,browser = scrape('http://rlbrhost.ddns.net/',{'links':'//a'},wait_for = 10,raw_tags=False) |