Browse Source

Removed unnecessary stuff from __init__.py in scraping sub package.

master
Raphael Roberts 7 years ago
parent
commit
cfd20b9035
  1. 45
      restscrape/scraping/__init__.py

45
restscrape/scraping/__init__.py

@@ -1,45 +0,0 @@
import time
from restscrape.scraping.browser import Browser
from restscrape.scraping.scraper import Scraper, proxy_scraper
from restscrape.scraping.proxy import ProxyIter, create_proxy_iter
# Shared proxy iterator, built once when this module is imported.
# NOTE(review): the name suggests it yields US-based proxies — confirm
# against create_proxy_iter() in restscrape.scraping.proxy.
US_PROXY_ITER = create_proxy_iter()
def scrape(url, labels, proxy_iter=None, wait_for=0, max_tries=4, raw_tags=True):
    """Load ``url`` in a browser and convert its page source using ``labels``.

    A non-headless ``Browser`` is created and up to ``max_tries`` attempts are
    made to load the page:

    * With ``proxy_iter``, every attempt restarts the browser on ``url``
      behind the next proxy taken from the iterator. A proxy whose attempt
      raises is reported via ``proxy_iter.blacklist``.
    * Without ``proxy_iter``, the first attempt restarts the browser on
      ``url`` and later attempts re-open ``url`` in the same browser.

    Args:
        url: Address of the page to load.
        labels: Passed unchanged to ``Scraper.label_convert``.
            NOTE(review): the demo call in this module passes a dict of
            names to XPath-looking strings — confirm the schema in Scraper.
        proxy_iter: Optional iterator of proxies that also exposes
            ``blacklist(proxy)``. ``None`` means no proxy is used.
        wait_for: Seconds to sleep after the page has been opened and before
            its source is read. A falsy value disables the wait.
        max_tries: Maximum number of load attempts.
        raw_tags: Forwarded to ``Scraper.label_convert``.

    Returns:
        A ``(converted, browser)`` tuple where ``converted`` is the result of
        ``Scraper.label_convert``. The browser is not closed here; it is
        handed back to the caller.

    Raises:
        RuntimeError: If no attempt produced a page source (every attempt
            raised, or ``max_tries`` is not positive). The last underlying
            exception, if any, is chained as the cause.

    Any exception raised by ``next(proxy_iter)`` propagates unchanged because
    the proxy is fetched outside the retry ``try`` block.
    """
    browser = Browser(headless=False)
    source = None
    loaded = False  # explicit flag: get_source() itself might return a falsy value
    last_error = None

    if proxy_iter is not None:
        for _ in range(max_tries):
            proxy_ip = next(proxy_iter)
            try:
                browser.restart_browser(start_page=url, proxy=proxy_ip)
                if wait_for:
                    time.sleep(wait_for)
                source = browser.get_source()
            except Exception as e:
                print(e)
                print(proxy_ip)
                # Report the failing proxy before moving on to the next one.
                proxy_iter.blacklist(proxy_ip)
                last_error = e
            else:
                loaded = True
                break
    else:
        for trial in range(max_tries):
            try:
                if trial == 0:
                    browser.restart_browser(start_page=url)
                else:
                    browser.open(url)
                # Honour wait_for here as well; previously it was silently
                # ignored whenever no proxy iterator was supplied.
                if wait_for:
                    time.sleep(wait_for)
                source = browser.get_source()
            except Exception as e:
                print(e)
                last_error = e
            else:
                loaded = True
                break

    if not loaded:
        # Fail with a clear error instead of the UnboundLocalError that
        # referencing an unassigned ``source`` used to produce.
        raise RuntimeError(
            "could not load {!r} after {} attempt(s)".format(url, max_tries)
        ) from last_error

    scraper = Scraper(source)
    return scraper.label_convert(labels, raw_tags=raw_tags), browser
if __name__ == "__main__":
    # Manual smoke run: scrape a test host without a proxy and keep the
    # returned values and browser around for interactive inspection.
    ret, browser = scrape(
        url="http://rlbrhost.ddns.net/",
        labels={"links": "//a"},
        raw_tags=False,
        wait_for=10,
    )
Loading…
Cancel
Save