Browse Source

Added a function, `scrape`, which will hopefully be the entry point for everything.

master
Raphael Roberts 7 years ago
parent
commit
260384397a
  1. 10
      restscrape/__init__.py
  2. 2
      restscrape/browser.py

10
restscrape/__init__.py

@ -2,7 +2,7 @@ if __name__ == "__main__":
import sys
sys.path.insert(0,'..')
from restscrape.browser import browser as browser_class
from restscrape.scraper import scraper
from restscrape.scraper import scraper as scraper_class
from restscrape.proxy import create_proxy_iter
import time
US_PROXY_ITER = create_proxy_iter()
@ -30,11 +30,13 @@ def scrape(url,labels,max_tries=4,proxy_iter = None,wait_for = 0,raw_tags = True
browser.restart_browser(start_page = url)
else:
browser.open(url)
source = browser.get_source()
break
except Exception as e:
print(e)
scraper = scraper(source)
return scraper.label_convert(raw_tags=True)
scraper = scraper_class(source)
return scraper.label_convert(labels,raw_tags=raw_tags),browser
if __name__ == "__main__":
ret = scrape('https://www.google.com',{'imgs':'//img'},wait_for = 10)
ret,browser = scrape('http://rlbrhost.ddns.net/',{'links':'//a'},wait_for = 10,raw_tags=False)

2
restscrape/browser.py

@ -23,7 +23,7 @@ class browser:
ext = os.path.join(os.getcwd(),'uBlock')
else:
ext = os.path.abspath(os.path.join(os.path.dirname(__file__),'uBlock'))
print(ext)
# print(ext)
opts.setdefault('args',[]).extend(['--disable-extensions-except='+ext, '--load-extension='+ext])
if proxy is not None:
opts.setdefault('args',[]).extend(['--proxy-server='+proxy])

Loading…
Cancel
Save