Added the browsing and scraping components.

TODO: Find a way to auto-download uBlock

Branch: master
Author: Raphael Roberts, 6 years ago
Commit: d0457d217e
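The TODO above could be closed with a small fetch-and-unpack step run before launch. A minimal sketch, assuming the GitHub releases API and the uBlock0_*.chromium.zip asset naming used by gorhill/uBlock (the asset name and the zip's internal folder are assumptions about the release layout, and ensure_ublock is a hypothetical helper, not part of this commit); the target matches the uBlock directory that browser.py looks for:

import io
import zipfile
from pathlib import Path

import requests

LATEST = "https://api.github.com/repos/gorhill/uBlock/releases/latest"

def ensure_ublock(target=None):
    """Download and unpack the latest uBlock Origin Chromium build if missing."""
    target = target or Path(__file__).parent / "uBlock"
    if target.exists():
        return target
    release = requests.get(LATEST).json()
    # Pick the Chromium zip among the release assets (named like
    # "uBlock0_1.x.y.chromium.zip" -- an assumption about asset naming).
    asset = next(a for a in release["assets"] if "chromium" in a["name"])
    archive = requests.get(asset["browser_download_url"])
    archive.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(archive.content)) as zf:
        zf.extractall(target.parent)
    # The zip is assumed to unpack into "uBlock0.chromium"; rename it into place.
    (target.parent / "uBlock0.chromium").rename(target)
    return target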
Files changed:

1. .gitignore (+2)
2. requirements.txt (+4)
3. rlbr_browser/__init__.py (+0)
4. rlbr_browser/browser.py (+79)
5. rlbr_browser/proxy.py (+42)
6. rlbr_browser/scraper.py (+20)
7. rlbr_browser/test_stays_open.py (+5)

.gitignore (+2)

@@ -0,0 +1,2 @@
/.dir-locals.el
__pycache__

requirements.txt (+4)

@@ -0,0 +1,4 @@
websockets==6.0
pyppeteer
requests
lxml

rlbr_browser/__init__.py (+0)

rlbr_browser/browser.py (+79)

@@ -0,0 +1,79 @@
import asyncio
import time
from pathlib import Path

import pyppeteer

EVENT_LOOP = None


def run(coroutine):
    """Run a coroutine to completion on a single shared event loop."""
    global EVENT_LOOP
    if EVENT_LOOP is None:
        EVENT_LOOP = asyncio.get_event_loop()
    return EVENT_LOOP.run_until_complete(coroutine)


class BrowserConnection:
    def __init__(self, address=None, browser_handle: pyppeteer.browser.Browser = None):
        if browser_handle is None:
            # No handle given: attach to a running browser over DevTools.
            self.browser_handle: pyppeteer.browser.Browser = run(
                pyppeteer.launcher.connect(browserWSEndpoint=address)
            )
        else:
            # An existing handle wins over an address.
            self.browser_handle = browser_handle
        self.address = self.browser_handle.wsEndpoint

    @property
    def tabs(self):
        return run(self.browser_handle.pages())

    def create_tab(self):
        return Tab(self.browser_handle)

    def close(self):
        run(self.browser_handle.close())


class Tab:
    """Context manager around a pyppeteer page; closes the page on exit."""

    def __init__(self, browser_handle):
        self.browser_handle = browser_handle
        self.page_handle = run(self.browser_handle.newPage())

    def __enter__(self):
        return self

    def __exit__(self, *args):
        run(self.page_handle.close())

    def get_source(self):
        return run(self.page_handle.content())

    def open(self, url, wait_for=0):
        """Navigate to url, optionally wait for late-loading content, return HTML."""
        run(self.page_handle.goto(url, waitUntil="domcontentloaded"))
        time.sleep(wait_for)
        return self.get_source()


def start_browser(proxy=None, use_adblock=True, **launch_opts):
    opts = launch_opts
    opts["autoClose"] = False  # keep the browser alive after the script exits
    if use_adblock:
        # Expects an unpacked uBlock build next to this module; see the TODO
        # in the commit message about automating the download.
        adblock_path = Path(__file__).parent / "uBlock"
        opts.setdefault("args", []).extend(
            [
                "--disable-extensions-except={}".format(adblock_path),
                "--load-extension={}".format(adblock_path),
            ]
        )
    if proxy is not None:
        opts.setdefault("args", []).extend(["--proxy-server=" + proxy])
    opts.setdefault("args", []).append("about:blank")
    browser_handle = run(pyppeteer.launch(**opts))
    return BrowserConnection(browser_handle=browser_handle)


if __name__ == "__main__":
    b = start_browser(headless=False)
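Taken together, start_browser plus the Tab context manager make a small synchronous facade over pyppeteer. A usage sketch (the URL is only an example):

from rlbr_browser.browser import start_browser

browser = start_browser(use_adblock=False, headless=True)
with browser.create_tab() as tab:
    html = tab.open("https://example.com", wait_for=1)  # HTML after a 1s settle
print(browser.address)  # ws:// endpoint another process could connect to
browser.close()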

rlbr_browser/proxy.py (+42)

@@ -0,0 +1,42 @@
import requests

from .scraper import proxy_scraper

US_PROXY_URL = "https://www.us-proxy.org/"


class ProxyIter:
    """Like itertools.cycle, but uses a set under the hood and adds a method
    to remove an item from iteration (if a proxy stops working, etc.)."""

    def __init__(self, proxy_list):
        self.proxy_set = set(proxy_list)
        self.bad = set()
        self.iterator = iter(self)

    def __iter__(self):
        for proxy in self.proxy_set:
            if proxy not in self.bad:
                yield proxy
        # One pass done: drop blacklisted proxies, then cycle while any remain.
        self.proxy_set -= self.bad
        if len(self.proxy_set) > 0:
            yield from iter(self)

    def __next__(self):
        return next(self.iterator)

    def blacklist(self, proxy):
        self.bad.add(proxy)

    def __len__(self):
        return len(self.proxy_set - self.bad)


def create_proxy_iter(url=US_PROXY_URL):
    """Create a ProxyIter from a proxy listing webpage."""
    resp = requests.get(url)
    resp.raise_for_status()
    # Rows come from the scraped table, keyed by lowercased column headers
    # ("IP Address" and "Port" on us-proxy.org).
    return ProxyIter(
        "{ip address}:{port}".format(**row) for row in proxy_scraper(resp.text)
    )
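The iterator is meant to be consumed with next() while failing proxies get blacklisted, so each full pass shrinks the pool. A sketch of the intended loop (fetch_through is a hypothetical request helper; error handling simplified):

proxies = create_proxy_iter()
while len(proxies):
    proxy = next(proxies)
    try:
        page = fetch_through(proxy)  # hypothetical: make a request via this proxy
        break
    except OSError:
        proxies.blacklist(proxy)  # dropped from later passes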

rlbr_browser/scraper.py (+20)

@@ -0,0 +1,20 @@
import lxml.etree

class Scraper:
    def __init__(self, page_source):
        if not isinstance(page_source, lxml.etree._Element):
            page_source = lxml.etree.HTML(page_source)
        self.page_source = page_source

    def xpath(self, expr):
        return self.page_source.xpath(expr)

    def extract_table(self, table, header_xpath, rows_xpath):
        # Assumed helper: proxy_scraper calls this but the commit never
        # defines it. Yields a dict per body row, keyed by lowercased headers.
        (table_el,) = self.xpath(table)
        header_cells = table_el.xpath(header_xpath + "/th")
        headers = ["".join(th.itertext()).strip().lower() for th in header_cells]
        for row in table_el.xpath(rows_xpath + "/tr"):
            cells = ["".join(td.itertext()).strip() for td in row.xpath("./td")]
            yield dict(zip(headers, cells))

def proxy_scraper(page_source):
    page = Scraper(page_source)
    yield from page.extract_table(
        table="//table[@id='proxylisttable']",
        header_xpath="./thead/tr",
        rows_xpath="./tbody",
    )
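A quick sanity check of the scraper against a static snippet (toy markup, not the live us-proxy.org page):

from rlbr_browser.scraper import proxy_scraper

html = """<table id="proxylisttable">
  <thead><tr><th>IP Address</th><th>Port</th></tr></thead>
  <tbody><tr><td>1.2.3.4</td><td>8080</td></tr></tbody>
</table>"""
print(list(proxy_scraper(html)))  # [{'ip address': '1.2.3.4', 'port': '8080'}]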

rlbr_browser/test_stays_open.py (+5)

@@ -0,0 +1,5 @@
from browser import start_browser

if __name__ == "__main__":
    b = start_browser(headless=False)
    print(b.address)