From d0457d217ede7cfb0e239cceda5982b1d96674af Mon Sep 17 00:00:00 2001
From: Raphael Roberts
Date: Sun, 5 Jan 2020 18:47:52 -0600
Subject: [PATCH] Added the browsing and scraping components. Find a way to
 auto-download uBlock

---
 .gitignore                      |  2 +
 requirements.txt                |  4 ++
 rlbr_browser/__init__.py        |  0
 rlbr_browser/browser.py         | 79 +++++++++++++++++++++++++++++++++
 rlbr_browser/proxy.py           | 42 ++++++++++++++++++
 rlbr_browser/scraper.py         | 20 +++++++++
 rlbr_browser/test_stays_open.py |  5 +++
 7 files changed, 152 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 requirements.txt
 create mode 100644 rlbr_browser/__init__.py
 create mode 100644 rlbr_browser/browser.py
 create mode 100644 rlbr_browser/proxy.py
 create mode 100644 rlbr_browser/scraper.py
 create mode 100644 rlbr_browser/test_stays_open.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..aad875c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/.dir-locals.el
+__pycache__
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..2191931
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+websockets==6.0
+pyppeteer
+requests
+lxml
diff --git a/rlbr_browser/__init__.py b/rlbr_browser/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rlbr_browser/browser.py b/rlbr_browser/browser.py
new file mode 100644
index 0000000..54a99c4
--- /dev/null
+++ b/rlbr_browser/browser.py
@@ -0,0 +1,79 @@
+import asyncio
+import time
+from pathlib import Path
+
+
+import pyppeteer
+
+EVENT_LOOP = None
+
+
+def run(coroutine):
+    global EVENT_LOOP
+    if EVENT_LOOP is None:
+        EVENT_LOOP = asyncio.get_event_loop()
+    return EVENT_LOOP.run_until_complete(coroutine)
+
+
+class BrowserConnection:
+    def __init__(self, address=None, browser_handle: pyppeteer.browser.Browser = None):
+        if browser_handle is None:
+            self.browser_handle: pyppeteer.browser.Browser = run(
+                pyppeteer.launcher.connect(browserWSEndpoint=address)
+            )
+        elif address is None:
+            self.browser_handle = browser_handle
+
+        self.address = self.browser_handle.wsEndpoint
+
+    @property
+    def tabs(self):
+        return run(self.browser_handle.pages())
+
+    def create_tab(self):
+        return Tab(self.browser_handle)
+
+    def close(self):
+        run(self.browser_handle.close())
+
+
+class Tab:
+    def __init__(self, browser_handle):
+        self.browser_handle = browser_handle
+        self.page_handle = run(self.browser_handle.newPage())
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args):
+        run(self.page_handle.close())
+
+    def get_source(self):
+        return run(self.page_handle.content())
+
+    def open(self, url, wait_for=0):
+        run(self.page_handle.goto(url, waitUntil="domcontentloaded"))
+        time.sleep(wait_for)
+        return self.get_source()
+
+
+def start_browser(proxy=None, use_adblock=True, **launch_opts):
+    opts = launch_opts
+    opts["autoClose"] = False
+    if use_adblock:
+        adblock_path = Path(__file__).parent / "uBlock"
+        opts.setdefault("args", []).extend(
+            [
+                "--disable-extensions-except={}".format(adblock_path),
+                "--load-extension={}".format(adblock_path),
+            ]
+        )
+    if proxy is not None:
+        opts.setdefault("args", []).extend(["--proxy-server=" + proxy])
+    opts.setdefault("args", []).append("about:blank")
+    browser_handle = run(pyppeteer.launch(**opts))
+    return BrowserConnection(browser_handle=browser_handle)
+
+
+if __name__ == "__main__":
+    b = start_browser(headless=False)
diff --git a/rlbr_browser/proxy.py b/rlbr_browser/proxy.py
new file mode 100644
index 0000000..57f1b05
--- /dev/null
+++ b/rlbr_browser/proxy.py
@@ -0,0 +1,42 @@
+import requests
+
+from rlbr_browser.scraper import proxy_scraper
+
+US_PROXY_URL = "https://www.us-proxy.org/"
+
+
+class ProxyIter:
+    """Like itertools.cycle but
+uses a set underneath the hood and adds a method to remove an item from
+iteration (if proxy doesn't work etc)"""
+
+    def __init__(self, proxy_list):
+        self.proxy_set = set(proxy_list)
+        self.bad = set()
+        self.iterator = iter(self)
+
+    def __iter__(self):
+        for proxy in self.proxy_set:
+            if proxy not in self.bad:
+                yield proxy
+        self.proxy_set -= self.bad
+        if len(self.proxy_set) > 0:
+            yield from iter(self)
+
+    def __next__(self):
+        return next(self.iterator)
+
+    def blacklist(self, proxy):
+        self.bad.add(proxy)
+
+    def __len__(self):
+        return len(self.proxy_set - self.bad)
+
+
+def create_proxy_iter(url=US_PROXY_URL):
+    """Create a proxy_iter from proxy_webpage"""
+    resp = requests.get(url)
+    resp.raise_for_status()
+    return ProxyIter(
+        "{ip address}:{port}".format(**row) for row in proxy_scraper(resp.text)
+    )
diff --git a/rlbr_browser/scraper.py b/rlbr_browser/scraper.py
new file mode 100644
index 0000000..4603428
--- /dev/null
+++ b/rlbr_browser/scraper.py
@@ -0,0 +1,20 @@
+import lxml.etree
+
+
+class Scraper:
+    def __init__(self, page_source):
+        if not isinstance(page_source, lxml.etree._Element):
+            page_source = lxml.etree.HTML(page_source)
+        self.page_source = page_source
+
+    def xpath(self, expr):
+        return self.page_source.xpath(expr)
+
+
+def proxy_scraper(page_source):
+    page = Scraper(page_source)
+    yield from page.extract_table(  # FIXME: Scraper defines no extract_table()
+        table="//table[@id='proxylisttable']",
+        header_xpath="./thead/tr",
+        rows_xpath="./tbody",
+    )
diff --git a/rlbr_browser/test_stays_open.py b/rlbr_browser/test_stays_open.py
new file mode 100644
index 0000000..7abdcaa
--- /dev/null
+++ b/rlbr_browser/test_stays_open.py
@@ -0,0 +1,5 @@
+from browser import start_browser
+
+if __name__ == "__main__":
+    b = start_browser(headless=False)
+    print(b.address)