From ab3ee3a901394fb327f1eddf3f84174f14affd5a Mon Sep 17 00:00:00 2001
From: Raphael Roberts
Date: Fri, 17 May 2019 13:26:28 -0500
Subject: [PATCH] Pascal cased the classes, started on page model, and fixed
 Browser class

---
 requirements.txt                              |  4 +-
 restscrape/__init__.py                        |  2 +
 restscrape/migrations/0001_initial.py         | 22 ++++++
 .../migrations/0002_auto_20190517_1311.py     | 18 +++++
 restscrape/models.py                          | 19 ++++++
 restscrape/scraping/__init__.py               | 10 +--
 restscrape/scraping/browser.py                | 68 +++++++++----------
 restscrape/scraping/proxy.py                  |  4 +-
 restscrape/scraping/scraper.py                |  4 +-
 9 files changed, 107 insertions(+), 44 deletions(-)
 create mode 100644 restscrape/migrations/0001_initial.py
 create mode 100644 restscrape/migrations/0002_auto_20190517_1311.py

diff --git a/requirements.txt b/requirements.txt
index d9601af..e20ce3a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
+websockets==6.0
 pyppeteer
 requests
-lxml
\ No newline at end of file
+lxml
+django
diff --git a/restscrape/__init__.py b/restscrape/__init__.py
index e69de29..fe18976 100644
--- a/restscrape/__init__.py
+++ b/restscrape/__init__.py
@@ -0,0 +1,2 @@
+from restscrape import migrations
+from restscrape import scraping
diff --git a/restscrape/migrations/0001_initial.py b/restscrape/migrations/0001_initial.py
new file mode 100644
index 0000000..d8e2448
--- /dev/null
+++ b/restscrape/migrations/0001_initial.py
@@ -0,0 +1,22 @@
+# Generated by Django 2.2.1 on 2019-05-17 18:04
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = [
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Page',
+            fields=[
+                ('url', models.CharField(max_length=300, primary_key=True, serialize=False)),
+                ('acess_time', models.DateTimeField()),
+                ('page_content', models.FileField(upload_to='page_cache')),
+            ],
+        ),
+    ]
diff --git a/restscrape/migrations/0002_auto_20190517_1311.py b/restscrape/migrations/0002_auto_20190517_1311.py
new file mode 100644
index 0000000..91e7568
--- /dev/null
+++ b/restscrape/migrations/0002_auto_20190517_1311.py
@@ -0,0 +1,18 @@
+# Generated by Django 2.2.1 on 2019-05-17 18:11
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('restscrape', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.RenameField(
+            model_name='page',
+            old_name='acess_time',
+            new_name='access_time',
+        ),
+    ]
diff --git a/restscrape/models.py b/restscrape/models.py
index 71a8362..b48c8fa 100644
--- a/restscrape/models.py
+++ b/restscrape/models.py
@@ -1,3 +1,22 @@
 from django.db import models
+from django.core.files.base import ContentFile
+from urllib.parse import quote_plus
 
 # Create your models here.
+
+
+class Page(models.Model):
+    url = models.CharField(max_length=300, primary_key=True)
+    access_time = models.DateTimeField()
+    page_content = models.FileField(upload_to='page_cache')
+
+    @property
+    def filename(self):
+        return quote_plus(self.url)
+
+    def write(self, page_content):
+        file = ContentFile(page_content)
+        self.page_content.save(name=self.filename, content=file)
+
+    def read(self):
+        return self.page_content.read()
diff --git a/restscrape/scraping/__init__.py b/restscrape/scraping/__init__.py
index 246f219..40e332f 100644
--- a/restscrape/scraping/__init__.py
+++ b/restscrape/scraping/__init__.py
@@ -1,8 +1,8 @@
 import time
 
-from restscrape.scraping.proxy import create_proxy_iter
-from restscrape.scraping.scraper import scraper as scraper_class
-from restscrape.scraping.browser import browser as browser_class
+from restscrape.scraping.browser import Browser
+from restscrape.scraping.scraper import Scraper, proxy_scraper
+from restscrape.scraping.proxy import ProxyIter, create_proxy_iter
 
 US_PROXY_ITER = create_proxy_iter()
 
@@ -15,7 +15,7 @@ def scrape(
     max_tries=4,
     raw_tags=True
 ):
-    browser = browser_class(headless=False)
+    browser = Browser(headless=False)
     if proxy_iter is not None:
         for trial in range(max_tries):
             proxy_ip = next(proxy_iter)
@@ -42,7 +42,7 @@ def scrape(
     except Exception as e:
         print(e)
 
-    scraper = scraper_class(source)
+    scraper = Scraper(source)
 
     return scraper.label_convert(labels, raw_tags=raw_tags), browser
 
diff --git a/restscrape/scraping/browser.py b/restscrape/scraping/browser.py
index 958fd4e..adaac68 100644
--- a/restscrape/scraping/browser.py
+++ b/restscrape/scraping/browser.py
@@ -4,60 +4,60 @@
 import time
 
 import pyppeteer
 
+EVENT_LOOP = None
 
-def run(coroutine):
-    loop = asyncio.get_event_loop()
-    return loop.run_until_complete(coroutine)
+def run(coroutine):
+    global EVENT_LOOP
+    if EVENT_LOOP is None:
+        EVENT_LOOP = asyncio.get_event_loop()
+    return EVENT_LOOP.run_until_complete(coroutine)
 
 
-class browser:
-    '''wrapper around pyppeteer browser'''
+class Browser:
 
     def __init__(self, **launch_opts):
-        self.browser = None
+        self.connected = False
+        self.browser_instance: pyppeteer.browser.Browser = None
+        self.address = None
         self.launch_opts = launch_opts
-        self.page = None
 
-    def restart_browser(self, proxy=None, use_adblock=True,
-                        start_page='about:blank'):
-        if self.browser is not None:
-            self.close()
+    def connect(self, socket_address):
+        self.browser_instance = run(
+            pyppeteer.launcher.connect(browserWSEndpoint=socket_address))
+        self.address = socket_address
+        self.page: pyppeteer.page.Page = run(self.browser_instance.pages())[0]
+        # self.page: pyppeteer.page.Page = run(b.browser_instance.newPage())
+
+    def start_browser(self, proxy=None, use_adblock=True):
         opts = {}
         opts.update(self.launch_opts)
         if use_adblock:
-            if __name__ == "__main__":
-                ext = os.path.join(os.getcwd(), 'uBlock')
-            else:
-                ext = os.path.abspath(os.path.join(
-                    os.path.dirname(__file__), 'uBlock'))
-            # print(ext)
+            adblock_path = os.path.join(__file__, "..", "uBlock")
             opts.setdefault('args', []).extend(
-                ['--disable-extensions-except='+ext, '--load-extension='+ext])
+                [
+                    '--disable-extensions-except='+adblock_path,
+                    '--load-extension='+adblock_path
+                ]
+            )
         if proxy is not None:
             opts.setdefault('args', []).extend(['--proxy-server='+proxy])
-        opts.setdefault('args', []).append(start_page)
-        self.browser = run(pyppeteer.launch(**opts))
-        self.page = run(self.browser.pages())[0]
-        run(self.page.waitForNavigation())
+        opts.setdefault('args', []).append('about:blank')
+        self.browser_instance = run(pyppeteer.launch(**opts))
+        self.address = self.browser_instance.wsEndpoint
+        self.page: pyppeteer.page.Page = run(self.browser_instance.pages())[0]
+        # self.page: pyppeteer.page.Page = run(b.browser_instance.newPage())
+
+    def close(self):
+        run(self.browser_instance.close())
 
     def open(self, url, wait_for=0):
-        if self.page is None:
-            page = run(self.browser.pages())[0]
-        run(page.goto(url))
+        run(self.page.goto(url, waitUntil="domcontentloaded"))
        time.sleep(wait_for)
         return self.get_source()
 
-    def close(self):
-        run(self.browser.close())
-
     def get_source(self):
-        if self.page is None:
-            self.page = run(self.browser.pages())[0]
         return run(self.page.content())
 
 
 if __name__ == "__main__":
-    b = browser(headless=False)
-    b.restart_browser(start_page='https://www.google.com')
-    source = b.get_source()
-    b.close()
+    b = Browser(headless=False)
diff --git a/restscrape/scraping/proxy.py b/restscrape/scraping/proxy.py
index 822fcb4..6555422 100644
--- a/restscrape/scraping/proxy.py
+++ b/restscrape/scraping/proxy.py
@@ -5,7 +5,7 @@ from restscrape.scraping.scraper import proxy_scraper
 US_PROXY_URL = 'https://www.us-proxy.org/'
 
 
-class proxy_iter:
+class ProxyIter:
     '''Like itertools.cycle but uses a set underneath the hood and adds
     a method to remove an item from iteration (if proxy doesn't work
     etc)'''
@@ -37,6 +37,6 @@ def create_proxy_iter(url=US_PROXY_URL):
     '''Create a proxy_iter from proxy_webpage'''
     resp = requests.get(url)
     resp.raise_for_status()
-    return proxy_iter(
+    return ProxyIter(
         '{ip address}:{port}'.format(**row)
         for row in proxy_scraper(resp.text))
diff --git a/restscrape/scraping/scraper.py b/restscrape/scraping/scraper.py
index c3d8e29..dca1e68 100644
--- a/restscrape/scraping/scraper.py
+++ b/restscrape/scraping/scraper.py
@@ -1,7 +1,7 @@
 import lxml.etree
 
 
-class scraper:
+class Scraper:
 
     def __init__(self, page_source):
         if not isinstance(page_source, lxml.etree._Element):
@@ -34,7 +34,7 @@ class scraper:
 
 
 def proxy_scraper(page_source):
-    page = scraper(page_source)
+    page = Scraper(page_source)
     yield from page.extract_table(
         table="//table[@id='proxylisttable']",
         header_xpath="./thead/tr", rows_xpath="./tbody")
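
For reference, a minimal usage sketch (not part of the patch itself) of how the new
Page model and Browser wrapper might fit together. It assumes Django settings are
already configured and that pyppeteer can locate a Chromium build; the
scrape_and_cache helper below is hypothetical and only illustrates the intended flow.

    from django.utils import timezone

    from restscrape.models import Page
    from restscrape.scraping.browser import Browser


    def scrape_and_cache(url):
        # Hypothetical helper, not part of this patch.
        browser = Browser(headless=True)
        browser.start_browser(use_adblock=False)  # skip the uBlock extension here
        try:
            # open() navigates, optionally sleeps, and returns the page HTML as a string
            source = browser.open(url, wait_for=2)
        finally:
            browser.close()

        # Page.write() stores the HTML under MEDIA_ROOT/page_cache/<url-quoted name>
        # and persists the row, since FieldFile.save() saves the model by default.
        page = Page(url=url, access_time=timezone.now())
        page.write(source)
        return page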