
Pascal cased the classes, started on page model, and fixed Browser class
Raphael Roberts committed ab3ee3a901 into master 7 years ago
  1. requirements.txt (2 lines changed)
  2. restscrape/__init__.py (2 lines changed)
  3. restscrape/migrations/0001_initial.py (22 lines changed)
  4. restscrape/migrations/0002_auto_20190517_1311.py (18 lines changed)
  5. restscrape/models.py (19 lines changed)
  6. restscrape/scraping/__init__.py (10 lines changed)
  7. restscrape/scraping/browser.py (68 lines changed)
  8. restscrape/scraping/proxy.py (4 lines changed)
  9. restscrape/scraping/scraper.py (4 lines changed)

requirements.txt

@@ -1,3 +1,5 @@
+websockets==6.0
 pyppeteer
 requests
 lxml
+django

restscrape/__init__.py

@@ -0,0 +1,2 @@
+from restscrape import migrations
+from restscrape import scraping

restscrape/migrations/0001_initial.py

@@ -0,0 +1,22 @@
+# Generated by Django 2.2.1 on 2019-05-17 18:04
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = [
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Page',
+            fields=[
+                ('url', models.CharField(max_length=300, primary_key=True, serialize=False)),
+                ('acess_time', models.DateTimeField()),
+                ('page_content', models.FileField(upload_to='page_cache')),
+            ],
+        ),
+    ]

restscrape/migrations/0002_auto_20190517_1311.py

@@ -0,0 +1,18 @@
+# Generated by Django 2.2.1 on 2019-05-17 18:11
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('restscrape', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.RenameField(
+            model_name='page',
+            old_name='acess_time',
+            new_name='access_time',
+        ),
+    ]
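
The two migrations apply with the usual manage.py migrate flow. A minimal programmatic sketch, assuming the surrounding Django project's settings (not part of this commit) are importable via DJANGO_SETTINGS_MODULE:

import django
from django.core.management import call_command

# Assumes DJANGO_SETTINGS_MODULE points at the (hypothetical) project settings.
django.setup()

# Applies 0001_initial (creates the Page table with the misspelled 'acess_time'
# column) and 0002_auto_20190517_1311 (renames it to 'access_time').
call_command('migrate', 'restscrape')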

restscrape/models.py

@@ -1,3 +1,22 @@
 from django.db import models
+from django.core.files.base import ContentFile
+from urllib.parse import quote_plus
 
 # Create your models here.
+
+
+class Page(models.Model):
+    url = models.CharField(max_length=300, primary_key=True)
+    access_time = models.DateTimeField()
+    page_content = models.FileField(upload_to='page_cache')
+
+    @property
+    def filename(self):
+        return quote_plus(self.url)
+
+    def write(self, page_content):
+        file = ContentFile(page_content)
+        self.page_content.save(name=self.filename, content=file)
+
+    def read(self):
+        return self.page_content.read()
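
A minimal sketch of how the new Page model's write/read helpers could be exercised, assuming a configured database and writable MEDIA_ROOT; the URL and payload below are placeholders:

from django.utils import timezone
from restscrape.models import Page

# Cache a fetched page: write() wraps the bytes in a ContentFile and stores
# them under page_cache/<quote_plus(url)> via the FileField.
page = Page(url='https://example.com/', access_time=timezone.now())
page.write(b'<html><body>cached copy</body></html>')

# Read the cached markup back later.
cached = Page.objects.get(url='https://example.com/')
print(cached.read())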

restscrape/scraping/__init__.py

@@ -1,8 +1,8 @@
 import time
 
-from restscrape.scraping.proxy import create_proxy_iter
-from restscrape.scraping.scraper import scraper as scraper_class
-from restscrape.scraping.browser import browser as browser_class
+from restscrape.scraping.browser import Browser
+from restscrape.scraping.scraper import Scraper, proxy_scraper
+from restscrape.scraping.proxy import ProxyIter, create_proxy_iter
 
 US_PROXY_ITER = create_proxy_iter()

@@ -15,7 +15,7 @@ def scrape(
         max_tries=4,
         raw_tags=True
 ):
-    browser = browser_class(headless=False)
+    browser = Browser(headless=False)
     if proxy_iter is not None:
         for trial in range(max_tries):
             proxy_ip = next(proxy_iter)

@@ -42,7 +42,7 @@ def scrape(
             except Exception as e:
                 print(e)
 
-    scraper = scraper_class(source)
+    scraper = Scraper(source)
 
     return scraper.label_convert(labels, raw_tags=raw_tags), browser

restscrape/scraping/browser.py

@@ -4,60 +4,60 @@ import time
 
 import pyppeteer
 
+EVENT_LOOP = None
+
 
-def run(coroutine):
-    loop = asyncio.get_event_loop()
-    return loop.run_until_complete(coroutine)
+def run(coroutine):
+    global EVENT_LOOP
+    if EVENT_LOOP is None:
+        EVENT_LOOP = asyncio.get_event_loop()
+    return EVENT_LOOP.run_until_complete(coroutine)
 
 
-class browser:
-    '''wrapper around pyppeteer browser'''
+class Browser:
 
     def __init__(self, **launch_opts):
-        self.browser = None
-        self.connected = False
+        self.browser_instance: pyppeteer.browser.Browser = None
+        self.address = None
         self.launch_opts = launch_opts
-        self.page = None
 
-    def restart_browser(self, proxy=None, use_adblock=True,
-                        start_page='about:blank'):
-        if self.browser is not None:
-            self.close()
+    def connect(self, socket_address):
+        self.browser_instance = run(
+            pyppeteer.launcher.connect(browserWSEndpoint=socket_address))
+        self.address = socket_address
+        self.page: pyppeteer.page.Page = run(self.browser_instance.pages())[0]
+        # self.page: pyppeteer.page.Page = run(b.browser_instance.newPage())
+
+    def start_browser(self, proxy=None, use_adblock=True):
         opts = {}
         opts.update(self.launch_opts)
         if use_adblock:
-            if __name__ == "__main__":
-                ext = os.path.join(os.getcwd(), 'uBlock')
-            else:
-                ext = os.path.abspath(os.path.join(
-                    os.path.dirname(__file__), 'uBlock'))
-            # print(ext)
+            adblock_path = os.path.join(__file__, "..", "uBlock")
             opts.setdefault('args', []).extend(
-                ['--disable-extensions-except='+ext, '--load-extension='+ext])
+                [
+                    '--disable-extensions-except='+adblock_path,
+                    '--load-extension='+adblock_path
+                ]
+            )
         if proxy is not None:
             opts.setdefault('args', []).extend(['--proxy-server='+proxy])
-        opts.setdefault('args', []).append(start_page)
-        self.browser = run(pyppeteer.launch(**opts))
-        self.page = run(self.browser.pages())[0]
-        run(self.page.waitForNavigation())
+        opts.setdefault('args', []).append('about:blank')
+        self.browser_instance = run(pyppeteer.launch(**opts))
+        self.address = self.browser_instance.wsEndpoint
+        self.page: pyppeteer.page.Page = run(self.browser_instance.pages())[0]
+        # self.page: pyppeteer.page.Page = run(b.browser_instance.newPage())
+
+    def close(self):
+        run(self.browser_instance.close())
 
     def open(self, url, wait_for=0):
-        if self.page is None:
-            page = run(self.browser.pages())[0]
-        run(page.goto(url))
+        run(self.page.goto(url, waitUntil="domcontentloaded"))
         time.sleep(wait_for)
         return self.get_source()
 
-    def close(self):
-        run(self.browser.close())
-
     def get_source(self):
-        if self.page is None:
-            self.page = run(self.browser.pages())[0]
         return run(self.page.content())
 
 
 if __name__ == "__main__":
-    b = browser(headless=False)
-    b.restart_browser(start_page='https://www.google.com')
-    source = b.get_source()
-    b.close()
+    b = Browser(headless=False)
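
A rough usage sketch of the renamed Browser wrapper, assuming pyppeteer can launch a local Chromium; example.com stands in for a real target and the reconnect only works while the first browser is still running:

from restscrape.scraping.browser import Browser

# Launch Chromium, load one page, and grab its rendered HTML.
b = Browser(headless=True)
b.start_browser(use_adblock=False)   # skip the bundled uBlock extension here
source = b.open('https://example.com/', wait_for=1)
print(len(source))

# start_browser() records the DevTools websocket endpoint, so a second
# Browser can reattach to the same Chromium through connect().
other = Browser()
other.connect(b.address)
print(other.get_source()[:60])

b.close()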

restscrape/scraping/proxy.py

@@ -5,7 +5,7 @@ from restscrape.scraping.scraper import proxy_scraper
 US_PROXY_URL = 'https://www.us-proxy.org/'
 
 
-class proxy_iter:
+class ProxyIter:
     '''Like itertools.cycle but
     uses a set underneath the hood and adds a method to remove an item from
     iteration (if proxy doesn't work etc)'''

@@ -37,6 +37,6 @@ def create_proxy_iter(url=US_PROXY_URL):
     '''Create a proxy_iter from proxy_webpage'''
     resp = requests.get(url)
     resp.raise_for_status()
-    return proxy_iter(
+    return ProxyIter(
        '{ip address}:{port}'.format(**row)
        for row in proxy_scraper(resp.text))
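
A small sketch of the renamed ProxyIter; the addresses are placeholders, and the method for dropping a dead proxy (mentioned in the docstring) is not shown in this hunk, so it is omitted here:

from restscrape.scraping.proxy import ProxyIter, create_proxy_iter

# ProxyIter behaves like itertools.cycle over a set of "ip:port" strings.
proxies = ProxyIter(['203.0.113.10:8080', '203.0.113.11:3128'])
print(next(proxies))
print(next(proxies))
print(next(proxies))   # third call wraps around to one of the two entries

# create_proxy_iter() builds the same structure from the live us-proxy.org table:
# live_proxies = create_proxy_iter()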

restscrape/scraping/scraper.py

@@ -1,7 +1,7 @@
 import lxml.etree
 
 
-class scraper:
+class Scraper:
 
     def __init__(self, page_source):
         if not isinstance(page_source, lxml.etree._Element):

@@ -34,7 +34,7 @@ class scraper:
 
 
 def proxy_scraper(page_source):
-    page = scraper(page_source)
+    page = Scraper(page_source)
     yield from page.extract_table(
         table="//table[@id='proxylisttable']",
         header_xpath="./thead/tr", rows_xpath="./tbody")
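
And a sketch of driving proxy_scraper directly, assuming us-proxy.org is reachable; the row keys come from the table header, with at least 'ip address' and 'port' expected by create_proxy_iter:

import requests
from restscrape.scraping.scraper import proxy_scraper

resp = requests.get('https://www.us-proxy.org/')
resp.raise_for_status()

# Each yielded row is a mapping built from the proxylisttable header and body.
for row in proxy_scraper(resp.text):
    print(row['ip address'], row['port'])
    break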