@@ -2,16 +2,23 @@ from urllib.parse import quote_plus
import datetime
import uuid


from celery.result import AsyncResult
from django.conf import settings
from django.core.files.base import ContentFile
from django.db import models
import django.contrib.postgres.fields as extended_fields
import pytz


from restscrape.celery import app
from restscrape.scraping.browser import BrowserConnection, start_browser

# a cached Page older than this is considered stale and will be re-fetched;
# deployments may override the one-day default via settings.OLDEST_PAGE
OLDEST_PAGE = getattr(settings, "OLDEST_PAGE", datetime.timedelta(days=1))


# Create your models here.
class PageTooOldError(Exception): |
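    """Raised when a cached Page is older than OLDEST_PAGE and must be re-fetched."""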
    pass


class Page(models.Model):
@@ -63,6 +70,12 @@ class Browser(models.Model):
        super().delete()


@app.task
def fetch_page(url, wait_for=0, proxy=None, use_adblock=True): |
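    """Open ``url`` in a managed browser tab and return the page source.

    Runs inline when called directly (blocking requests) or asynchronously
    via ``.delay()`` (non-blocking requests).
    """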
    with get_tab(proxy=proxy, use_adblock=use_adblock) as tab:
        return tab.open(url, wait_for=wait_for)


def get_tab(proxy, use_adblock=True): |
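    """Context manager yielding a browser tab configured with the given
    proxy and adblock settings."""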
    try:

@@ -125,6 +138,7 @@ class ScrapeRequest(models.Model):
    def submit(self):
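        """Save this request and return its PendingScrapingResponse.

        A minimal usage sketch (the field values here are illustrative, not
        taken from this codebase):

            request = ScrapeRequest(url="https://example.com", blocking=True)
            pending = request.submit()
        """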
        self.save()
        pending = PendingScrapingResponse(request=self)
        pending.get_page()
        pending.save()
        return pending

@@ -139,6 +153,40 @@ class PendingScrapingResponse(ScrapingResponse):
    )
    request = models.ForeignKey(ScrapeRequest, on_delete=models.CASCADE)

    def get_page(self):
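        """Return a fresh-enough Page for this request, or None.

        A cached Page newer than OLDEST_PAGE is returned directly. Otherwise
        a blocking request fetches the page inline, while a non-blocking
        request enqueues fetch_page and returns None.
        """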
        page = None
        try:
            # first, try to retrieve the page from the cache
            page = Page.objects.get(url=self.request.url)
            if page.access_time < datetime.datetime.now(pytz.UTC) - OLDEST_PAGE:
                raise PageTooOldError
            return page

        except (Page.DoesNotExist, PageTooOldError):
            if self.request.blocking:
                # blocking request: fetch the page inline and return it directly
                page_source = fetch_page(
                    url=self.request.url,
                    wait_for=self.request.wait_for,
                    use_adblock=self.request.use_adblock,
                    proxy=self.request.proxy,
                )
                if page is None:
                    page = Page(url=self.request.url)
                page.write(page_source)
                page.save()
                return page
            else:
                # otherwise enqueue a task and record it on the pending response
                task: AsyncResult = fetch_page.delay(
                    url=self.request.url,
                    wait_for=self.request.wait_for,
                    use_adblock=self.request.use_adblock,
                    proxy=self.request.proxy,
                )
                # the related row must exist before it can be assigned to the
                # ForeignKey, so create() rather than instantiate unsaved
                self.task = PageRequestTask.objects.create(task_id=task.id)
                return None

    def mark_complete(self, xpath_labels):
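        """Promote this pending response to a CompletedScrapingResponse."""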
        completion_time = datetime.datetime.now(pytz.UTC)
        completed_response = CompletedScrapingResponse(