diff --git a/restscrape/models.py b/restscrape/models.py
index 9f3412d..cd51aed 100644
--- a/restscrape/models.py
+++ b/restscrape/models.py
@@ -2,16 +2,23 @@
 from urllib.parse import quote_plus
 import datetime
 import uuid
+
 from celery.result import AsyncResult
+from django.conf import settings
 from django.core.files.base import ContentFile
 from django.db import models
 import django.contrib.postgres.fields as extended_fields
 import pytz
+
 from restscrape.celery import app
-from restscrape.scraping.browser import BrowserConnection
+from restscrape.scraping.browser import BrowserConnection, start_browser
+
+OLDEST_PAGE = getattr(settings, "OLDEST_PAGE", datetime.timedelta(days=1))
+
 
-# Create your models here.
+class PageTooOldError(Exception):
+    pass
 
 
 class Page(models.Model):
@@ -63,6 +70,12 @@ class Browser(models.Model):
         super().delete()
 
 
+@app.task
+def fetch_page(url, wait_for=0, proxy=None, use_adblock=True):
+    with get_tab(proxy=proxy, use_adblock=use_adblock) as tab:
+        return tab.open(url, wait_for=wait_for)
+
+
 def get_tab(proxy, use_adblock=True):
     try:
@@ -125,6 +138,7 @@ class ScrapeRequest(models.Model):
     def submit(self):
         self.save()
         pending = PendingScrapingResponse(request=self)
+        pending.get_page()
         pending.save()
         return pending
 
@@ -139,6 +153,40 @@ class PendingScrapingResponse(ScrapingResponse):
     )
     request = models.ForeignKey(ScrapeRequest, on_delete=models.CASCADE)
 
+    def get_page(self):
+        page = None
+        try:
+            # First, try to serve the page from the cache.
+            page = Page.objects.get(url=self.request.url)
+            if page.access_time < datetime.datetime.now(pytz.UTC) - OLDEST_PAGE:
+                raise PageTooOldError
+            return page
+
+        except (Page.DoesNotExist, PageTooOldError):
+            if self.request.blocking:
+                # Blocking request: run fetch_page() synchronously and return the page.
+                page_source = fetch_page(
+                    url=self.request.url,
+                    wait_for=self.request.wait_for,
+                    use_adblock=self.request.use_adblock,
+                    proxy=self.request.proxy,
+                )
+                if page is None:
+                    page = Page(url=self.request.url)
+                page.write(page_source)
+                page.save()
+                return page
+            else:
+                # Otherwise enqueue a Celery task and record its id on this pending response.
+                task: AsyncResult = fetch_page.delay(
+                    url=self.request.url,
+                    wait_for=self.request.wait_for,
+                    use_adblock=self.request.use_adblock,
+                    proxy=self.request.proxy,
+                )
+                self.task = PageRequestTask.objects.create(task_id=task.id)
+                return None
+
     def mark_complete(self, xpath_labels):
         completion_time = datetime.datetime.now(pytz.UTC)
         completed_response = CompletedScrapingResponse(
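
For reviewers, a minimal sketch of how the new caching flow is exercised. The field names (`url`, `blocking`, `wait_for`, `use_adblock`, `proxy`) and the `OLDEST_PAGE` staleness window (default one day) come from the diff; the exact `ScrapeRequest` constructor kwargs and the example URL are assumptions.

```python
# Usage sketch only; not part of the diff. Assumes ScrapeRequest exposes the
# fields referenced in get_page() as model fields / constructor kwargs, and
# that a Celery worker is running for the non-blocking path.
from restscrape.models import ScrapeRequest

# Blocking: submit() runs fetch_page() in-process, so the returned
# PendingScrapingResponse is backed by a fresh (or recently cached) Page.
pending = ScrapeRequest(url="https://example.com", blocking=True).submit()

# Non-blocking: submit() enqueues fetch_page.delay() and records the Celery
# task id in a PageRequestTask for later polling.
pending = ScrapeRequest(url="https://example.com", blocking=False).submit()
```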