Browse Source

Fleshing out some of the functions to get the page, both blocking and async

master
Raphael Roberts 7 years ago
parent
commit
b8d5cb5546
  1. 52
      restscrape/models.py

52
restscrape/models.py

@ -2,16 +2,23 @@ from urllib.parse import quote_plus
import datetime import datetime
import uuid import uuid
from celery.result import AsyncResult from celery.result import AsyncResult
from django.conf import settings
from django.core.files.base import ContentFile from django.core.files.base import ContentFile
from django.db import models from django.db import models
import django.contrib.postgres.fields as extended_fields import django.contrib.postgres.fields as extended_fields
import pytz import pytz
from restscrape.celery import app from restscrape.celery import app
from restscrape.scraping.browser import BrowserConnection
from restscrape.scraping.browser import BrowserConnection, start_browser
# Maximum age a cached Page may reach before get_page() refetches it.
# Overridable via the OLDEST_PAGE Django setting; defaults to one day.
OLDEST_PAGE = getattr(settings, "OLDEST_PAGE", datetime.timedelta(days=1))
# Create your models here.
class PageTooOldError(Exception):
    """Raised when a cached Page is older than the OLDEST_PAGE cutoff."""
class Page(models.Model): class Page(models.Model):
@ -63,6 +70,12 @@ class Browser(models.Model):
super().delete() super().delete()
@app.task
def fetch_page(url, wait_for=0, proxy=None, use_adblock=True):
    """Open *url* in a browser tab and return the resulting page source.

    Runs as a Celery task; calling it directly executes synchronously.
    The tab is obtained from get_tab() and released by its context
    manager once the page has been opened.
    """
    tab_context = get_tab(proxy=proxy, use_adblock=use_adblock)
    with tab_context as tab:
        page_source = tab.open(url, wait_for=wait_for)
    return page_source
def get_tab(proxy, use_adblock=True): def get_tab(proxy, use_adblock=True):
try: try:
@ -125,6 +138,7 @@ class ScrapeRequest(models.Model):
def submit(self):
    """Persist this request and create its pending response.

    Saves the request, builds a PendingScrapingResponse for it, starts
    page retrieval via get_page(), persists the pending response, and
    returns it to the caller.
    """
    self.save()
    pending_response = PendingScrapingResponse(request=self)
    pending_response.get_page()
    pending_response.save()
    return pending_response
@ -139,6 +153,40 @@ class PendingScrapingResponse(ScrapingResponse):
) )
request = models.ForeignKey(ScrapeRequest, on_delete=models.CASCADE) request = models.ForeignKey(ScrapeRequest, on_delete=models.CASCADE)
def get_page(self):
    """Return a Page for this response's request, fetching it if needed.

    First consults the Page cache: a hit younger than OLDEST_PAGE is
    returned as-is.  On a cache miss or a stale hit, the page is either
    fetched synchronously (when the request is blocking) or scheduled as
    a Celery task, in which case the task handle is recorded on
    ``self.task`` and None is returned.

    Returns:
        Page when the page is available immediately, otherwise None
        (an asynchronous fetch has been queued).
    """
    page = None
    try:
        # Try the cache first.  Fix: ``Page.objects`` is a Django
        # Manager attribute, not a callable -- the original
        # ``Page.objects().get(...)`` raised TypeError on every call.
        page = Page.objects.get(url=self.request.url)
        # NOTE(review): attribute is spelled ``acess_time`` here --
        # confirm the Page model declares the field with this spelling.
        if page.acess_time < datetime.datetime.now(pytz.UTC) - OLDEST_PAGE:
            raise PageTooOldError
        return page
    except (Page.DoesNotExist, PageTooOldError):
        if self.request.blocking:
            # Blocking caller: invoke the Celery task directly, which
            # runs it synchronously in this process.
            page_source = fetch_page(
                url=self.request.url,
                wait_for=self.request.wait_for,
                use_adblock=self.request.use_adblock,
                proxy=self.request.proxy,
            )
            # Reuse the stale cached row when we have one; otherwise
            # create a fresh Page for this URL.
            if page is None:
                page = Page(url=self.request.url)
            page.write(page_source)
            page.save()
            return page
        # Non-blocking caller: queue the fetch and keep the task handle.
        task: AsyncResult = fetch_page.delay(
            url=self.request.url,
            wait_for=self.request.wait_for,
            use_adblock=self.request.use_adblock,
            proxy=self.request.proxy,
        )
        # NOTE(review): this PageRequestTask is never save()d --
        # assigning an unsaved instance to a ForeignKey fails when the
        # owner is saved; confirm the intended lifecycle.
        self.task = PageRequestTask(task_id=task.id)
        return None
def mark_complete(self, xpath_labels): def mark_complete(self, xpath_labels):
completion_time = datetime.datetime.now(pytz.UTC) completion_time = datetime.datetime.now(pytz.UTC)
completed_response = CompletedScrapingResponse( completed_response = CompletedScrapingResponse(

Loading…
Cancel
Save