
Fleshing out the page-fetching functions, with both blocking and async paths

master
Raphael Roberts, 7 years ago
parent commit b8d5cb5546
1 changed file: restscrape/models.py (52 changed lines)
@@ -2,16 +2,23 @@ from urllib.parse import quote_plus
 import datetime
 import uuid
+from celery.result import AsyncResult
 from django.conf import settings
 from django.core.files.base import ContentFile
 from django.db import models
 import django.contrib.postgres.fields as extended_fields
+import pytz
+from restscrape.celery import app
-from restscrape.scraping.browser import BrowserConnection
+from restscrape.scraping.browser import BrowserConnection, start_browser
+OLDEST_PAGE = getattr(settings, "OLDEST_PAGE", datetime.timedelta(days=1))
 # Create your models here.
+class PageTooOldError(Exception):
+    pass
 class Page(models.Model):
@@ -63,6 +70,12 @@ class Browser(models.Model):
         super().delete()
+@app.task
+def fetch_page(url, wait_for=0, proxy=None, use_adblock=True):
+    with get_tab(proxy=proxy, use_adblock=use_adblock) as tab:
+        return tab.open(url, wait_for=wait_for)
 def get_tab(proxy, use_adblock=True):
     try:
@@ -125,6 +138,7 @@ class ScrapeRequest(models.Model):
     def submit(self):
         self.save()
         pending = PendingScrapingResponse(request=self)
+        pending.get_page()
         pending.save()
         return pending
@@ -139,6 +153,40 @@ class PendingScrapingResponse(ScrapingResponse):
     )
     request = models.ForeignKey(ScrapeRequest, on_delete=models.CASCADE)
+    def get_page(self):
+        page = None
+        try:
+            # first, try to retrieve the page from the cache
+            page = Page.objects.get(url=self.request.url)
+            if page.access_time < datetime.datetime.now(pytz.UTC) - OLDEST_PAGE:
+                raise PageTooOldError
+            return page
+        except (Page.DoesNotExist, PageTooOldError):
+            if self.request.blocking:
+                # if we can just send the page, then we'll do that
+                page_source = fetch_page(
+                    url=self.request.url,
+                    wait_for=self.request.wait_for,
+                    use_adblock=self.request.use_adblock,
+                    proxy=self.request.proxy,
+                )
+                if page is None:
+                    page = Page(url=self.request.url)
+                page.write(page_source)
+                page.save()
+                return page
+            else:
+                # otherwise queue a task and store it on the pending response
+                task: AsyncResult = fetch_page.delay(
+                    url=self.request.url,
+                    wait_for=self.request.wait_for,
+                    use_adblock=self.request.use_adblock,
+                    proxy=self.request.proxy,
+                )
+                self.task = PageRequestTask(task_id=task.id)
+                return None
     def mark_complete(self, xpath_labels):
         completion_time = datetime.datetime.now(pytz.UTC)
         completed_response = CompletedScrapingResponse(
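
For orientation, here is a minimal sketch (not part of this commit) of how the two paths added above might be driven from calling code. The ScrapeRequest field names (url, blocking) and PageRequestTask.task_id are assumptions inferred from the attribute accesses in get_page(); treat them as hypothetical.

# Hypothetical usage sketch, not part of this commit. Field names are
# assumed from the attribute accesses in get_page() above.
from restscrape.models import ScrapeRequest, fetch_page

# Blocking path: calling the task function directly runs it in-process,
# so get_page() saves a Page before submit() returns.
pending = ScrapeRequest(url="https://example.com", blocking=True).submit()
page = pending.get_page()  # now a cache hit, returns the stored Page

# Async path: fetch_page.delay() hands the work to a Celery worker and
# get_page() returns None; the caller polls the stored task id instead.
pending = ScrapeRequest(url="https://example.com", blocking=False).submit()
if pending.task is not None:
    result = fetch_page.AsyncResult(pending.task.task_id)
    if result.ready():
        page_source = result.get()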
