Browse Source

Starting on celery tasks and response models

master
Raphael Roberts 7 years ago
parent
commit
88d36e5622
  1. 1
      requirements.txt
  2. 12
      restscrape/celery.py
  3. 55
      restscrape/models.py
  4. 2
      restscrape/utils.py

1
requirements.txt

@ -5,3 +5,4 @@ lxml
django
django-rest-framework
psycopg2
celery[redis]

12
restscrape/celery.py

@ -0,0 +1,12 @@
from celery import Celery
from restscrape.utils import get_tab
# Celery application for the restscrape tasks. The same local Redis URL
# (database 0) is configured as both the message broker and the result backend.
app = Celery("restscrape_funcs")
app.conf.update(
    broker_url="redis://localhost:6379/0",
    result_backend="redis://localhost:6379/0",
)
@app.task
def fetch_page(url, wait_for=0, proxy=None, use_adblock=True):
    """Celery task: open ``url`` in a tab obtained from ``get_tab``.

    Args:
        url: Address handed to ``tab.open``.
        wait_for: Forwarded to ``tab.open`` as its ``wait_for`` argument.
        proxy: Forwarded to ``get_tab``.
        use_adblock: Forwarded to ``get_tab``.

    Returns:
        Whatever ``tab.open`` returns for the given URL.
    """
    tab_context = get_tab(proxy=proxy, use_adblock=use_adblock)
    with tab_context as tab:
        # Return from inside the ``with`` so the context manager's exit
        # handling runs exactly as it would around a direct return.
        page = tab.open(url, wait_for=wait_for)
        return page

55
restscrape/models.py

@ -1,11 +1,14 @@
from urllib.parse import quote_plus
import datetime
import uuid
from celery.result import AsyncResult
from django.core.files.base import ContentFile
from django.db import models
import django.contrib.postgres.fields as extended_fields
import pytz
from restscrape.celery import app
from restscrape.scraping.browser import BrowserConnection
# Create your models here.
@ -60,7 +63,53 @@ class Browser(models.Model):
super().delete()
class ScrapeRequet(models.Model):
    """Model holding a single HStore field, ``xpath_labels``.

    NOTE(review): the class name looks like a misspelling of
    ``ScrapeRequest``, which is also declared in this file -- confirm whether
    this definition is still needed.
    """

    xpath_labels = extended_fields.HStoreField()
class PageRequestTask(models.Model):
    """Database handle for a Celery task, keyed by the task's id.

    The row itself stores only ``task_id``; the task's state and return value
    are looked up through a Celery ``AsyncResult`` bound to ``app``.
    """

    task_id = models.UUIDField(primary_key=True)

    # Per-instance cache for the lazily built AsyncResult. This is a plain
    # class attribute, not a model field.
    _result = None

    @property
    def async_result(self) -> AsyncResult:
        """Return the ``AsyncResult`` for ``task_id``, creating it on first use."""
        if self._result is None:
            # A UUIDField yields ``uuid.UUID`` objects once loaded from the
            # database, whereas Celery task ids are strings. Normalise to
            # ``str`` so the result backend can build its lookup key.
            self._result = AsyncResult(str(self.task_id), app=app)
        return self._result

    def is_ready(self) -> bool:
        """Return whether the underlying task has finished executing."""
        return self.async_result.ready()

    def pop_result(self):
        """Fetch the task's result, delete this row, and return the result.

        ``AsyncResult.get()`` is called first; if it raises, the row is left
        in place because ``delete()`` is never reached.
        """
        task_return = self.async_result.get()
        self.delete()
        return task_return
class ScrapingResponse(models.Model):
    """Base model for scraping responses, identified by a UUID primary key."""

    # Primary key: defaults to a freshly generated uuid4 and is not editable.
    id = models.UUIDField(
        primary_key=True,
        default=uuid.uuid4,
        editable=False,
    )
class PendingScrapingResponse(ScrapingResponse):
    """A scraping response that is still tied to a ``PageRequestTask``."""

    task = models.ForeignKey(PageRequestTask, on_delete=models.CASCADE)

    def mark_complete(self, xpath_labels):
        """Replace this pending response with a completed one of the same id.

        Args:
            xpath_labels: Value stored on the new
                ``CompletedScrapingResponse.xpath_labels`` field.

        Returns:
            The saved ``CompletedScrapingResponse``, which reuses ``self.id``.
        """
        completion_time = datetime.datetime.now(pytz.UTC)
        completed_response = CompletedScrapingResponse(
            xpath_labels=xpath_labels,
            completion_time=completion_time,
            id=self.id,
        )
        completed_response.save()
        # ScrapingResponse is a concrete model, so the pending and completed
        # subclasses share one parent row (multi-table inheritance). A plain
        # delete() would also remove that parent row, which the completed
        # response saved above still depends on. keep_parents=True removes
        # only the pending-specific row.
        self.delete(keep_parents=True)
        return completed_response
class CompletedScrapingResponse(ScrapingResponse):
    """A finished scraping response.

    Instances are created by ``PendingScrapingResponse.mark_complete``, which
    supplies both fields below.
    """

    # JSON payload passed in as ``xpath_labels`` when the response completes.
    xpath_labels = extended_fields.JSONField()

    # Timestamp recorded at completion; has no default, so callers must set it.
    completion_time = models.DateTimeField()
class ScrapeRequest(models.Model):
    """A stored scrape request: one bookkeeping timestamp plus its parameters."""

    # internal
    # ``submit_time`` used to be declared twice in this class body; the later
    # assignment silently replaced the earlier one, so a single declaration
    # is kept here.
    # NOTE(review): ``auto_now=True`` refreshes the value on every save(). If
    # this should record only the original submission, ``auto_now_add=True``
    # is probably what is wanted -- confirm before changing.
    submit_time = models.DateTimeField(auto_now=True, editable=False)

    # parameters
    blocking = models.BooleanField()
    url = models.URLField()
    use_adblock = models.BooleanField()
    wait_for = models.IntegerField()
    xpath_labels = extended_fields.HStoreField()

2
restscrape/utils.py

@ -1,5 +1,5 @@
from restscrape.models import Browser
from restscrape.scraping.browser import start_browser, BrowserConnection
from restscrape.scraping.browser import start_browser
def get_tab(proxy, use_adblock=True):

Loading…
Cancel
Save