From 88d36e562283e5cc07582c00846645f6c20ddfdb Mon Sep 17 00:00:00 2001
From: Raphael Roberts
Date: Fri, 31 May 2019 22:44:08 -0500
Subject: [PATCH] Starting on celery tasks and response models

---
 requirements.txt     |  1 +
 restscrape/celery.py | 12 ++++++++++
 restscrape/models.py | 55 +++++++++++++++++++++++++++++++++++++++++---
 restscrape/utils.py  |  2 +-
 4 files changed, 66 insertions(+), 4 deletions(-)
 create mode 100644 restscrape/celery.py

diff --git a/requirements.txt b/requirements.txt
index 52a5dcd..851e0f2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ lxml
 django
 django-rest-framework
 psycopg2
+celery[redis]
diff --git a/restscrape/celery.py b/restscrape/celery.py
new file mode 100644
index 0000000..2410e9b
--- /dev/null
+++ b/restscrape/celery.py
@@ -0,0 +1,12 @@
+from celery import Celery
+from restscrape.utils import get_tab
+
+app = Celery("restscrape_funcs")
+app.conf.broker_url = "redis://localhost:6379/0"
+app.conf.result_backend = "redis://localhost:6379/0"
+
+
+@app.task
+def fetch_page(url, wait_for=0, proxy=None, use_adblock=True):
+    with get_tab(proxy=proxy, use_adblock=use_adblock) as tab:
+        return tab.open(url, wait_for=wait_for)
diff --git a/restscrape/models.py b/restscrape/models.py
index 64b866d..c9ca805 100644
--- a/restscrape/models.py
+++ b/restscrape/models.py
@@ -1,11 +1,14 @@
 from urllib.parse import quote_plus
 import datetime
+import uuid
 
+from celery.result import AsyncResult
 from django.core.files.base import ContentFile
 from django.db import models
 import django.contrib.postgres.fields as extended_fields
 import pytz
 
+from restscrape.celery import app
 from restscrape.scraping.browser import BrowserConnection
 
 # Create your models here.
@@ -60,7 +63,53 @@ class Browser(models.Model):
         super().delete()
 
 
-class ScrapeRequet(models.Model):
-    xpath_labels = extended_fields.HStoreField()
+class PageRequestTask(models.Model):
+    task_id = models.UUIDField(primary_key=True)
+    _result = None
+
+    @property
+    def async_result(self) -> AsyncResult:
+        if self._result is None:
+            self._result = AsyncResult(self.task_id, app=app)
+        return self._result
+
+    def is_ready(self):
+        return self.async_result.ready()
+
+    def pop_result(self):
+        res = self.async_result.get()
+        self.delete()
+        return res
+
+
+class ScrapingResponse(models.Model):
+    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
+
+
+class PendingScrapingResponse(ScrapingResponse):
+    task = models.ForeignKey(PageRequestTask, on_delete=models.CASCADE)
+
+    def mark_complete(self, xpath_labels):
+        completion_time = datetime.datetime.now(pytz.UTC)
+        completed_response = CompletedScrapingResponse(
+            xpath_labels=xpath_labels, completion_time=completion_time, id=self.id
+        )
+        completed_response.save()
+        self.delete()
+        return completed_response
+
+
+class CompletedScrapingResponse(ScrapingResponse):
+    xpath_labels = extended_fields.JSONField()
+    completion_time = models.DateTimeField()
+
+
+class ScrapeRequest(models.Model):
+    # internal
+    submit_time = models.DateTimeField(auto_now=True, editable=False)
+    # parameters
     blocking = models.BooleanField()
-    submit_time = models.DateTimeField(auto_now=True)
+    url = models.URLField()
+    use_adblock = models.BooleanField()
+    wait_for = models.IntegerField()
+    xpath_labels = extended_fields.HStoreField()
diff --git a/restscrape/utils.py b/restscrape/utils.py
index 4e4ad3e..17522bb 100644
--- a/restscrape/utils.py
+++ b/restscrape/utils.py
@@ -1,5 +1,5 @@
 from restscrape.models import Browser
-from restscrape.scraping.browser import start_browser, BrowserConnection
+from restscrape.scraping.browser import start_browser
 
 
 def get_tab(proxy, use_adblock=True):
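
For context, a minimal sketch of how the pieces added above might be wired together from a caller such as a view. fetch_page, PageRequestTask, and PendingScrapingResponse come from this patch; the submit_scrape helper itself is hypothetical and not part of the commit:

    # Hypothetical helper, not part of the patch: queue the Celery fetch,
    # record its task id, and return a pending response that can later be
    # promoted with mark_complete() once the worker finishes.
    from restscrape.celery import fetch_page
    from restscrape.models import PageRequestTask, PendingScrapingResponse

    def submit_scrape(url, wait_for=0, use_adblock=True):
        async_result = fetch_page.delay(url, wait_for=wait_for, use_adblock=use_adblock)
        task = PageRequestTask.objects.create(task_id=async_result.id)
        return PendingScrapingResponse.objects.create(task=task)

Whether the caller then blocks on PageRequestTask.pop_result() or polls is_ready() would presumably be driven by the blocking flag on ScrapeRequest.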