Browse Source

Starting on celery tasks and response models

master
Raphael Roberts 7 years ago
parent
commit
88d36e5622
  1. 1
      requirements.txt
  2. 12
      restscrape/celery.py
  3. 55
      restscrape/models.py
  4. 2
      restscrape/utils.py

1
requirements.txt

@@ -5,3 +5,4 @@ lxml
django django
django-rest-framework django-rest-framework
psycopg2 psycopg2
celery[redis]

12
restscrape/celery.py

@@ -0,0 +1,12 @@
from celery import Celery
from restscrape.utils import get_tab
# Celery application shared by the restscrape task workers.
app = Celery("restscrape_funcs")
# Both the message broker and the result backend live in the local Redis
# instance, database 0.  NOTE(review): hard-coded endpoint — presumably fine
# for development; confirm it should not come from settings/env for deploys.
app.conf.broker_url = "redis://localhost:6379/0"
app.conf.result_backend = "redis://localhost:6379/0"
@app.task
def fetch_page(url, wait_for=0, proxy=None, use_adblock=True):
    """Open *url* in a fresh browser tab and return the loaded page.

    Runs as a Celery task.  The tab comes from ``get_tab`` and is released
    automatically when the ``with`` block exits, even if the load raises.

    :param url: address to open.
    :param wait_for: passed through to ``tab.open`` (defaults to 0).
    :param proxy: optional proxy handed to ``get_tab``.
    :param use_adblock: whether the tab is created with ad blocking on.
    :return: whatever ``tab.open`` returns for the fetched page.
    """
    with get_tab(use_adblock=use_adblock, proxy=proxy) as browser_tab:
        return browser_tab.open(url, wait_for=wait_for)

55
restscrape/models.py

@@ -1,11 +1,14 @@
from urllib.parse import quote_plus from urllib.parse import quote_plus
import datetime import datetime
import uuid
from celery.result import AsyncResult
from django.core.files.base import ContentFile from django.core.files.base import ContentFile
from django.db import models from django.db import models
import django.contrib.postgres.fields as extended_fields import django.contrib.postgres.fields as extended_fields
import pytz import pytz
from restscrape.celery import app
from restscrape.scraping.browser import BrowserConnection from restscrape.scraping.browser import BrowserConnection
# Create your models here. # Create your models here.
@@ -60,7 +63,53 @@ class Browser(models.Model):
super().delete() super().delete()
class ScrapeRequet(models.Model):
xpath_labels = extended_fields.HStoreField()
class PageRequestTask(models.Model):
    """Handle to an in-flight Celery task, keyed by its task id."""

    # The Celery task id, stored natively as a UUID.
    task_id = models.UUIDField(primary_key=True)

    # Per-instance cache for the AsyncResult handle; the class-level None
    # acts as the "not built yet" sentinel.
    _result = None

    @property
    def async_result(self) -> AsyncResult:
        """Lazily build and cache the Celery ``AsyncResult`` for this task."""
        if self._result is None:
            # Celery identifies tasks by *string* ids; passing the raw
            # uuid.UUID object would not reliably match the key the worker
            # stored the result under, so convert explicitly.
            self._result = AsyncResult(str(self.task_id), app=app)
        return self._result

    def is_ready(self):
        """Return True once the underlying task has finished."""
        return self.async_result.ready()

    def pop_result(self):
        """Wait for the task result, delete this row, and return the result."""
        res = self.async_result.get()
        self.delete()
        return res
class ScrapingResponse(models.Model):
    """Concrete base model for scrape responses.

    The Pending/Completed subclasses below share this UUID primary key
    via Django multi-table inheritance.
    """

    # Assigned at creation; never editable through forms/admin.
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
class PendingScrapingResponse(ScrapingResponse):
    """A response whose scrape task is still running."""

    # The Celery task that will produce the page content; deleting the
    # task row also removes this pending response.
    task = models.ForeignKey(PageRequestTask, on_delete=models.CASCADE)

    def mark_complete(self, xpath_labels):
        """Promote this pending response to a CompletedScrapingResponse.

        Saves a completed record under the same response id, stamped with
        the current UTC time, removes the pending record, and returns the
        completed one.
        """
        completion_time = datetime.datetime.now(pytz.UTC)
        completed_response = CompletedScrapingResponse(
            xpath_labels=xpath_labels, completion_time=completion_time, id=self.id
        )
        completed_response.save()
        # keep_parents=True: with multi-table inheritance a plain delete()
        # would also remove the shared ScrapingResponse parent row that the
        # just-saved completed record points at, cascading it away.
        self.delete(keep_parents=True)
        return completed_response
class CompletedScrapingResponse(ScrapingResponse):
    """A finished scrape: extracted data plus the time it finished."""

    # Mapping of xpath label -> extracted value(s).  NOTE(review): the
    # request model stores labels in an HStoreField; presumably JSON is
    # used here to allow non-string result values — confirm.
    xpath_labels = extended_fields.JSONField()
    # UTC timestamp; set by PendingScrapingResponse.mark_complete().
    completion_time = models.DateTimeField()
class ScrapeRequest(models.Model):
    """Parameters of a client scrape request."""

    # internal
    # Stamped automatically on every save; hidden from forms/admin.
    submit_time = models.DateTimeField(auto_now=True, editable=False)

    # parameters
    # Whether the caller waits for the scrape to finish — TODO confirm.
    blocking = models.BooleanField()
    # Page to scrape.
    url = models.URLField()
    # Enable ad blocking in the browser tab.
    use_adblock = models.BooleanField()
    # Presumably forwarded to fetch_page's wait_for argument — verify.
    wait_for = models.IntegerField()
    # Mapping of label -> xpath expression to extract from the page.
    xpath_labels = extended_fields.HStoreField()

2
restscrape/utils.py

@@ -1,5 +1,5 @@
from restscrape.models import Browser from restscrape.models import Browser
from restscrape.scraping.browser import start_browser, BrowserConnection
from restscrape.scraping.browser import start_browser
def get_tab(proxy, use_adblock=True): def get_tab(proxy, use_adblock=True):

Loading…
Cancel
Save