|
|
@ -1,11 +1,14 @@ |
|
|
from urllib.parse import quote_plus |
|
|
from urllib.parse import quote_plus |
|
|
import datetime |
|
|
import datetime |
|
|
|
|
|
import uuid |
|
|
|
|
|
|
|
|
|
|
|
from celery.result import AsyncResult |
|
|
from django.core.files.base import ContentFile |
|
|
from django.core.files.base import ContentFile |
|
|
from django.db import models |
|
|
from django.db import models |
|
|
import django.contrib.postgres.fields as extended_fields |
|
|
import django.contrib.postgres.fields as extended_fields |
|
|
import pytz |
|
|
import pytz |
|
|
|
|
|
|
|
|
|
|
|
from restscrape.celery import app |
|
|
from restscrape.scraping.browser import BrowserConnection |
|
|
from restscrape.scraping.browser import BrowserConnection |
|
|
|
|
|
|
|
|
# Create your models here. |
|
|
# Create your models here. |
|
|
@ -60,7 +63,53 @@ class Browser(models.Model): |
|
|
super().delete() |
|
|
super().delete() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ScrapeRequet(models.Model): |
|
|
|
|
|
xpath_labels = extended_fields.HStoreField() |
|
|
|
|
|
|
|
|
class PageRequestTask(models.Model):
    """Database record that tracks one Celery task by its id.

    The row stores only the task id; the task's state and result are read
    through a lazily created ``AsyncResult`` bound to the project's Celery
    ``app``.
    """

    task_id = models.UUIDField(primary_key=True)

    # Cache for the lazily built AsyncResult. This is a plain class attribute
    # (not a model field); the property below shadows it per instance.
    _result = None

    @property
    def async_result(self) -> AsyncResult:
        """Return the ``AsyncResult`` for this task, creating it on first use."""
        if self._result is None:
            # UUIDField yields a uuid.UUID object, while Celery task ids are
            # strings; convert explicitly so result-backend lookups use the
            # same key the task was stored under.
            self._result = AsyncResult(str(self.task_id), app=app)
        return self._result

    def is_ready(self) -> bool:
        """Return whatever ``AsyncResult.ready()`` reports for this task."""
        return self.async_result.ready()

    def pop_result(self):
        """Fetch the task result, delete this row, and return the result.

        ``AsyncResult.get()`` is called first, so an exception raised there
        leaves this row in place.
        """
        res = self.async_result.get()
        self.delete()
        return res
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ScrapingResponse(models.Model):
    """Base model for scraping responses, identified by a UUID primary key."""

    # A fresh uuid4 is generated for each new row; the field is not editable.
    id = models.UUIDField(
        primary_key=True,
        default=uuid.uuid4,
        editable=False,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PendingScrapingResponse(ScrapingResponse):
    """A scraping response whose page-request task has not been completed yet."""

    # Deleting the task row also deletes the pending response that points at it.
    task = models.ForeignKey(PageRequestTask, on_delete=models.CASCADE)

    def mark_complete(self, xpath_labels):
        """Replace this pending response with a completed one and return it.

        A ``CompletedScrapingResponse`` is saved under the same ``id`` with
        the given ``xpath_labels`` and the current UTC time as its
        ``completion_time``; this pending row is then removed.

        Args:
            xpath_labels: value stored in the completed response's
                ``xpath_labels`` field.

        Returns:
            The newly saved ``CompletedScrapingResponse``.
        """
        completion_time = datetime.datetime.now(pytz.UTC)
        completed_response = CompletedScrapingResponse(
            xpath_labels=xpath_labels,
            completion_time=completion_time,
            id=self.id,
        )
        completed_response.save()
        # Both subclasses share the same parent ScrapingResponse row (same id).
        # A plain delete() would also remove that parent row and cascade to the
        # completed response saved just above, so keep the parent data.
        self.delete(keep_parents=True)
        return completed_response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class CompletedScrapingResponse(ScrapingResponse):
    """A scraping response that has finished, together with its stored labels."""

    # Value supplied at completion time, stored as JSON.
    xpath_labels = extended_fields.JSONField()

    # Timestamp recorded when the response was marked complete.
    completion_time = models.DateTimeField()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ScrapeRequest(models.Model): |
|
|
|
|
|
# internal |
|
|
|
|
|
submit_time = models.DateTimeField(auto_now=True, editable=False) |
|
|
|
|
|
# parameters |
|
|
blocking = models.BooleanField() |
|
|
blocking = models.BooleanField() |
|
|
submit_time = models.DateTimeField(auto_now=True) |
|
|
|
|
|
|
|
|
url = models.URLField() |
|
|
|
|
|
use_adblock = models.BooleanField() |
|
|
|
|
|
wait_for = models.IntegerField() |
|
|
|
|
|
xpath_labels = extended_fields.HStoreField() |