9 changed files with 107 additions and 44 deletions
-
4requirements.txt
-
2restscrape/__init__.py
-
22restscrape/migrations/0001_initial.py
-
18restscrape/migrations/0002_auto_20190517_1311.py
-
19restscrape/models.py
-
10restscrape/scraping/__init__.py
-
68restscrape/scraping/browser.py
-
4restscrape/scraping/proxy.py
-
4restscrape/scraping/scraper.py
@ -1,3 +1,5 @@ |
|||
websockets==6.0 |
|||
pyppeteer |
|||
requests |
|||
lxml |
|||
lxml |
|||
django |
|||
@ -0,0 +1,2 @@ |
|||
from restscrape import migrations |
|||
from restscrape import scraping |
|||
@ -0,0 +1,22 @@ |
|||
# Generated by Django 2.2.1 on 2019-05-17 18:04 |
|||
|
|||
from django.db import migrations, models |
|||
|
|||
|
|||
class Migration(migrations.Migration):
    """Initial schema migration: creates the ``Page`` model."""

    # Marked as the app's initial migration; it has no dependencies.
    initial = True
    dependencies = []

    operations = [
        migrations.CreateModel(
            name="Page",
            fields=[
                (
                    "url",
                    models.CharField(
                        max_length=300,
                        primary_key=True,
                        serialize=False,
                    ),
                ),
                # The misspelled name is what this migration created; do not
                # edit it here. Migration 0002_auto_20190517_1311 renames it
                # to 'access_time'.
                ("acess_time", models.DateTimeField()),
                ("page_content", models.FileField(upload_to="page_cache")),
            ],
        ),
    ]
|||
@ -0,0 +1,18 @@ |
|||
# Generated by Django 2.2.1 on 2019-05-17 18:11 |
|||
|
|||
from django.db import migrations |
|||
|
|||
|
|||
class Migration(migrations.Migration):
    """Rename the ``Page`` field ``acess_time`` to ``access_time``."""

    # Must run after the migration that created the Page model.
    dependencies = [("restscrape", "0001_initial")]

    operations = [
        migrations.RenameField(
            model_name="page",
            old_name="acess_time",
            new_name="access_time",
        ),
    ]
|||
@ -1,3 +1,22 @@ |
|||
from django.db import models |
|||
from django.core.files.base import ContentFile |
|||
from urllib.parse import quote_plus |
|||
|
|||
# Create your models here. |
|||
|
|||
|
|||
class Page(models.Model):
    """A stored page, keyed by its URL.

    The page body is kept in a file managed by the ``page_content`` file
    field (uploaded under ``page_cache``); ``access_time`` holds a
    timestamp supplied by the caller.
    """

    url = models.CharField(max_length=300, primary_key=True)
    access_time = models.DateTimeField()
    page_content = models.FileField(upload_to='page_cache')

    @property
    def filename(self):
        """Return the URL encoded with ``quote_plus`` for use as a file name."""
        return quote_plus(self.url)

    def write(self, page_content):
        """Save ``page_content`` into the ``page_content`` file field.

        The content is wrapped in a ``ContentFile`` and stored under
        ``self.filename``.

        NOTE(review): ``FieldFile.save()`` is called with its default
        ``save`` flag, which per Django's documentation also saves the
        model instance -- confirm ``access_time`` is set before calling.
        """
        content = ContentFile(page_content)
        self.page_content.save(name=self.filename, content=content)

    def read(self):
        """Return the entire stored page content.

        The file is opened for each call and closed afterwards, so repeated
        calls always return the full content (reading the field's lazily
        opened handle directly would leave it positioned at end-of-file
        and never close it).
        """
        with self.page_content.open('rb') as stored:
            return stored.read()
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue