diff --git a/restscrape/migrations/0004_page_encoding.py b/restscrape/migrations/0004_page_encoding.py
new file mode 100644
index 0000000..3c953e7
--- /dev/null
+++ b/restscrape/migrations/0004_page_encoding.py
@@ -0,0 +1,18 @@
+# Generated by Django 2.2.1 on 2019-05-27 21:00
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('restscrape', '0003_page_content_size'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='page',
+            name='encoding',
+            field=models.CharField(default='utf-8', max_length=15),
+        ),
+    ]
diff --git a/restscrape/models.py b/restscrape/models.py
index 6efb6cf..57d89d2 100644
--- a/restscrape/models.py
+++ b/restscrape/models.py
@@ -10,8 +10,9 @@ import pytz
 
 class Page(models.Model):
     url = models.CharField(max_length=300, primary_key=True)
+    encoding = models.CharField(max_length=15, default="utf-8")
     access_time = models.DateTimeField()
-    page_content = models.FileField(upload_to='page_cache')
+    page_content = models.FileField(upload_to="page_cache")
     content_size = models.PositiveIntegerField()
 
     @property
@@ -19,13 +20,34 @@ class Page(models.Model):
         return quote_plus(self.url)
 
     def write(self, page_content):
+        """Store ``page_content`` in the cache file and refresh the metadata.
+
+        Non-``bytes`` input (expected to be ``str``) is encoded as UTF-8 and
+        ``self.encoding`` is set to match.  ``bytes`` are stored verbatim and
+        ``self.encoding`` is left as is, so the caller must set it beforehand
+        for payloads that are not UTF-8.
+
+        Raises:
+            UnicodeEncodeError: if a ``str`` contains lone surrogates.
+        """
+        if not isinstance(page_content, bytes):
+            # UTF-8 encodes every valid str; the only failure is a lone
+            # surrogate, which UTF-16 rejects as well, so a UTF-16 fallback
+            # cannot help.  Always record the codec actually used so a row
+            # previously marked with another encoding is not decoded wrongly.
+            page_content = page_content.encode("utf-8")
+            self.encoding = "utf-8"
         self.content_size = len(page_content)
         file = ContentFile(page_content)
         self.access_time = datetime.datetime.now(pytz.utc)
         self.page_content.save(name=self.filename, content=file)
 
     def read(self):
-        return self.page_content.read()
+        """Return the cached page decoded with ``self.encoding``."""
+        # Open explicitly so repeated calls start at offset 0 instead of
+        # returning an empty string once the handle has reached EOF.
+        self.page_content.open("rb")
+        raw: bytes = self.page_content.read()
+        return raw.decode(self.encoding)
 
     def delete(self):
         self.page_content.delete()