Browse Source

Fixed style of scraping components and moved them to submodule 'scraping'

master
Raphael Roberts 7 years ago
parent
commit
fdc370f656
  1. 17
      restscrape/cache.py
  2. 34
      restscrape/scraper.py
  3. 38
      restscrape/scraping/__init__.py
  4. 30
      restscrape/scraping/browser.py
  5. 25
      restscrape/scraping/proxy.py
  6. 40
      restscrape/scraping/scraper.py
  7. 0
      restscrape/scraping/uBlock/1p-filters.html
  8. 0
      restscrape/scraping/uBlock/3p-filters.html
  9. 0
      restscrape/scraping/uBlock/LICENSE.txt
  10. 0
      restscrape/scraping/uBlock/_locales/ar/messages.json
  11. 0
      restscrape/scraping/uBlock/_locales/az/messages.json
  12. 0
      restscrape/scraping/uBlock/_locales/bg/messages.json
  13. 0
      restscrape/scraping/uBlock/_locales/bn/messages.json
  14. 0
      restscrape/scraping/uBlock/_locales/ca/messages.json
  15. 0
      restscrape/scraping/uBlock/_locales/cs/messages.json
  16. 0
      restscrape/scraping/uBlock/_locales/cv/messages.json
  17. 0
      restscrape/scraping/uBlock/_locales/da/messages.json
  18. 0
      restscrape/scraping/uBlock/_locales/de/messages.json
  19. 0
      restscrape/scraping/uBlock/_locales/el/messages.json
  20. 0
      restscrape/scraping/uBlock/_locales/en/messages.json
  21. 0
      restscrape/scraping/uBlock/_locales/eo/messages.json
  22. 0
      restscrape/scraping/uBlock/_locales/es/messages.json
  23. 0
      restscrape/scraping/uBlock/_locales/et/messages.json
  24. 0
      restscrape/scraping/uBlock/_locales/eu/messages.json
  25. 0
      restscrape/scraping/uBlock/_locales/fa/messages.json
  26. 0
      restscrape/scraping/uBlock/_locales/fi/messages.json
  27. 0
      restscrape/scraping/uBlock/_locales/fil/messages.json
  28. 0
      restscrape/scraping/uBlock/_locales/fr/messages.json
  29. 0
      restscrape/scraping/uBlock/_locales/fy/messages.json
  30. 0
      restscrape/scraping/uBlock/_locales/gl/messages.json
  31. 0
      restscrape/scraping/uBlock/_locales/he/messages.json
  32. 0
      restscrape/scraping/uBlock/_locales/hi/messages.json
  33. 0
      restscrape/scraping/uBlock/_locales/hr/messages.json
  34. 0
      restscrape/scraping/uBlock/_locales/hu/messages.json
  35. 0
      restscrape/scraping/uBlock/_locales/id/messages.json
  36. 0
      restscrape/scraping/uBlock/_locales/it/messages.json
  37. 0
      restscrape/scraping/uBlock/_locales/ja/messages.json
  38. 0
      restscrape/scraping/uBlock/_locales/ka/messages.json
  39. 0
      restscrape/scraping/uBlock/_locales/kk/messages.json
  40. 0
      restscrape/scraping/uBlock/_locales/kn/messages.json
  41. 0
      restscrape/scraping/uBlock/_locales/ko/messages.json
  42. 0
      restscrape/scraping/uBlock/_locales/lt/messages.json
  43. 0
      restscrape/scraping/uBlock/_locales/lv/messages.json
  44. 0
      restscrape/scraping/uBlock/_locales/ml/messages.json
  45. 0
      restscrape/scraping/uBlock/_locales/mr/messages.json
  46. 0
      restscrape/scraping/uBlock/_locales/ms/messages.json
  47. 0
      restscrape/scraping/uBlock/_locales/nb/messages.json
  48. 0
      restscrape/scraping/uBlock/_locales/nl/messages.json
  49. 0
      restscrape/scraping/uBlock/_locales/no/messages.json
  50. 0
      restscrape/scraping/uBlock/_locales/pl/messages.json
  51. 0
      restscrape/scraping/uBlock/_locales/pt_BR/messages.json
  52. 0
      restscrape/scraping/uBlock/_locales/pt_PT/messages.json
  53. 0
      restscrape/scraping/uBlock/_locales/ro/messages.json
  54. 0
      restscrape/scraping/uBlock/_locales/ru/messages.json
  55. 0
      restscrape/scraping/uBlock/_locales/sk/messages.json
  56. 0
      restscrape/scraping/uBlock/_locales/sl/messages.json
  57. 0
      restscrape/scraping/uBlock/_locales/sq/messages.json
  58. 0
      restscrape/scraping/uBlock/_locales/sr/messages.json
  59. 0
      restscrape/scraping/uBlock/_locales/sv/messages.json
  60. 0
      restscrape/scraping/uBlock/_locales/ta/messages.json
  61. 0
      restscrape/scraping/uBlock/_locales/te/messages.json
  62. 0
      restscrape/scraping/uBlock/_locales/th/messages.json
  63. 0
      restscrape/scraping/uBlock/_locales/tr/messages.json
  64. 0
      restscrape/scraping/uBlock/_locales/uk/messages.json
  65. 0
      restscrape/scraping/uBlock/_locales/vi/messages.json
  66. 0
      restscrape/scraping/uBlock/_locales/zh_CN/messages.json
  67. 0
      restscrape/scraping/uBlock/_locales/zh_TW/messages.json
  68. 0
      restscrape/scraping/uBlock/about.html
  69. 0
      restscrape/scraping/uBlock/advanced-settings.html
  70. 0
      restscrape/scraping/uBlock/asset-viewer.html
  71. 0
      restscrape/scraping/uBlock/assets/assets.json
  72. 0
      restscrape/scraping/uBlock/assets/thirdparties/easylist-downloads.adblockplus.org/easylist.txt
  73. 0
      restscrape/scraping/uBlock/assets/thirdparties/easylist-downloads.adblockplus.org/easyprivacy.txt
  74. 0
      restscrape/scraping/uBlock/assets/thirdparties/mirror1.malwaredomains.com/files/README.md
  75. 0
      restscrape/scraping/uBlock/assets/thirdparties/mirror1.malwaredomains.com/files/justdomains
  76. 0
      restscrape/scraping/uBlock/assets/thirdparties/pgl.yoyo.org/as/README.md
  77. 0
      restscrape/scraping/uBlock/assets/thirdparties/pgl.yoyo.org/as/serverlist
  78. 0
      restscrape/scraping/uBlock/assets/thirdparties/publicsuffix.org/list/effective_tld_names.dat
  79. 0
      restscrape/scraping/uBlock/assets/thirdparties/www.malwaredomainlist.com/hostslist/README.md
  80. 0
      restscrape/scraping/uBlock/assets/thirdparties/www.malwaredomainlist.com/hostslist/hosts.txt
  81. 0
      restscrape/scraping/uBlock/assets/ublock/badware.txt
  82. 0
      restscrape/scraping/uBlock/assets/ublock/experimental.txt
  83. 0
      restscrape/scraping/uBlock/assets/ublock/filters.txt
  84. 0
      restscrape/scraping/uBlock/assets/ublock/privacy.txt
  85. 0
      restscrape/scraping/uBlock/assets/ublock/resource-abuse.txt
  86. 0
      restscrape/scraping/uBlock/assets/ublock/resources.txt
  87. 0
      restscrape/scraping/uBlock/assets/ublock/unbreak.txt
  88. 0
      restscrape/scraping/uBlock/background.html
  89. 0
      restscrape/scraping/uBlock/cloud-ui.html
  90. 0
      restscrape/scraping/uBlock/css/1p-filters.css
  91. 0
      restscrape/scraping/uBlock/css/3p-filters.css
  92. 0
      restscrape/scraping/uBlock/css/advanced-settings.css
  93. 0
      restscrape/scraping/uBlock/css/benchmarks.css
  94. 0
      restscrape/scraping/uBlock/css/cloud-ui.css
  95. 0
      restscrape/scraping/uBlock/css/codemirror.css
  96. 0
      restscrape/scraping/uBlock/css/common.css
  97. 0
      restscrape/scraping/uBlock/css/dashboard-common.css
  98. 0
      restscrape/scraping/uBlock/css/dashboard.css
  99. 0
      restscrape/scraping/uBlock/css/document-blocked.css
  100. 0
      restscrape/scraping/uBlock/css/dyna-rules.css

17
restscrape/cache.py

@ -1,17 +0,0 @@
from datetime import timedelta,datetime
import sqlite3
class cache:
    """SQLite-backed store for page sources keyed by URL.

    Creating an instance connects to the database at ``cache_path`` and
    makes sure the ``pages`` table (``url``, ``page_source``, ``datetime``)
    exists.
    """

    def __init__(self, cache_path="page_cache.db", interval=timedelta(days=1)):
        """Open the cache database and create the ``pages`` table if missing.

        Args:
            cache_path: Database location handed to ``sqlite3.connect``.
            interval: A ``timedelta`` kept on the instance as ``interval``.
                NOTE(review): presumably the maximum age of a cached page
                before it counts as stale - confirm against callers.
        """
        # ``timedelta`` is used directly: the ``datetime`` name in scope is
        # the class, which has no ``timedelta`` attribute.
        self.interval = interval
        self.con = sqlite3.connect(cache_path)
        self.cur = self.con.cursor()
        # The last column must not be followed by a comma; SQLite treats a
        # trailing comma in the column list as a syntax error.
        self.cur.execute('''\
            create table if not exists `pages`(
                `url` text primary key,
                `page_source` text,
                `datetime` datetime
            );
            '''
        )

34
restscrape/scraper.py

@ -1,34 +0,0 @@
import lxml.etree
class scraper:
    """XPath convenience wrapper around a parsed lxml document."""

    def __init__(self, page_source):
        """Store ``page_source``, parsing it as HTML unless it is an element.

        Args:
            page_source: An ``lxml.etree._Element`` or markup accepted by
                ``lxml.etree.HTML``.
        """
        already_parsed = isinstance(page_source, lxml.etree._Element)
        self.page_source = (
            page_source if already_parsed else lxml.etree.HTML(page_source)
        )

    def xpath(self, expr):
        """Evaluate ``expr`` on the stored document and return the result."""
        document = self.page_source
        return document.xpath(expr)

    def extract_table(self, table, header_xpath, rows_xpath):
        """Yield one dict per ``tr`` row, keyed by lower-cased ``th`` text.

        Args:
            table: A table element, or an XPath whose first match is used.
            header_xpath: XPath, relative to the table, whose first match
                holds the ``th`` cells.
            rows_xpath: XPath, relative to the table, whose first match
                holds the ``tr`` rows.
        """
        if isinstance(table, lxml.etree._Element):
            table_element = table
        else:
            table_element = self.xpath(table)[0]
        header_row = table_element.xpath(header_xpath)[0]
        column_names = [cell.text.lower() for cell in header_row.findall('th')]
        body = table_element.xpath(rows_xpath)[0]
        for tr in body.findall('tr'):
            cell_texts = [td.text for td in tr.findall('td')]
            # zip() stops at the shorter of the header and the row.
            yield dict(zip(column_names, cell_texts))

    def label_convert(self, labels, raw_tags = False):
        """Map each label to the results of evaluating its XPath.

        Args:
            labels: Mapping of label -> XPath expression.
            raw_tags: When true, each match is serialized with
                ``lxml.etree.tostring(..., pretty_print=True)``; otherwise
                the match's ``text`` attribute is used.

        Returns:
            A dict of label -> list with one entry per match.
        """
        if raw_tags:
            def render(element):
                return lxml.etree.tostring(element, pretty_print=True)
        else:
            def render(element):
                return element.text
        return {
            label: [render(element) for element in self.xpath(expression)]
            for label, expression in labels.items()
        }
def proxy_scraper(page_source):
    """Yield the rows of the table with id ``proxylisttable`` as dicts.

    ``page_source`` is wrapped in a ``scraper``; the header comes from
    ``./thead/tr`` and the rows from ``./tbody`` of that table.
    """
    parsed = scraper(page_source)
    rows = parsed.extract_table(
        table="//table[@id='proxylisttable']",
        header_xpath="./thead/tr",
        rows_xpath="./tbody",
    )
    yield from rows

restscrape/__init__.py → restscrape/scraping/__init__.py

restscrape/browser.py → restscrape/scraping/browser.py

restscrape/proxy.py → restscrape/scraping/proxy.py

40
restscrape/scraping/scraper.py

@ -0,0 +1,40 @@
import lxml.etree
class scraper:
    """Wrap a parsed lxml tree and offer XPath-based extraction helpers."""

    def __init__(self, page_source):
        """Keep ``page_source`` as an lxml element.

        Anything that is not already an ``lxml.etree._Element`` is passed
        through ``lxml.etree.HTML`` first.
        """
        if isinstance(page_source, lxml.etree._Element):
            self.page_source = page_source
            return
        self.page_source = lxml.etree.HTML(page_source)

    def xpath(self, expr):
        """Return the result of running ``expr`` on the stored tree."""
        tree = self.page_source
        return tree.xpath(expr)

    def extract_table(self, table, header_xpath, rows_xpath):
        """Generate a dict for every ``tr`` row of a table.

        ``table`` is either an element or an XPath (first match is taken).
        Keys are the lower-cased ``th`` texts under the first match of
        ``header_xpath``; values are the ``td`` texts of each ``tr`` under
        the first match of ``rows_xpath``.
        """
        root = table
        if not isinstance(root, lxml.etree._Element):
            root = self.xpath(root)[0]
        head = root.xpath(header_xpath)[0]
        keys = []
        for th in head.findall('th'):
            keys.append(th.text.lower())
        container = root.xpath(rows_xpath)[0]
        for record in container.findall('tr'):
            values = (td.text for td in record.findall('td'))
            yield dict(zip(keys, values))

    def label_convert(self, labels, raw_tags=False):
        """Evaluate each label's XPath and collect the results per label.

        With ``raw_tags`` false the ``text`` of every match is collected;
        otherwise every match is serialized through
        ``lxml.etree.tostring(..., pretty_print=True)``.
        """
        converted = {}
        for name, expression in labels.items():
            matches = self.xpath(expression)
            if not raw_tags:
                converted[name] = [node.text for node in matches]
                continue
            converted[name] = [
                lxml.etree.tostring(node, pretty_print=True)
                for node in matches
            ]
        return converted
def proxy_scraper(page_source):
    """Generate one dict per row of the ``proxylisttable`` table.

    The page is handed to ``scraper`` and the table is read with
    ``extract_table`` using ``./thead/tr`` for the header and ``./tbody``
    for the rows.
    """
    table_query = {
        "table": "//table[@id='proxylisttable']",
        "header_xpath": "./thead/tr",
        "rows_xpath": "./tbody",
    }
    document = scraper(page_source)
    yield from document.extract_table(**table_query)

restscrape/uBlock/1p-filters.html → restscrape/scraping/uBlock/1p-filters.html

restscrape/uBlock/3p-filters.html → restscrape/scraping/uBlock/3p-filters.html

restscrape/uBlock/LICENSE.txt → restscrape/scraping/uBlock/LICENSE.txt

restscrape/uBlock/_locales/ar/messages.json → restscrape/scraping/uBlock/_locales/ar/messages.json

restscrape/uBlock/_locales/az/messages.json → restscrape/scraping/uBlock/_locales/az/messages.json

restscrape/uBlock/_locales/bg/messages.json → restscrape/scraping/uBlock/_locales/bg/messages.json

restscrape/uBlock/_locales/bn/messages.json → restscrape/scraping/uBlock/_locales/bn/messages.json

restscrape/uBlock/_locales/ca/messages.json → restscrape/scraping/uBlock/_locales/ca/messages.json

restscrape/uBlock/_locales/cs/messages.json → restscrape/scraping/uBlock/_locales/cs/messages.json

restscrape/uBlock/_locales/cv/messages.json → restscrape/scraping/uBlock/_locales/cv/messages.json

restscrape/uBlock/_locales/da/messages.json → restscrape/scraping/uBlock/_locales/da/messages.json

restscrape/uBlock/_locales/de/messages.json → restscrape/scraping/uBlock/_locales/de/messages.json

restscrape/uBlock/_locales/el/messages.json → restscrape/scraping/uBlock/_locales/el/messages.json

restscrape/uBlock/_locales/en/messages.json → restscrape/scraping/uBlock/_locales/en/messages.json

restscrape/uBlock/_locales/eo/messages.json → restscrape/scraping/uBlock/_locales/eo/messages.json

restscrape/uBlock/_locales/es/messages.json → restscrape/scraping/uBlock/_locales/es/messages.json

restscrape/uBlock/_locales/et/messages.json → restscrape/scraping/uBlock/_locales/et/messages.json

restscrape/uBlock/_locales/eu/messages.json → restscrape/scraping/uBlock/_locales/eu/messages.json

restscrape/uBlock/_locales/fa/messages.json → restscrape/scraping/uBlock/_locales/fa/messages.json

restscrape/uBlock/_locales/fi/messages.json → restscrape/scraping/uBlock/_locales/fi/messages.json

restscrape/uBlock/_locales/fil/messages.json → restscrape/scraping/uBlock/_locales/fil/messages.json

restscrape/uBlock/_locales/fr/messages.json → restscrape/scraping/uBlock/_locales/fr/messages.json

restscrape/uBlock/_locales/fy/messages.json → restscrape/scraping/uBlock/_locales/fy/messages.json

restscrape/uBlock/_locales/gl/messages.json → restscrape/scraping/uBlock/_locales/gl/messages.json

restscrape/uBlock/_locales/he/messages.json → restscrape/scraping/uBlock/_locales/he/messages.json

restscrape/uBlock/_locales/hi/messages.json → restscrape/scraping/uBlock/_locales/hi/messages.json

restscrape/uBlock/_locales/hr/messages.json → restscrape/scraping/uBlock/_locales/hr/messages.json

restscrape/uBlock/_locales/hu/messages.json → restscrape/scraping/uBlock/_locales/hu/messages.json

restscrape/uBlock/_locales/id/messages.json → restscrape/scraping/uBlock/_locales/id/messages.json

restscrape/uBlock/_locales/it/messages.json → restscrape/scraping/uBlock/_locales/it/messages.json

restscrape/uBlock/_locales/ja/messages.json → restscrape/scraping/uBlock/_locales/ja/messages.json

restscrape/uBlock/_locales/ka/messages.json → restscrape/scraping/uBlock/_locales/ka/messages.json

restscrape/uBlock/_locales/kk/messages.json → restscrape/scraping/uBlock/_locales/kk/messages.json

restscrape/uBlock/_locales/kn/messages.json → restscrape/scraping/uBlock/_locales/kn/messages.json

restscrape/uBlock/_locales/ko/messages.json → restscrape/scraping/uBlock/_locales/ko/messages.json

restscrape/uBlock/_locales/lt/messages.json → restscrape/scraping/uBlock/_locales/lt/messages.json

restscrape/uBlock/_locales/lv/messages.json → restscrape/scraping/uBlock/_locales/lv/messages.json

restscrape/uBlock/_locales/ml/messages.json → restscrape/scraping/uBlock/_locales/ml/messages.json

restscrape/uBlock/_locales/mr/messages.json → restscrape/scraping/uBlock/_locales/mr/messages.json

restscrape/uBlock/_locales/ms/messages.json → restscrape/scraping/uBlock/_locales/ms/messages.json

restscrape/uBlock/_locales/nb/messages.json → restscrape/scraping/uBlock/_locales/nb/messages.json

restscrape/uBlock/_locales/nl/messages.json → restscrape/scraping/uBlock/_locales/nl/messages.json

restscrape/uBlock/_locales/no/messages.json → restscrape/scraping/uBlock/_locales/no/messages.json

restscrape/uBlock/_locales/pl/messages.json → restscrape/scraping/uBlock/_locales/pl/messages.json

restscrape/uBlock/_locales/pt_BR/messages.json → restscrape/scraping/uBlock/_locales/pt_BR/messages.json

restscrape/uBlock/_locales/pt_PT/messages.json → restscrape/scraping/uBlock/_locales/pt_PT/messages.json

restscrape/uBlock/_locales/ro/messages.json → restscrape/scraping/uBlock/_locales/ro/messages.json

restscrape/uBlock/_locales/ru/messages.json → restscrape/scraping/uBlock/_locales/ru/messages.json

restscrape/uBlock/_locales/sk/messages.json → restscrape/scraping/uBlock/_locales/sk/messages.json

restscrape/uBlock/_locales/sl/messages.json → restscrape/scraping/uBlock/_locales/sl/messages.json

restscrape/uBlock/_locales/sq/messages.json → restscrape/scraping/uBlock/_locales/sq/messages.json

restscrape/uBlock/_locales/sr/messages.json → restscrape/scraping/uBlock/_locales/sr/messages.json

restscrape/uBlock/_locales/sv/messages.json → restscrape/scraping/uBlock/_locales/sv/messages.json

restscrape/uBlock/_locales/ta/messages.json → restscrape/scraping/uBlock/_locales/ta/messages.json

restscrape/uBlock/_locales/te/messages.json → restscrape/scraping/uBlock/_locales/te/messages.json

restscrape/uBlock/_locales/th/messages.json → restscrape/scraping/uBlock/_locales/th/messages.json

restscrape/uBlock/_locales/tr/messages.json → restscrape/scraping/uBlock/_locales/tr/messages.json

restscrape/uBlock/_locales/uk/messages.json → restscrape/scraping/uBlock/_locales/uk/messages.json

restscrape/uBlock/_locales/vi/messages.json → restscrape/scraping/uBlock/_locales/vi/messages.json

restscrape/uBlock/_locales/zh_CN/messages.json → restscrape/scraping/uBlock/_locales/zh_CN/messages.json

restscrape/uBlock/_locales/zh_TW/messages.json → restscrape/scraping/uBlock/_locales/zh_TW/messages.json

restscrape/uBlock/about.html → restscrape/scraping/uBlock/about.html

restscrape/uBlock/advanced-settings.html → restscrape/scraping/uBlock/advanced-settings.html

restscrape/uBlock/asset-viewer.html → restscrape/scraping/uBlock/asset-viewer.html

restscrape/uBlock/assets/assets.json → restscrape/scraping/uBlock/assets/assets.json

restscrape/uBlock/assets/thirdparties/easylist-downloads.adblockplus.org/easylist.txt → restscrape/scraping/uBlock/assets/thirdparties/easylist-downloads.adblockplus.org/easylist.txt

restscrape/uBlock/assets/thirdparties/easylist-downloads.adblockplus.org/easyprivacy.txt → restscrape/scraping/uBlock/assets/thirdparties/easylist-downloads.adblockplus.org/easyprivacy.txt

restscrape/uBlock/assets/thirdparties/mirror1.malwaredomains.com/files/README.md → restscrape/scraping/uBlock/assets/thirdparties/mirror1.malwaredomains.com/files/README.md

restscrape/uBlock/assets/thirdparties/mirror1.malwaredomains.com/files/justdomains → restscrape/scraping/uBlock/assets/thirdparties/mirror1.malwaredomains.com/files/justdomains

restscrape/uBlock/assets/thirdparties/pgl.yoyo.org/as/README.md → restscrape/scraping/uBlock/assets/thirdparties/pgl.yoyo.org/as/README.md

restscrape/uBlock/assets/thirdparties/pgl.yoyo.org/as/serverlist → restscrape/scraping/uBlock/assets/thirdparties/pgl.yoyo.org/as/serverlist

restscrape/uBlock/assets/thirdparties/publicsuffix.org/list/effective_tld_names.dat → restscrape/scraping/uBlock/assets/thirdparties/publicsuffix.org/list/effective_tld_names.dat

restscrape/uBlock/assets/thirdparties/www.malwaredomainlist.com/hostslist/README.md → restscrape/scraping/uBlock/assets/thirdparties/www.malwaredomainlist.com/hostslist/README.md

restscrape/uBlock/assets/thirdparties/www.malwaredomainlist.com/hostslist/hosts.txt → restscrape/scraping/uBlock/assets/thirdparties/www.malwaredomainlist.com/hostslist/hosts.txt

restscrape/uBlock/assets/ublock/badware.txt → restscrape/scraping/uBlock/assets/ublock/badware.txt

restscrape/uBlock/assets/ublock/experimental.txt → restscrape/scraping/uBlock/assets/ublock/experimental.txt

restscrape/uBlock/assets/ublock/filters.txt → restscrape/scraping/uBlock/assets/ublock/filters.txt

restscrape/uBlock/assets/ublock/privacy.txt → restscrape/scraping/uBlock/assets/ublock/privacy.txt

restscrape/uBlock/assets/ublock/resource-abuse.txt → restscrape/scraping/uBlock/assets/ublock/resource-abuse.txt

restscrape/uBlock/assets/ublock/resources.txt → restscrape/scraping/uBlock/assets/ublock/resources.txt

restscrape/uBlock/assets/ublock/unbreak.txt → restscrape/scraping/uBlock/assets/ublock/unbreak.txt

restscrape/uBlock/background.html → restscrape/scraping/uBlock/background.html

restscrape/uBlock/cloud-ui.html → restscrape/scraping/uBlock/cloud-ui.html

restscrape/uBlock/css/1p-filters.css → restscrape/scraping/uBlock/css/1p-filters.css

restscrape/uBlock/css/3p-filters.css → restscrape/scraping/uBlock/css/3p-filters.css

restscrape/uBlock/css/advanced-settings.css → restscrape/scraping/uBlock/css/advanced-settings.css

restscrape/uBlock/css/benchmarks.css → restscrape/scraping/uBlock/css/benchmarks.css

restscrape/uBlock/css/cloud-ui.css → restscrape/scraping/uBlock/css/cloud-ui.css

restscrape/uBlock/css/codemirror.css → restscrape/scraping/uBlock/css/codemirror.css

restscrape/uBlock/css/common.css → restscrape/scraping/uBlock/css/common.css

restscrape/uBlock/css/dashboard-common.css → restscrape/scraping/uBlock/css/dashboard-common.css

restscrape/uBlock/css/dashboard.css → restscrape/scraping/uBlock/css/dashboard.css

restscrape/uBlock/css/document-blocked.css → restscrape/scraping/uBlock/css/document-blocked.css

restscrape/uBlock/css/dyna-rules.css → restscrape/scraping/uBlock/css/dyna-rules.css

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save