
Blackened codebase

master · Raphael Roberts · 7 years ago · commit de2cbb46b7
5 changed files:

  1. restscrape/apps.py (2 changes)
  2. restscrape/scraping/__init__.py (14 changes)
  3. restscrape/scraping/proxy.py (12 changes)
  4. restscrape/scraping/scraper.py (17 changes)
  5. setup.py (4 changes)
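
Every hunk below is mechanical: the tree was run through the Black formatter, so behavior is unchanged while quotes are normalized to double quotes (docstrings included) and signatures and call sites are reflowed to fit Black's default 88-column limit. A minimal sketch of that reflow, assuming the black package is installed and using its format_str entry point:

# Reproduces the signature collapse seen in restscrape/scraping/__init__.py.
# Assumes `pip install black`; format_str applies the same rules as the CLI.
import black

SRC = """\
def scrape(
    url,
    labels,
    proxy_iter=None,
    wait_for=0,
    max_tries=4,
    raw_tags=True
):
    pass
"""

print(black.format_str(SRC, mode=black.FileMode()), end="")
# def scrape(url, labels, proxy_iter=None, wait_for=0, max_tries=4, raw_tags=True):
#     pass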

restscrape/apps.py (2 changes)

@@ -2,4 +2,4 @@ from django.apps import AppConfig


 class RestscrapeConfig(AppConfig):
-    name = 'restscrape'
+    name = "restscrape"

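RestscrapeConfig above is a stock Django AppConfig; only the quote style changed. For context, a hypothetical settings.py excerpt showing how such a config is typically registered (not part of this commit):

# Hypothetical settings.py excerpt, not part of this diff.
INSTALLED_APPS = [
    "django.contrib.contenttypes",
    # Older Django needs the full dotted path; newer versions resolve
    # "restscrape" to restscrape.apps.RestscrapeConfig automatically.
    "restscrape.apps.RestscrapeConfig",
]
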
restscrape/scraping/__init__.py (14 changes)

@@ -7,14 +7,7 @@ from restscrape.scraping.proxy import ProxyIter, create_proxy_iter
 US_PROXY_ITER = create_proxy_iter()


-def scrape(
-    url,
-    labels,
-    proxy_iter=None,
-    wait_for=0,
-    max_tries=4,
-    raw_tags=True
-):
+def scrape(url, labels, proxy_iter=None, wait_for=0, max_tries=4, raw_tags=True):
     browser = Browser(headless=False)
     if proxy_iter is not None:
         for trial in range(max_tries):
@@ -47,5 +40,6 @@ def scrape(


 if __name__ == "__main__":
-    ret, browser = scrape('http://rlbrhost.ddns.net/',
-                          {'links': '//a'}, wait_for=10, raw_tags=False)
+    ret, browser = scrape(
+        "http://rlbrhost.ddns.net/", {"links": "//a"}, wait_for=10, raw_tags=False
+    )
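
Taken together with the __main__ block, scrape takes a URL plus a dict mapping labels to XPath expressions and returns the extracted results along with the browser. A hedged usage sketch; the proxy_iter argument, the result shape, and the cleanup call are assumptions, not code from this commit:

# Illustrative call, assuming the package is importable. raw_tags=False
# presumably routes through Scraper.label_convert and yields element text.
from restscrape.scraping import US_PROXY_ITER, scrape

ret, browser = scrape(
    "http://rlbrhost.ddns.net/",
    {"links": "//a"},
    proxy_iter=US_PROXY_ITER,  # rotate through the scraped US proxies
    wait_for=10,               # same value as the __main__ demo above
    raw_tags=False,
)
print(ret["links"])  # assumption: dict keyed by label, values are text lists
browser.quit()       # assumption: the returned browser needs explicit cleanup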

restscrape/scraping/proxy.py (12 changes)

@@ -2,13 +2,13 @@ import requests
 from restscrape.scraping.scraper import proxy_scraper


-US_PROXY_URL = 'https://www.us-proxy.org/'
+US_PROXY_URL = "https://www.us-proxy.org/"


 class ProxyIter:
-    '''Like itertools.cycle but
+    """Like itertools.cycle but
 uses a set underneath the hood and adds a method to remove an item from
-iteration (if proxy doesn't work etc)'''
+iteration (if proxy doesn't work etc)"""

     def __init__(self, proxy_list):
         self.proxy_set = set(proxy_list)
@@ -34,9 +34,9 @@ iteration (if proxy doesn't work etc)'''


 def create_proxy_iter(url=US_PROXY_URL):
-    '''Create a proxy_iter from proxy_webpage'''
+    """Create a proxy_iter from proxy_webpage"""
     resp = requests.get(url)
     resp.raise_for_status()
     return ProxyIter(
-        '{ip address}:{port}'.format(**row)
-        for row in proxy_scraper(resp.text))
+        "{ip address}:{port}".format(**row) for row in proxy_scraper(resp.text)
+    )
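
The docstring pins down ProxyIter's contract: cycle like itertools.cycle, but back it with a set and allow dropping a proxy that has gone bad. The class body is mostly outside this hunk, so here is a minimal sketch of that contract built on the same __init__ seen above; everything beyond __init__ is an assumption, not the repository's code:

class ProxyIterSketch:
    """Cycle over a set of proxies; remove() drops dead ones mid-iteration."""

    def __init__(self, proxy_list):
        self.proxy_set = set(proxy_list)        # as in the diff above
        self._pass = iter(tuple(self.proxy_set))

    def __iter__(self):
        return self

    def __next__(self):
        if not self.proxy_set:
            raise StopIteration  # every proxy was removed
        for proxy in self._pass:
            if proxy in self.proxy_set:  # skip proxies removed this pass
                return proxy
        self._pass = iter(tuple(self.proxy_set))  # start the next pass
        return next(self)

    def remove(self, proxy):
        self.proxy_set.discard(proxy)  # e.g. after a connection failure

A caller would invoke remove() when a request through a proxy fails, matching the "if proxy doesn't work" clause in the docstring.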

restscrape/scraping/scraper.py (17 changes)

@@ -2,7 +2,6 @@ import lxml.etree


 class Scraper:
-
     def __init__(self, page_source):
         if not isinstance(page_source, lxml.etree._Element):
             page_source = lxml.etree.HTML(page_source)
@@ -15,18 +14,18 @@ class Scraper:
         if not isinstance(table, lxml.etree._Element):
             table = self.xpath(table)[0]
         header = table.xpath(header_xpath)[0]
-        headers = list(element.text.lower()
-                       for element in header.findall('th'))
-        for row in table.xpath(rows_xpath)[0].findall('tr'):
-            yield dict(zip(headers, (data.text for data in row.findall('td'))))
+        headers = list(element.text.lower() for element in header.findall("th"))
+        for row in table.xpath(rows_xpath)[0].findall("tr"):
+            yield dict(zip(headers, (data.text for data in row.findall("td"))))

     def label_convert(self, labels, raw_tags=False):
         ret = {}
         for label, xpath in labels.items():
             res = self.xpath(xpath)
             if raw_tags:
-                ret[label] = list(lxml.etree.tostring(
-                    element, pretty_print=True) for element in res)
+                ret[label] = list(
+                    lxml.etree.tostring(element, pretty_print=True) for element in res
+                )
             else:
                 ret[label] = list(element.text for element in res)
@@ -37,4 +36,6 @@ def proxy_scraper(page_source):
     page = Scraper(page_source)
     yield from page.extract_table(
         table="//table[@id='proxylisttable']",
-        header_xpath="./thead/tr", rows_xpath="./tbody")
+        header_xpath="./thead/tr",
+        rows_xpath="./tbody",
+    )
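
To make extract_table concrete: each tbody row becomes a dict keyed by the lowercased th texts, which is exactly what create_proxy_iter's "{ip address}:{port}" format string relies on. A runnable sketch against a made-up table that mirrors the hard-coded proxylisttable id:

# Toy input, assuming the restscrape package is importable; the HTML is
# invented but matches the XPaths hard-coded in proxy_scraper above.
from restscrape.scraping.scraper import proxy_scraper

HTML = """
<table id="proxylisttable">
  <thead><tr><th>IP Address</th><th>Port</th></tr></thead>
  <tbody><tr><td>203.0.113.7</td><td>8080</td></tr></tbody>
</table>
"""

for row in proxy_scraper(HTML):
    print(row)
# {'ip address': '203.0.113.7', 'port': '8080'}
# create_proxy_iter then renders this row as "203.0.113.7:8080"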

setup.py (4 changes)

@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages

-with open('requirements.txt') as file:
-    INSTALL_REQUIRES = file.read().rstrip().split('\n')
+with open("requirements.txt") as file:
+    INSTALL_REQUIRES = file.read().rstrip().split("\n")

 setup(
     author="Raphael Roberts",
