commit 1fb73f2479
6 changed files with 161 additions and 0 deletions
.gitignore                  +118
requirements.txt              +3
restscrape/__init__.py        +0
restscrape/page_source.py    +32
restscrape/scraper.py         +8
setup.py                      +0
.gitignore
@@ -0,0 +1,118 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# HTML files for testing
*.html
requirements.txt
@@ -0,0 +1,3 @@
pyppeteer
requests
lxml
restscrape/page_source.py
@@ -0,0 +1,32 @@
from restscrape.scraper import proxy_scraper
import pyppeteer
import requests
US_PROXY_URL = 'https://www.us-proxy.org/'
class proxy_iter:
    '''Like itertools.cycle but uses a set underneath the hood and adds a method to remove an item from iteration (if proxy doesn't work etc)'''
    def __init__(self,proxy_list):
        self.proxy_set = set(proxy_list)
        self.bad = set()
        self.iterator = iter(self)

    def __iter__(self):
        for proxy in self.proxy_set:
            if not proxy in self.bad:
                yield proxy
        self.proxy_set -= self.bad
        if len(self.proxy_set) > 0:
            yield from iter(self)

    def __next__(self):
        return next(self.iterator)

    def blacklist(self,proxy):
        self.bad.add(proxy)

    def __len__(self):
        return len(self.proxy_set - self.bad)
def create_proxy_iter(url):
    '''Create a proxy_iter from proxy_webpage'''
    resp = requests.get(url)
    resp.raise_for_status()
    return proxy_iter('{ip address}:{port}'.format(**row) for row in proxy_scraper(resp.text))
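A possible way to use this module (not part of the commit, just an illustrative sketch): build a proxy_iter from the us-proxy.org listing, route a test request through each candidate via requests' proxies parameter, and blacklist the ones that fail. The test URL (httpbin.org/ip) and the timeout are arbitrary choices for the example.

# Illustrative usage sketch, not part of the commit.
import requests
from restscrape.page_source import US_PROXY_URL, create_proxy_iter

proxies = create_proxy_iter(US_PROXY_URL)   # proxy_iter of 'ip:port' strings
for proxy in proxies:
    try:
        # Route a test request through the candidate proxy.
        resp = requests.get(
            'https://httpbin.org/ip',
            proxies={'http': f'http://{proxy}', 'https': f'http://{proxy}'},
            timeout=5,
        )
        resp.raise_for_status()
        print('working proxy:', proxy)
        break
    except requests.RequestException:
        proxies.blacklist(proxy)            # skip it on future cycles

Because blacklisted entries are removed before the iterator recycles, the loop simply ends once every proxy has failed.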
restscrape/scraper.py
@@ -0,0 +1,8 @@
import lxml.etree
def proxy_scraper(page_source):
    if not isinstance(page_source,lxml.etree._Element):
        page_source = lxml.etree.HTML(page_source)
    proxy_table = page_source.xpath("//table[@id='proxylisttable']")[0]
    headers = list(element.text.lower() for element in proxy_table.xpath('./thead/tr/th'))
    for row in proxy_table.xpath('./tbody/tr'):
        yield dict(zip(headers,(data.text for data in row.findall('td'))))
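For reference, here is a minimal sketch (not part of the commit) of what proxy_scraper yields when fed a table in the same shape as the us-proxy.org listing; the sample HTML and the addresses in it are made up for illustration.

# Illustrative usage sketch, not part of the commit.
from restscrape.scraper import proxy_scraper

sample = '''
<table id="proxylisttable">
  <thead><tr><th>IP Address</th><th>Port</th></tr></thead>
  <tbody>
    <tr><td>203.0.113.1</td><td>8080</td></tr>
    <tr><td>198.51.100.2</td><td>3128</td></tr>
  </tbody>
</table>
'''

for row in proxy_scraper(sample):
    print(row)   # e.g. {'ip address': '203.0.113.1', 'port': '8080'}

The header cells are lowercased, which is why create_proxy_iter can format each row with '{ip address}:{port}'.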