commit 1fb73f24796291dbfdd7366dd707bb0a7018132a
Author: Raphael Roberts
Date:   Thu Dec 27 15:15:01 2018 -0600

    added proxy server finder

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3344528
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,118 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# HTML files for testing
+*.html
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d9601af
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+pyppeteer
+requests
+lxml
diff --git a/restscrape/__init__.py b/restscrape/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/restscrape/page_source.py b/restscrape/page_source.py
new file mode 100644
index 0000000..076504d
--- /dev/null
+++ b/restscrape/page_source.py
@@ -0,0 +1,47 @@
+from restscrape.scraper import proxy_scraper
+import requests
+
+US_PROXY_URL = 'https://www.us-proxy.org/'
+
+
+class proxy_iter:
+    '''Like itertools.cycle, but backed by a set under the hood, with a
+    method to remove an item from future iteration (e.g. when a proxy
+    stops working).'''
+
+    def __init__(self, proxy_list):
+        self.proxy_set = set(proxy_list)
+        self.bad = set()
+        self.iterator = iter(self)
+
+    def __iter__(self):
+        # Loop rather than recursing with 'yield from iter(self)': the
+        # recursive version stacked one generator frame per full cycle
+        # and would eventually hit the recursion limit.
+        while True:
+            self.proxy_set -= self.bad
+            if not self.proxy_set:
+                return
+            for proxy in self.proxy_set:
+                if proxy not in self.bad:
+                    yield proxy
+
+    def __next__(self):
+        return next(self.iterator)
+
+    def blacklist(self, proxy):
+        '''Permanently remove a proxy from the rotation.'''
+        self.bad.add(proxy)
+
+    def __len__(self):
+        return len(self.proxy_set - self.bad)
+
+
+def create_proxy_iter(url):
+    '''Create a proxy_iter from a proxy listing page such as US_PROXY_URL.'''
+    resp = requests.get(url)
+    resp.raise_for_status()
+    # Row keys come from the table's <th> text, lowercased, so
+    # 'ip address' and 'port' match the us-proxy.org column headers.
+    return proxy_iter('{ip address}:{port}'.format(**row)
+                      for row in proxy_scraper(resp.text))
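A minimal sketch of how proxy_iter behaves on its own (the addresses below are made up for illustration):

    from restscrape.page_source import proxy_iter

    proxies = proxy_iter(['10.0.0.1:8080', '10.0.0.2:3128'])
    first = next(proxies)         # cycles over the set indefinitely
    proxies.blacklist(first)      # drop a dead proxy from the rotation
    assert len(proxies) == 1      # only the surviving proxy is yielded now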
diff --git a/restscrape/scraper.py b/restscrape/scraper.py
new file mode 100644
index 0000000..454ae6a
--- /dev/null
+++ b/restscrape/scraper.py
@@ -0,0 +1,13 @@
+import lxml.etree
+
+
+def proxy_scraper(page_source):
+    '''Yield one dict per proxy-table row, keyed by the lowercased
+    column headers (e.g. 'ip address', 'port').'''
+    if not isinstance(page_source, lxml.etree._Element):
+        page_source = lxml.etree.HTML(page_source)
+    # us-proxy.org renders its proxy list as a table with this id.
+    proxy_table = page_source.xpath("//table[@id='proxylisttable']")[0]
+    headers = [element.text.lower() for element in proxy_table.xpath('./thead/tr/th')]
+    for row in proxy_table.xpath('./tbody/tr'):
+        yield dict(zip(headers, (data.text for data in row.findall('td'))))
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e69de29
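Putting the pieces together, a minimal end-to-end sketch (it assumes us-proxy.org is reachable and still serves the proxylisttable layout; the target URL and timeout are illustrative):

    import requests
    from restscrape.page_source import create_proxy_iter, US_PROXY_URL

    proxies = create_proxy_iter(US_PROXY_URL)
    for proxy in proxies:
        try:
            resp = requests.get('https://example.com',
                                proxies={'http': 'http://' + proxy,
                                         'https': 'http://' + proxy},
                                timeout=10)
            resp.raise_for_status()
            break                          # found a working proxy
        except requests.RequestException:
            proxies.blacklist(proxy)       # rotate past dead proxies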