
added proxy server finder

master · Raphael Roberts committed 7 years ago · commit 1fb73f2479
6 changed files, 161 additions:

  .gitignore                 +118
  requirements.txt             +3
  restscrape/__init__.py       +0
  restscrape/page_source.py   +32
  restscrape/scraper.py        +8
  setup.py                     +0

.gitignore  (new file, 118 lines)

@@ -0,0 +1,118 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# HTML files for testing
*.html

requirements.txt  (new file, 3 lines)

@@ -0,0 +1,3 @@
pyppeteer
requests
lxml
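
pyppeteer is pinned here and imported by page_source.py below, but nothing in this commit calls it; presumably it is reserved for fetching the rendered source of JavaScript-heavy pages, which requests alone cannot do. A minimal sketch of that idea using standard pyppeteer calls (hypothetical helper, not part of the commit):

import asyncio
from pyppeteer import launch

async def fetch_rendered_source(url):
    # Launch headless Chromium, let the page's JavaScript run,
    # and return the fully rendered HTML.
    browser = await launch()
    page = await browser.newPage()
    await page.goto(url)
    html = await page.content()
    await browser.close()
    return html

# html = asyncio.get_event_loop().run_until_complete(
#     fetch_rendered_source('https://www.us-proxy.org/'))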

restscrape/__init__.py  (new empty file)

restscrape/page_source.py  (new file, 32 lines)

@@ -0,0 +1,32 @@
from restscrape.scraper import proxy_scraper
import pyppeteer
import requests

US_PROXY_URL = 'https://www.us-proxy.org/'

class proxy_iter:
    '''Like itertools.cycle, but backed by a set under the hood, with a
    method to remove an item from iteration (e.g. when a proxy stops
    working).'''

    def __init__(self, proxy_list):
        self.proxy_set = set(proxy_list)
        self.bad = set()                 # proxies flagged as unusable
        self.iterator = iter(self)

    def __iter__(self):
        # One pass over every proxy not yet blacklisted...
        for proxy in self.proxy_set:
            if proxy not in self.bad:
                yield proxy
        # ...then drop the blacklisted ones and cycle again
        # while any usable proxies remain.
        self.proxy_set -= self.bad
        if len(self.proxy_set) > 0:
            yield from iter(self)

    def __next__(self):
        return next(self.iterator)

    def blacklist(self, proxy):
        '''Exclude a proxy from all further iteration.'''
        self.bad.add(proxy)

    def __len__(self):
        return len(self.proxy_set - self.bad)

def create_proxy_iter(url):
    '''Create a proxy_iter from a proxy listing webpage.'''
    resp = requests.get(url)
    resp.raise_for_status()
    # Keys such as 'ip address' are the lowercased <th> texts
    # produced by proxy_scraper in scraper.py.
    return proxy_iter('{ip address}:{port}'.format(**row)
                      for row in proxy_scraper(resp.text))
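
A quick usage sketch for the pieces above (hypothetical session; the address shown is a placeholder, and actual values depend on the live page):

proxies = create_proxy_iter(US_PROXY_URL)   # fetch and parse the live list
proxy = next(proxies)                       # e.g. '12.34.56.78:8080'
# ...if a request routed through this proxy fails:
proxies.blacklist(proxy)                    # never yielded again
print(len(proxies), 'proxies left')         # iteration cycles over the rest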

restscrape/scraper.py  (new file, 8 lines)

@@ -0,0 +1,8 @@
import lxml.etree

def proxy_scraper(page_source):
    '''Yield one dict per proxy-table row, keyed by the lowercased
    column headers.'''
    if not isinstance(page_source, lxml.etree._Element):
        page_source = lxml.etree.HTML(page_source)
    proxy_table = page_source.xpath("//table[@id='proxylisttable']")[0]
    headers = [element.text.lower() for element in proxy_table.xpath('./thead/tr/th')]
    for row in proxy_table.xpath('./tbody/tr'):
        yield dict(zip(headers, (data.text for data in row.findall('td'))))
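
To illustrate the shape of the output, here is proxy_scraper run against a hand-written stand-in for the us-proxy.org table (the live table carries more columns than this two-column sample):

sample = '''
<table id="proxylisttable">
  <thead><tr><th>IP Address</th><th>Port</th></tr></thead>
  <tbody><tr><td>1.2.3.4</td><td>8080</td></tr></tbody>
</table>'''

list(proxy_scraper(sample))
# -> [{'ip address': '1.2.3.4', 'port': '8080'}]
# page_source.py then joins each row into '1.2.3.4:8080'.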

setup.py  (new empty file)
