from price_finder import ParseResult
from lxml import etree
from bs4 import BeautifulSoup as BS
import requests
from urllib.parse import urlparse
# import requests_html
import sys
from ipaddress import ip_address
from get_link import get_link
import json
# Map of site hostname -> XPath expressions used to locate price elements.
with open('xpaths.json') as file:
    xpaths_data = json.load(file)

parser = etree.HTMLParser()

def text2tree(text):
    """Parse an HTML string into an lxml element tree."""
    return etree.fromstring(text, parser)
def get_proxies(link='https://free-proxy-list.net/', country='United States'):
    """Scrape a free proxy list and return 'ip:port' strings for
    HTTPS-capable proxies in the given country."""
##    ses = requests_html.HTMLSession()
    r = requests.get(link)
    page = BS(r.content, 'lxml')
    table = page.find(id='proxylisttable')
    headers, *rows = table.find_all('tr')
    headers = list(tag.text.lower() for tag in headers.find_all('th'))
    # Column indices, looked up by header text so column order doesn't matter.
    ip, port = headers.index('ip address'), headers.index('port')
    https_support = headers.index('https')
    country_id = headers.index('country')
    proxies = []
    for row in rows:
        if row.find('td'):
            tr = list(tag.text for tag in row.find_all('td'))
            try:
                try:
                    # Validate the address and port before accepting the row.
                    ip_address(tr[ip])
                    assert 0 <= int(tr[port]) < 2**16
                    if tr[https_support] == 'yes' and tr[country_id] == country:
                        proxies.append('{}:{}'.format(tr[ip], tr[port]))
                except (ValueError, AssertionError):
                    pass  # malformed row; skip it
            except Exception as e:
                print(row)
                raise e
    return proxies
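# Usage sketch for get_proxies (results depend on the live proxy list at
# request time; 'Germany' below is just a hypothetical filter value):
#
#   us_proxies = get_proxies()
#   de_proxies = get_proxies(country='Germany')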
class proxy_iter:
    """Endless iterator over a pool of proxies that skips blacklisted ones.

    The working set refills from the original pool once it is exhausted,
    so iteration only stops when every proxy has been blacklisted.
    """

    def __init__(self, proxies):
        self._proxies = set(proxies)
        self.proxies = self._proxies.copy()
        self.bad_proxies = set()
        # self.used_proxies = {}

    def __next__(self):
        self.proxies -= self.bad_proxies
        if len(self.proxies) == 0:
            raise StopIteration
        elem = self.proxies.pop()
        if len(self.proxies) == 0:
            # Working set exhausted: refill from the original pool so the
            # next pass can retry proxies that weren't blacklisted.
            self.proxies = self._proxies.copy()
        return elem

    def __iter__(self):
        return self

    def blacklist(self, proxy):
        """Permanently exclude a proxy from future iteration."""
        self.bad_proxies.add(proxy)
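# Rotation behavior in brief (hypothetical addresses):
#
#   pool = proxy_iter(['1.2.3.4:8080', '5.6.7.8:3128'])
#   p = next(pool)          # hands out one proxy from the working set
#   pool.blacklist(p)       # p is never yielded again
#   next(pool)              # yields the other proxy; StopIteration is
#                           # raised only once everything is blacklisted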
def get_prices(links, use_proxies=True):
    """Fetch each link (optionally through rotating proxies) and return a
    list of ParseResult objects, one per link."""
    pages = {}
    # Pick the XPath set for each link by its hostname.
    xpaths = {link: xpaths_data[urlparse(link).netloc] for link in links}
    # print(xpaths)
    if use_proxies:
        proxies = proxy_iter(get_proxies() + get_proxies('https://www.us-proxy.org/'))
        for link in links:
            # Try proxies until one succeeds; blacklist the ones that fail.
            for proxy in proxies:
                print(link, proxy)
                try:
                    page = get_link(link, xpaths, proxy=proxy)
                    pages[link] = page
                    break
                except Exception as e:
                    print(type(e), e, file=sys.stderr)
                    proxies.blacklist(proxy)
        if len(links) != len(pages):
            raise Exception('all proxies suck')
    else:
        pages = get_link(links, xpaths)
    ret = []
    for link in links:
        tree = text2tree(pages[link])
        ret.append(ParseResult(link, tree))
    return ret
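# A minimal usage sketch. The URL below is hypothetical: it assumes
# xpaths.json has an entry for its hostname and that get_link/ParseResult
# behave as used above.
if __name__ == '__main__':
    results = get_prices(
        ['https://example.com/product/123'],  # hypothetical product page
        use_proxies=False,  # skip the proxy pool for a quick local test
    )
    for result in results:
        print(result)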