from price_finder import price_finder, BS
import requests
# import requests_html  # only needed by the legacy get_prices_old / render_page path
import sys
from ipaddress import ip_address
from get_link import get_link
def get_proxies(link='https://free-proxy-list.net/', country='United States'):
    # Scrape a free-proxy-list style table and return 'ip:port' strings for
    # HTTPS-capable proxies located in the given country.
    ## ses = requests_html.HTMLSession()
    r = requests.get(link)
    page = BS(r.content, 'lxml')
    table = page.find(id='proxylisttable')
    headers, *rows = table.find_all('tr')
    headers = [tag.text.lower() for tag in headers.find_all('th')]
    # Look up column indices by header text so the scrape survives
    # column reordering.
    ip, port = headers.index('ip address'), headers.index('port')
    https_support = headers.index('https')
    country_id = headers.index('country')
    proxies = []
    for row in rows:
        if row.find('td'):
            tr = [tag.text for tag in row.find_all('td')]
            try:
                try:
                    # Skip rows whose IP or port does not parse.
                    ip_address(tr[ip])
                    assert 0 <= int(tr[port]) < 2**16
                    if tr[https_support] == "yes" and tr[country_id] == country:
                        proxies.append('{}:{}'.format(tr[ip], tr[port]))
                except (ValueError, AssertionError):
                    pass
            except Exception as e:
                print(row)
                raise e
    return proxies
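# A minimal sanity-check sketch for get_proxies (the addresses below are
# made-up documentation IPs; real output depends on the live proxy list):
#
#     >>> get_proxies()
#     ['198.51.100.7:3128', '203.0.113.21:8080', ...]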
class proxy_iter:
    # Iterator that cycles through a pool of proxies, skipping any that have
    # been blacklisted; raises StopIteration once every proxy is blacklisted.
    def __init__(self, proxies):
        self._proxies = set(proxies)
        self.proxies = self._proxies.copy()
        self.bad_proxies = set()
        # self.used_proxies = {}

    def __next__(self):
        self.proxies -= self.bad_proxies
        if len(self.proxies) == 0:
            raise StopIteration
        elem = self.proxies.pop()
        if len(self.proxies) == 0:
            # Pool exhausted: refill from the original set so iteration keeps
            # cycling; blacklisted entries are removed again on the next call.
            self.proxies = self._proxies.copy()
        return elem

    def __iter__(self):
        return self

    def blacklist(self, proxy):
        self.bad_proxies.add(proxy)
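# How proxy_iter is meant to be consumed (a minimal sketch with made-up
# addresses): keep trying proxies until one works, blacklisting failures;
# iteration only stops once every proxy has been blacklisted.
#
#     pool = proxy_iter(['198.51.100.7:3128', '203.0.113.21:8080'])
#     for proxy in pool:
#         try:
#             ...  # attempt a request through `proxy`
#             break
#         except Exception:
#             pool.blacklist(proxy)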
# def render_page(link, proxies, ses):
#     print(link)
#     bad_proxies = set()
#     page = None
#     render_attempts = 0
#     for proxy in proxies:
#         print(proxy)
#         try:
#             r = ses.get(link, proxies={'http': proxy, 'https': proxy})
#             print('got')
#         except (requests.exceptions.ProxyError, requests.exceptions.SSLError):
#             print('!g!' + proxy)
#             bad_proxies.add(proxy)
#             continue
#         if render_attempts < 3:
#             render_attempts += 1
#             try:
#                 r.html.render(timeout=10, sleep=10)
#                 print('rendered')
#             except requests_html.MaxRetries:
#                 print('!r!' + proxy)
#                 bad_proxies.add(proxy)
#                 continue
#         page = r.html.raw_html
#         break
#     if page:
#         return page, {proxy}, bad_proxies
#     else:
#         raise Exception("All proxies used up")
def get_prices(links, use_proxies=True):
    pages = {}
    if use_proxies:
        # Pool proxies from two free lists and rotate through them per link.
        proxies = proxy_iter(get_proxies() + get_proxies('https://www.us-proxy.org/'))
        for link in links:
            for proxy in proxies:
                print(link, proxy)
                try:
                    page = get_link(link, proxy=proxy)
                    pages[link] = page
                    break
                except Exception as e:
                    # Report the failure and move on to the next proxy.
                    print(type(e), e, file=sys.stderr)
                    proxies.blacklist(proxy)
        if len(links) != len(pages):
            raise Exception('all proxies suck')
    else:
        pages = get_link(links)
    ret = []
    for link in links:
        ret.append(price_finder(
            link, bs=BS(pages[link], 'lxml')
        ))
    return ret
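# Proxied usage sketch (hypothetical links; the __main__ block below shows
# the direct, proxyless path):
#
#     links = ['https://example.com/product-1', 'https://example.com/product-2']
#     prices = get_prices(links, use_proxies=True)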
def get_prices_old(links, no_reuse=True, use_proxies=True):
    # Legacy path: requires the commented-out "import requests_html" and the
    # render_page function above to be re-enabled.
    if use_proxies:
        proxies = set(get_proxies() + get_proxies('https://www.us-proxy.org/'))
    ses = requests_html.HTMLSession()
    ret = []
    if use_proxies:
        prev = set()
        bad_proxies_set = set()
    for link in links:
        if use_proxies:
            if no_reuse:
                # Avoid reusing the proxy that served the previous link.
                working_set = proxies - prev
            else:
                working_set = proxies
            page, prev, bad_proxies = render_page(link, working_set, ses)
        else:
            r = ses.get(link)
            r.html.render()
            page = r.html.raw_html
        ret.append(price_finder(link, bs=BS(page, 'lxml')))
        if use_proxies:
            bad_proxies_set |= bad_proxies
            proxies -= bad_proxies
    if use_proxies:
        print(bad_proxies_set)
    ses.close()
    return ret
if __name__ == "__main__":
    # ses = requests_html.HTMLSession()
    # proxies = get_proxies('https://www.us-proxy.org/')
    # page = render_page('https://www.banggood.com/Aomway-Commander-Goggles-V1-2D-3D-40CH-5_8G-FPV-Video-Headset-Support-HDMI-DVR-Headtracker-p-1107684.html?cur_warehouse=CN',
    #                    proxies,
    #                    ses)
    import saveto
    import random
    ql = saveto.load('quad_links')
    random.shuffle(ql)
    products = get_prices(ql, use_proxies=False)
    # pass