6 changed files with 226 additions and 45 deletions
-
2.gitignore
-
161batch_process.py
-
50get_link.py
-
6price_finder.py
-
21proxy_class.py
-
23xpaths.json
@ -1,2 +1,4 @@ |
|||||
__pycache__ |
__pycache__ |
||||
.gitignore |
.gitignore |
||||
|
/uBlock0.chromium |
||||
|
/bg.html |
||||
@ -0,0 +1,50 @@ |
|||||
|
import pyppeteer |
||||
|
import asyncio |
||||
|
import os |
||||
|
async def _get_link(browser, link):
    """Navigate the browser's first tab to *link* and return its HTML.

    The page content is polled for up to 20 attempts, one second apart,
    because ``page.content()`` can raise while the page is still
    rendering/navigating.

    Parameters
    ----------
    browser : pyppeteer Browser
        An already-launched browser instance.
    link : str
        URL to load.

    Returns
    -------
    str or None
        The page HTML, or None if the content never became readable
        within the retry budget.
    """
    pages = await browser.pages()
    page = pages[0]
    await page.goto(link, timeout=60_000)
    webpage = None
    for _ in range(20):
        try:
            webpage = await page.content()
            break
        except Exception:
            # Bug fix: the original called time.sleep(1) — `time` was never
            # imported (NameError on first retry) and a blocking sleep would
            # stall the event loop.  Use the non-blocking asyncio sleep.
            await asyncio.sleep(1)
    return webpage
||||
|
|
||||
|
async def _single_link(browser, link):
    # Grab the rendered HTML for one URL, then release the browser.
    # Note: the browser is only closed on the success path; the caller
    # (get_link) handles cleanup when an exception propagates.
    html = await _get_link(browser, link)
    await browser.close()
    return html
||||
|
|
||||
|
async def _multi_link(browser, links):
    # Fetch each URL in order and map it to its rendered HTML, then
    # release the browser.  The async dict comprehension awaits each
    # fetch sequentially, exactly like the equivalent for-loop.
    fetched = {url: await _get_link(browser, url) for url in links}
    await browser.close()
    return fetched
||||
|
|
||||
|
def get_link(links, headless=True, proxy=None):
    """Fetch fully rendered page HTML via a pyppeteer-driven Chromium.

    Parameters
    ----------
    links : str or list of str
        A single URL, or a list of URLs.
    headless : bool
        Run the browser without a visible window (default True).
    proxy : str, optional
        Proxy server address passed to Chromium as ``--proxy-server``.

    Returns
    -------
    str or dict
        For a single URL: its HTML (or None if it never rendered).
        For a list: a dict mapping each URL to its HTML.
    """
    # Chromium is launched with the bundled uBlock Origin extension
    # (shipped next to this file) to cut down ad traffic while scraping.
    ext = os.path.join(os.path.dirname(__file__), 'uBlock0.chromium')
    loop = asyncio.get_event_loop()
    run = loop.run_until_complete

    opts = {
        'headless': headless,
        'args': [f'--disable-extensions-except={ext}', f'--load-extension={ext}'],
    }
    if proxy:
        opts['args'].append(f'--proxy-server={proxy}')

    browser = run(pyppeteer.launch(**opts))
    try:
        if isinstance(links, list):
            return run(_multi_link(browser, links))
        return run(_single_link(browser, links))
    except Exception:
        # The helpers close the browser on success; make sure it is also
        # closed on failure, then re-raise.  Bug fix: bare `raise` instead
        # of `raise e`, preserving the original traceback.
        run(browser.close())
        raise
||||
@ -0,0 +1,21 @@ |
|||||
|
class proxy_iter:
    """Endlessly cycling iterator over a pool of proxy addresses.

    Proxies reported through :meth:`blacklist` are dropped from rotation
    before each draw.  When every proxy has been blacklisted, iteration
    stops with ``StopIteration``.
    """

    def __init__(self, proxies):
        self._proxies = set(proxies)         # master pool, never mutated
        self.proxies = self._proxies.copy()  # proxies left in the current pass
        self.bad_proxies = set()             # proxies reported as broken

    def __iter__(self):
        return self

    def __next__(self):
        # Purge anything blacklisted since the last draw.
        self.proxies.difference_update(self.bad_proxies)
        if not self.proxies:
            raise StopIteration
        proxy = self.proxies.pop()
        if not self.proxies:
            # Current pass exhausted — restart the rotation from the
            # master pool (bad entries are purged again on the next draw).
            self.proxies = set(self._proxies)
        return proxy

    def blacklist(self, proxy):
        # Mark *proxy* as broken; it will never be returned again.
        self.bad_proxies.add(proxy)
||||
@ -0,0 +1,23 @@ |
|||||
|
{ |
||||
|
"www.banggood.com": { |
||||
|
"name": "//h1[@itemprop='name']", |
||||
|
"price": "//div[@class='now']" |
||||
|
}, |
||||
|
"www.gearbest.com": { |
||||
|
"name": "//h1[@class='goodsIntro_title']", |
||||
|
"price": "//span[contains(@class,'goodsIntro_price')]", |
||||
|
"other": "//div[@class='goodsIntro_noticeSubmit']" |
||||
|
}, |
||||
|
"www.amazon.com": { |
||||
|
"name": "//span[@id='priceblock_dealprice' or @id='priceblock_ourprice']", |
||||
|
"price": "//span[@id='productTitle']" |
||||
|
}, |
||||
|
"www.getfpv.com": { |
||||
|
"name": "//div[@class='product-name']/span", |
||||
|
"price": "//div[@class='price-box']/p[@class='special-price']/span[@class='price'] | //div[@class='price-box']/span[@class='regular-price']/span" |
||||
|
}, |
||||
|
"www.dalprops.com": { |
||||
|
"name": "//h1[@itemprop='name']", |
||||
|
"price": "//*[@id='product-price']" |
||||
|
} |
||||
|
} |
||||
Write
Preview
Loading…
Cancel
Save
Reference in new issue