You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
66 lines
1.8 KiB
66 lines
1.8 KiB
import pyppeteer
|
|
import pyppeteer.errors
|
|
import asyncio
|
|
import os
|
|
|
|
async def _get_link(browser, link, xpath):
    """Load ``link`` in the browser's first tab and return the page HTML.

    Args:
        browser: pyppeteer browser; the first page it reports is reused.
        link: URL to navigate to.
        xpath: mapping with ``'name'`` and ``'price'`` keys whose values are
            the XPath expressions to wait for before the HTML is read.

    Returns:
        The string produced by ``page.content()``, or ``None`` when all 20
        read attempts failed.
    """
    pages = await browser.pages()
    page = pages[0]
    # pyppeteer's ``waitUntil`` accepts 'load', 'domcontentloaded',
    # 'networkidle0' and 'networkidle2'; 'documentloaded' is not among them.
    await page.goto(link, waitUntil='domcontentloaded')

    for selector in (xpath['name'], xpath['price']):
        print(repr(selector))
        try:
            await page.waitForXPath(selector)
        except pyppeteer.errors.TimeoutError:
            # Best effort: a selector that never appears must not abort the
            # fetch, the HTML is returned regardless.
            pass
    # NOTE(review): the original indentation was lost; this pause is assumed
    # to run once after both waits rather than once per selector -- confirm.
    await asyncio.sleep(1)

    webpage = None
    for _ in range(20):
        try:
            webpage = await page.content()
            break
        except Exception:
            # Reading the content can fail transiently; retry after a pause.
            # ``Exception`` (not a bare except) so task cancellation and
            # KeyboardInterrupt are not swallowed by the retry loop.
            await asyncio.sleep(1)
    return webpage
|
|
|
|
async def _single_link(browser, link, xpath):
    """Fetch one page through ``_get_link`` and then close the browser.

    Args:
        browser: pyppeteer browser used for the fetch; closed before returning.
        link: URL to load.
        xpath: value forwarded unchanged to ``_get_link``.

    Returns:
        Whatever ``_get_link`` returned for ``link``.
    """
    html = await _get_link(browser, link, xpath)
    # The browser is only closed on the success path; if the fetch raises,
    # the exception propagates with the browser still open.
    await browser.close()
    return html
|
|
|
|
async def _multi_link(browser, links, xpaths):
    """Fetch several pages one after another, then close the browser.

    Args:
        browser: pyppeteer browser shared by every fetch; closed at the end.
        links: iterable of URLs, visited in order.
        xpaths: mapping indexed by each URL; the looked-up value is forwarded
            to ``_get_link``.

    Returns:
        A dict mapping each URL to the value ``_get_link`` returned for it.
    """
    fetched = {}
    for url in links:
        # Pages are loaded sequentially because every fetch reuses the same
        # browser object.
        fetched[url] = await _get_link(browser, url, xpaths[url])
    await browser.close()
    return fetched
|
|
|
|
def get_link(links, xpaths, headless=False, proxy=None):
    """Launch a browser, fetch one or many pages, and return their HTML.

    Args:
        links: a single URL, or a ``list`` of URLs.
        xpaths: mapping indexed by URL; each value is handed to the fetch
            helpers for that URL.
        headless: forwarded to ``pyppeteer.launch`` as the ``headless`` option.
        proxy: optional proxy address; when truthy it is passed to the browser
            as ``--proxy-server=<proxy>``.

    Returns:
        For a list of links, the dict returned by ``_multi_link``; otherwise
        the value returned by ``_single_link`` for that one link.

    Raises:
        Exception: any error raised while fetching is re-raised after the
            browser has been closed.
    """
    loop = asyncio.get_event_loop()
    run = loop.run_until_complete

    # The extension directory is resolved next to this module and is the only
    # extension the browser is allowed to load.
    ext = os.path.join(os.path.dirname(__file__), 'uBlock0.chromium')
    args = [f'--proxy-server={proxy}'] if proxy else []
    args += [f'--disable-extensions-except={ext}', f'--load-extension={ext}']
    opts = {
        'headless': headless,
        'args': args,
    }

    browser = run(pyppeteer.launch(**opts))
    try:
        if isinstance(links, list):
            return run(_multi_link(browser, links, xpaths))
        return run(_single_link(browser, links, xpaths[links]))
    except Exception:
        # The helpers only close the browser on success, so close it here
        # before propagating the failure.
        run(browser.close())
        raise
|
|
|