You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
34 lines
1.3 KiB
34 lines
1.3 KiB
import lxml.etree
|
|
class scraper:
    """Thin wrapper around an lxml HTML tree with table- and label-extraction helpers."""

    def __init__(self, page_source):
        """Wrap *page_source*, parsing it if it is not already an lxml element.

        page_source -- raw HTML markup (str/bytes) or an ``lxml.etree._Element``.
        """
        if not isinstance(page_source, lxml.etree._Element):
            # Parse raw markup once so every helper shares the same tree.
            page_source = lxml.etree.HTML(page_source)
        self.page_source = page_source

    def xpath(self, expr):
        """Evaluate XPath *expr* against the document and return the match list."""
        return self.page_source.xpath(expr)

    def extract_table(self, table, header_xpath, rows_xpath):
        """Yield one dict per table row, keyed by lower-cased header cell text.

        table        -- an lxml element, or an XPath string locating the table
                        (the first match is used; raises IndexError on no match).
        header_xpath -- XPath, relative to the table, selecting the header row
                        whose ``<th>`` children name the columns.
        rows_xpath   -- XPath, relative to the table, selecting the container
                        whose ``<tr>`` children are the data rows.
        """
        if not isinstance(table, lxml.etree._Element):
            table = self.xpath(table)[0]
        header = table.xpath(header_xpath)[0]
        # Fix: th.text is None when the cell contains nested markup, which
        # crashed the original .text.lower(); itertext() collects all text.
        headers = [''.join(th.itertext()).lower() for th in header.findall('th')]
        for row in table.xpath(rows_xpath)[0].findall('tr'):
            # zip truncates to the shorter of headers/cells, as before.
            yield dict(zip(headers, (td.text for td in row.findall('td'))))

    def label_convert(self, labels, raw_tags=False):
        """Map each label to the results of its XPath expression.

        labels   -- mapping of label -> XPath expression.
        raw_tags -- when True, values are pretty-printed markup (bytes) for each
                    matched element; otherwise the elements' ``.text`` values.

        Returns a dict with the same keys as *labels*.
        """
        ret = {}
        for label, xpath in labels.items():
            res = self.xpath(xpath)
            if raw_tags:
                ret[label] = [lxml.etree.tostring(element, pretty_print=True)
                              for element in res]
            else:
                # .text is None for elements with no direct text content.
                ret[label] = [element.text for element in res]
        return ret
|
|
|
|
def proxy_scraper(page_source):
    """Yield one dict per proxy entry found in the proxy-list table of *page_source*."""
    doc = scraper(page_source)
    rows = doc.extract_table(
        table="//table[@id='proxylisttable']",
        header_xpath="./thead/tr",
        rows_xpath="./tbody",
    )
    yield from rows
|
|
|