|
|
|
@ -2,7 +2,6 @@ import lxml.etree |
|
|
|
|
|
|
|
|
|
|
|
class Scraper: |
|
|
|
|
|
|
|
def __init__(self, page_source): |
|
|
|
if not isinstance(page_source, lxml.etree._Element): |
|
|
|
page_source = lxml.etree.HTML(page_source) |
|
|
|
@ -15,18 +14,18 @@ class Scraper: |
|
|
|
if not isinstance(table, lxml.etree._Element): |
|
|
|
table = self.xpath(table)[0] |
|
|
|
header = table.xpath(header_xpath)[0] |
|
|
|
headers = list(element.text.lower() |
|
|
|
for element in header.findall('th')) |
|
|
|
for row in table.xpath(rows_xpath)[0].findall('tr'): |
|
|
|
yield dict(zip(headers, (data.text for data in row.findall('td')))) |
|
|
|
headers = list(element.text.lower() for element in header.findall("th")) |
|
|
|
for row in table.xpath(rows_xpath)[0].findall("tr"): |
|
|
|
yield dict(zip(headers, (data.text for data in row.findall("td")))) |
|
|
|
|
|
|
|
def label_convert(self, labels, raw_tags=False): |
|
|
|
ret = {} |
|
|
|
for label, xpath in labels.items(): |
|
|
|
res = self.xpath(xpath) |
|
|
|
if raw_tags: |
|
|
|
ret[label] = list(lxml.etree.tostring( |
|
|
|
element, pretty_print=True) for element in res) |
|
|
|
ret[label] = list( |
|
|
|
lxml.etree.tostring(element, pretty_print=True) for element in res |
|
|
|
) |
|
|
|
else: |
|
|
|
ret[label] = list(element.text for element in res) |
|
|
|
|
|
|
|
@ -37,4 +36,6 @@ def proxy_scraper(page_source): |
|
|
|
page = Scraper(page_source) |
|
|
|
yield from page.extract_table( |
|
|
|
table="//table[@id='proxylisttable']", |
|
|
|
header_xpath="./thead/tr", rows_xpath="./tbody") |
|
|
|
header_xpath="./thead/tr", |
|
|
|
rows_xpath="./tbody", |
|
|
|
) |