|
|
|
@ -1,8 +1,23 @@ |
|
|
|
import lxml.etree |
|
|
|
class scraper:
    """Thin wrapper around an lxml HTML tree for XPath-based scraping."""

    def __init__(self, page_source):
        """Store *page_source*, parsing it into an lxml tree if needed.

        Args:
            page_source: An ``lxml.etree._Element``, or raw HTML
                (str/bytes) to be parsed with ``lxml.etree.HTML``.
        """
        if not isinstance(page_source, lxml.etree._Element):
            # Accept raw markup and parse it exactly once, up front.
            page_source = lxml.etree.HTML(page_source)
        self.page_source = page_source

    def xpath(self, expr):
        """Evaluate the XPath expression *expr* against the parsed tree."""
        return self.page_source.xpath(expr)

    def extract_table(self, table, header_xpath, rows_xpath):
        """Yield one dict per table row, keyed by lower-cased header text.

        Args:
            table: A table element, or an XPath string locating one
                (the first match is used).
            header_xpath: XPath, relative to the table, selecting the
                header row whose ``<th>`` children name the columns.
            rows_xpath: XPath, relative to the table, selecting the
                container whose ``<tr>`` children are the data rows.

        Yields:
            dict mapping header name -> cell text for each ``<tr>`` row.
        """
        if not isinstance(table, lxml.etree._Element):
            # Resolve an XPath string to the first matching element.
            table = self.xpath(table)[0]
        header = table.xpath(header_xpath)[0]
        # NOTE(review): assumes every <th> has non-None .text — a header
        # cell holding only nested markup would raise AttributeError here.
        headers = [element.text.lower() for element in header.findall('th')]
        for row in table.xpath(rows_xpath)[0].findall('tr'):
            # zip truncates to the shorter side, so short rows simply
            # produce fewer keys rather than raising.
            yield dict(zip(headers, (data.text for data in row.findall('td'))))
|
|
|
|
|
|
|
def proxy_scraper(page_source):
    """Yield one dict per row of the ``proxylisttable`` table.

    Args:
        page_source: An ``lxml.etree._Element``, or raw HTML (str/bytes)
            of a page containing ``<table id="proxylisttable">``.

    Yields:
        dict mapping lower-cased header name -> cell text, one per
        ``<tbody>`` row.
    """
    # Bug fix: the original function scraped the table inline AND then
    # delegated to scraper.extract_table, so every row was yielded twice.
    # Delegate once to the scraper class so each row appears exactly once.
    page = scraper(page_source)
    yield from page.extract_table(
        table="//table[@id='proxylisttable']",
        header_xpath="./thead/tr",
        rows_xpath="./tbody",
    )
|
|
|
|