Browse Source

added scraper class to hopefully make things easier

master
Raphael Roberts 7 years ago
parent
commit
65820cfc17
  1. 27
      restscrape/scraper.py

27
restscrape/scraper.py

@ -1,8 +1,23 @@
import lxml.etree
class scraper:
    """Thin wrapper around a parsed HTML document for XPath-based extraction.

    The parsed document is kept on ``self.page_source``.
    """

    def __init__(self, page_source):
        """Store *page_source*, parsing it first when it is not an element.

        Args:
            page_source: Either an ``lxml.etree._Element`` (used as-is) or
                raw markup, which is passed to ``lxml.etree.HTML``.
        """
        if not isinstance(page_source, lxml.etree._Element):
            page_source = lxml.etree.HTML(page_source)
        self.page_source = page_source

    def xpath(self, expr):
        """Evaluate the XPath *expr* against the stored document.

        Returns:
            Whatever ``self.page_source.xpath(expr)`` returns.
        """
        return self.page_source.xpath(expr)

    @staticmethod
    def _first(matches, expr):
        """Return ``matches[0]``; raise a descriptive IndexError when empty.

        The exception type is the same one a bare ``[0]`` would raise, so
        callers catching ``IndexError`` keep working; only the message is
        improved to name the expression that matched nothing.
        """
        try:
            return matches[0]
        except IndexError:
            raise IndexError("XPath %r matched nothing" % (expr,)) from None

    @staticmethod
    def _header_names(header):
        """Return the lower-cased text of each direct ``th`` child of *header*.

        A ``th`` whose ``.text`` is ``None`` (empty cell, or a cell whose
        content starts with a child element) contributes an empty string
        instead of raising ``AttributeError``.
        """
        return [(cell.text or '').lower() for cell in header.findall('th')]

    def extract_table(self, table, header_xpath, rows_xpath):
        """Yield one dict per table row, keyed by lower-cased header text.

        Args:
            table: The table element itself, or an XPath expression that is
                evaluated against the stored document; the first match is
                used.
            header_xpath: XPath, relative to the table, whose first match is
                the element holding the ``th`` header cells.
            rows_xpath: XPath, relative to the table, whose first match is
                the element holding the ``tr`` rows.

        Yields:
            dict: Header name -> ``.text`` of the ``td`` cell in the same
            position. Pairing uses ``zip``, so extra headers or extra cells
            are dropped, and a cell's text may be ``None``.

        Raises:
            IndexError: If any of the three lookups matches nothing. Because
                this is a generator, the error surfaces on first iteration.
        """
        if not isinstance(table, lxml.etree._Element):
            table = self._first(self.xpath(table), table)
        header = self._first(table.xpath(header_xpath), header_xpath)
        headers = self._header_names(header)
        rows_parent = self._first(table.xpath(rows_xpath), rows_xpath)
        for row in rows_parent.findall('tr'):
            yield dict(zip(headers, (data.text for data in row.findall('td'))))
def proxy_scraper(page_source):
    """Yield one dict per row of the ``proxylisttable`` table.

    Args:
        page_source: Raw markup or an already-parsed lxml element; it is
            handed to ``scraper``, which parses it when necessary.

    Yields:
        dict: One mapping per ``tr`` of the table body, as produced by
        ``scraper.extract_table`` (lower-cased header text -> cell text).
    """
    # Delegate entirely to the scraper class. The earlier inline
    # header/row loop is intentionally gone: keeping it next to the
    # delegation would emit every row twice.
    page = scraper(page_source)
    yield from page.extract_table(
        table="//table[@id='proxylisttable']",
        header_xpath="./thead/tr",
        rows_xpath="./tbody",
    )
Loading…
Cancel
Save