From 65820cfc17b5dadbf2a2a3c51bfa5b146e434852 Mon Sep 17 00:00:00 2001
From: Raphael Roberts
Date: Sat, 29 Dec 2018 12:49:30 -0600
Subject: [PATCH] added scraper class to hopefully make things easier

---
Review notes (this section is ignored by git-am):
  * Indentation of the context and removed lines is assumed to be four
    spaces -- TODO confirm against blob 454ae6a, or apply with
    --ignore-whitespace.
  * The post-image blob id on the index line predates the review edits.
  * extract_table() now tolerates <th> cells whose .text is None.

 restscrape/scraper.py | 45 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/restscrape/scraper.py b/restscrape/scraper.py
index 454ae6a..af63fa6 100644
--- a/restscrape/scraper.py
+++ b/restscrape/scraper.py
@@ -1,8 +1,41 @@
 import lxml.etree
+
+
+class scraper:
+    """Wrap a parsed lxml tree and expose XPath / table-extraction helpers."""
+
+    def __init__(self, page_source):
+        """Store *page_source*, parsing it as HTML unless already an element."""
+        if not isinstance(page_source, lxml.etree._Element):
+            page_source = lxml.etree.HTML(page_source)
+        self.page_source = page_source
+
+    def xpath(self, expr):
+        """Evaluate the XPath expression *expr* against the wrapped tree."""
+        return self.page_source.xpath(expr)
+
+    def extract_table(self, table, header_xpath, rows_xpath):
+        """Yield one dict per row of *table*, keyed by lower-cased header text.
+
+        table may be an lxml element or an XPath string (first match is used).
+        header_xpath and rows_xpath are evaluated relative to the table; the
+        first match of each must directly contain the <th> / <tr> children.
+        Raises IndexError when any of the three lookups matches nothing.
+        """
+        if not isinstance(table, lxml.etree._Element):
+            table = self.xpath(table)[0]
+        header = table.xpath(header_xpath)[0]
+        # A <th> without direct text has .text == None; treat it as ''.
+        headers = [(cell.text or '').lower() for cell in header.findall('th')]
+        for row in table.xpath(rows_xpath)[0].findall('tr'):
+            yield dict(zip(headers, (cell.text for cell in row.findall('td'))))
+
+
 def proxy_scraper(page_source):
-    if not isinstance(page_source,lxml.etree._Element):
-        page_source = lxml.etree.HTML(page_source)
-    proxy_table = page_source.xpath("//table[@id='proxylisttable']")[0]
-    headers = list(element.text.lower() for element in proxy_table.xpath('./thead/tr/th'))
-    for row in proxy_table.xpath('./tbody/tr'):
-        yield dict(zip(headers,(data.text for data in row.findall('td'))))
\ No newline at end of file
+    """Yield the rows of the page's 'proxylisttable' table as dicts."""
+    page = scraper(page_source)
+    yield from page.extract_table(
+        table="//table[@id='proxylisttable']",
+        header_xpath="./thead/tr",
+        rows_xpath="./tbody",
+    )