gooseeker.py 1.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
  #!/usr/bin/python
  # -*- coding: utf-8 -*-
  # 模块名: gooseeker
  # 类名: GsExtractor
  # Version: 2.1
  # 说明: html内容提取器
  # 功能: 使用xslt作为模板,快速提取HTML DOM中的内容。
  # released by 集搜客(http://www.gooseeker.com) on May 18, 2016
  # github: https://github.com/FullerHua/jisou/core/gooseeker.py
  10. import time
  11. from urllib import request
  12. from urllib.parse import quote
  13. from lxml import etree
  class GsExtractor(object):
      """HTML content extractor that uses an XSLT stylesheet as its template.

      The stylesheet can be loaded from a file, supplied directly from memory,
      or downloaded through the GooSeeker API. It is then applied to an HTML
      DOM (or to raw HTML source) to produce the extraction result.
      """

      def __init__(self):
          """Create an extractor with an empty stylesheet."""
          self.xslt = ""

      # The original constructor was misspelled ``_init_`` and therefore never
      # ran on instantiation. The old name is kept as an alias so any caller
      # that invoked it explicitly keeps working.
      _init_ = __init__

      def setXsltFromFile(self, xsltFilePath):
          """Read the XSLT stylesheet from a UTF-8 encoded file.

          Args:
              xsltFilePath: Path of the stylesheet file to read.
          """
          # The context manager closes the file even if read() raises.
          with open(xsltFilePath, 'r', encoding='UTF-8') as xslt_file:
              self.xslt = xslt_file.read()

      def setXsltFromMem(self, xsltStr):
          """Use the given in-memory value as the XSLT stylesheet.

          Args:
              xsltStr: The stylesheet content.
          """
          self.xslt = xsltStr

      def setXsltFromAPI(self, APIKey, theme, middle=None, bname=None):
          """Download the XSLT stylesheet through the GooSeeker API.

          Args:
              APIKey: API key, appended to the request URL as given.
              theme: Theme name; URL-quoted before being sent.
              middle: Optional value for the ``middle`` query parameter;
                  URL-quoted and sent only when truthy.
              bname: Optional value for the ``bname`` query parameter;
                  URL-quoted and sent only when truthy.
          """
          apiurl = ("http://www.gooseeker.com/api/getextractor?key=" + APIKey
                    + "&theme=" + quote(theme))
          if middle:
              apiurl = apiurl + "&middle=" + quote(middle)
          if bname:
              apiurl = apiurl + "&bname=" + quote(bname)
          # Close the HTTP response once the body has been read so the
          # connection is not leaked.
          with request.urlopen(apiurl) as apiconn:
              self.xslt = apiconn.read()

      def getXslt(self):
          """Return the stylesheet currently held by the extractor."""
          return self.xslt

      def extract(self, html):
          """Apply the current stylesheet to an HTML DOM object.

          Args:
              html: The HTML DOM object to transform.

          Returns:
              The result of applying the XSLT transform to ``html``.
          """
          # The stylesheet is parsed and compiled on every call.
          xslt_root = etree.XML(self.xslt)
          transform = etree.XSLT(xslt_root)
          return transform(html)

      def extractHTML(self, html):
          """Parse HTML source and apply the current stylesheet to it.

          Args:
              html: The HTML source to parse.

          Returns:
              The result of ``extract`` on the parsed document.
          """
          doc = etree.HTML(html)
          return self.extract(doc)