douban.py 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. # _*_coding:utf8_*_
  2. # douban.py
  3. # 爬取豆瓣小组讨论话题
import time
from urllib import request

from lxml import etree
from selenium import webdriver

from gooseeker import GsExtractor
  8. class PhantomSpider:
  9. def getContent(self, url):
  10. browser = webdriver.PhantomJS(executable_path='C:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe')
  11. browser.get(url)
  12. time.sleep(3)
  13. html = browser.execute_script("return document.documentElement.outerHTML")
  14. output = etree.HTML(html)
  15. return output
  16. def saveContent(self, filepath, content):
  17. file_obj = open(filepath, 'w', encoding='UTF-8')
  18. file_obj.write(content)
  19. file_obj.close()
  20. doubanExtra = GsExtractor()
  21. # 下面这句调用gooseeker的api来设置xslt抓取规则
  22. # 第一个参数是app key,请到GooSeeker会员中心申请
  23. # 第二个参数是规则名,是通过GooSeeker的图形化工具: 谋数台MS 来生成的
  24. doubanExtra.setXsltFromAPI("ffd5273e213036d812ea298922e2627b" , "豆瓣小组讨论话题")
  25. url = "https://www.douban.com/group/haixiuzu/discussion?start="
  26. totalpages = 5
  27. doubanSpider = PhantomSpider()
  28. print("爬取开始")
  29. for pagenumber in range(1 , totalpages):
  30. currenturl = url + str((pagenumber-1)*25)
  31. print("正在爬取", currenturl)
  32. content = doubanSpider.getContent(currenturl)
  33. outputxml = doubanExtra.extract(content)
  34. outputfile = "result" + str(pagenumber) +".xml"
  35. doubanSpider.saveContent(outputfile , str(outputxml))
  36. print("爬取结束")