# _*_coding:utf8_*_
# anjuke.py
# Crawl real-estate agent listings from Anjuke (安居客).
from urllib import request

from lxml import etree

from gooseeker import GsExtractor
  7. totalpages = 50
  8. class Spider:
  9. def getContent(self, url):
  10. conn = request.urlopen(url)
  11. output = etree.HTML(conn.read())
  12. return output
  13. def saveContent(self, filepath, content):
  14. file_obj = open(filepath, 'w', encoding='UTF-8')
  15. file_obj.write(content)
  16. file_obj.close()
  17. bbsExtra = GsExtractor()
  18. bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e" , "安居客房产经纪人") # 设置xslt抓取规则,第一个参数是app key,请到会员中心申请
  19. url = "http://shenzhen.anjuke.com/tycoon/nanshan/p"
  20. anjukeSpider = Spider()
  21. print("爬取开始")
  22. for pagenumber in range(1 , totalpages):
  23. currenturl = url + str(pagenumber)
  24. print("正在爬取", currenturl)
  25. content = anjukeSpider.getContent(currenturl)
  26. outputxml = bbsExtra.extract(content)
  27. outputfile = "result" + str(pagenumber) + ".xml"
  28. anjukeSpider.saveContent(outputfile , str(outputxml))
  29. print("爬取结束")