# HtmlDownloader.py
  1. #coding:utf-8
  2. import random
  3. import config
  4. import json
  5. from db.DataStore import sqlhelper
  6. __author__ = 'qiye'
  7. import requests
  8. import chardet
  9. class Html_Downloader(object):
  10. @classmethod
  11. def download(self,url):
  12. count = 0#重试次数
  13. r=''
  14. try:
  15. r = requests.get(url=url,headers=config.HEADER,timeout=config.TIMEOUT)
  16. r.encoding =chardet.detect(r.content)['encoding']
  17. while count< config.RETRY_TIME:
  18. if (not r.ok) or len(r.content)<500 :
  19. proxylist = sqlhelper.select(10)
  20. proxy = random.choice(proxylist)
  21. ip = proxy[0]
  22. port = proxy[1]
  23. proxies={"http": "http://%s:%s"%(ip,port),"https": "http://%s:%s"%(ip,port)}
  24. try:
  25. r = requests.get(url=url,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
  26. r.encoding =chardet.detect(r.content)['encoding']
  27. count += 1
  28. except Exception as e:
  29. count += 1
  30. else:
  31. return r.text
  32. return None
  33. except Exception as e:
  34. while count< config.RETRY_TIME:
  35. if r==''or (not r.ok) or len(r.content)<500 :
  36. try:
  37. proxylist = sqlhelper.select(10)
  38. proxy = random.choice(proxylist)
  39. ip = proxy[0]
  40. port = proxy[1]
  41. proxies={"http": "http://%s:%s"%(ip,port),"https": "http://%s:%s"%(ip,port)}
  42. try:
  43. r = requests.get(url=url,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
  44. r.encoding =chardet.detect(r.content)['encoding']
  45. count += 1
  46. except Exception as e:
  47. count += 1
  48. except Exception as e:
  49. return None
  50. else:
  51. return r.text
  52. return None