# HtmlDownloader.py
# coding:utf-8
import json
import random

import chardet
import requests

import config
from db.DataStore import sqlhelper

__author__ = 'qiye'
  9. class Html_Downloader(object):
  10. @classmethod
  11. def download(self, url):
  12. count = 0 # 重试次数
  13. r = ''
  14. try:
  15. r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT)
  16. r.encoding = chardet.detect(r.content)['encoding']
  17. while count < config.RETRY_TIME:
  18. if (not r.ok) or len(r.content) < 500:
  19. proxylist = sqlhelper.select(10)
  20. proxy = random.choice(proxylist)
  21. ip = proxy[0]
  22. port = proxy[1]
  23. proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
  24. try:
  25. r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
  26. r.encoding = chardet.detect(r.content)['encoding']
  27. count += 1
  28. except Exception as e:
  29. count += 1
  30. else:
  31. return r.text
  32. return None
  33. except Exception as e:
  34. while count < config.RETRY_TIME:
  35. if r == '' or (not r.ok) or len(r.content) < 500:
  36. try:
  37. proxylist = sqlhelper.select(10)
  38. proxy = random.choice(proxylist)
  39. ip = proxy[0]
  40. port = proxy[1]
  41. proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
  42. try:
  43. r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
  44. r.encoding = chardet.detect(r.content)['encoding']
  45. count += 1
  46. except Exception as e:
  47. count += 1
  48. except Exception as e:
  49. return None
  50. else:
  51. return r.text
  52. return None