HtmlDownloader.py 1.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
  1. # coding:utf-8
  2. import random
  3. import config
  4. import json
  5. from db.DataStore import sqlhelper
  6. __author__ = 'qiye'
  7. import requests
  8. import chardet
  9. class Html_Downloader(object):
  10. @staticmethod
  11. def download(url):
  12. try:
  13. r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT)
  14. r.encoding = chardet.detect(r.content)['encoding']
  15. if (not r.ok) or len(r.content) < 500:
  16. raise ConnectionError
  17. else:
  18. return r.text
  19. except Exception:
  20. count = 0 # 重试次数
  21. proxylist = sqlhelper.select(10)
  22. if not proxylist:
  23. return None
  24. while count < config.RETRY_TIME:
  25. try:
  26. proxy = random.choice(proxylist)
  27. ip = proxy[0]
  28. port = proxy[1]
  29. proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
  30. r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)
  31. r.encoding = chardet.detect(r.content)['encoding']
  32. if (not r.ok) or len(r.content) < 500:
  33. raise ConnectionError
  34. else:
  35. return r.text
  36. except Exception:
  37. count += 1
  38. return None