api.py 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. import copy
  4. import re
  5. import requests
  6. from img_util import get_num
  7. UserAgent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, '
  8. 'like Gecko) Chrome/32.0.1700.76 Safari/537.36')
  9. HUMAN_HEADERS = {
  10. 'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
  11. 'image/webp,*/*;q=0.8'),
  12. 'User-Agent': UserAgent,
  13. 'Accept-Encoding': 'gzip,deflate,sdch'
  14. }
  15. class Api(object):
  16. def __init__(self, cookie):
  17. self.headers = copy.deepcopy(HUMAN_HEADERS)
  18. self.headers.update({'Cookie': cookie})
  19. def get_all_index_html(self, all_index_url):
  20. r = requests.get(all_index_url, headers=self.headers)
  21. return r.json()
  22. def get_index_show_html(self, index_show_url):
  23. r = requests.get(index_show_url, headers=self.headers)
  24. content = r.json()['data']['code'][0]
  25. img_url = re.findall('(?is)"(/Interface/IndexShow/img/[^"]*?)"', content)
  26. img_url = "http://index.baidu.com%s" % img_url[0]
  27. regex = ('(?is)<span class="imgval" style="width:(\d+)px;">'
  28. '<div class="imgtxt" style="margin-left:-(\d+)px;">')
  29. result = re.findall(regex, content)
  30. skip_info = result if result else list()
  31. return img_url, skip_info
  32. def get_value_from_url(self, img_url, index_skip_info):
  33. r = requests.get(img_url, headers=self.headers)
  34. return get_num(r.content, index_skip_info)