david
/
fetch_baidu_index
peilaus alkaen https://github.com/longzhiwen888/fetch_baidu_index.git


			
							12345678910111213141516171819202122232425262728293031323334353637383940414243444546
							#!/usr/bin/env python
# -*- coding: utf-8 -*-

import copy
import re

import requests

from img_util import get_num


UserAgent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, '
             'like Gecko) Chrome/32.0.1700.76 Safari/537.36')

HUMAN_HEADERS = {
    'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
               'image/webp,*/*;q=0.8'),
    'User-Agent': UserAgent,
    'Accept-Encoding': 'gzip,deflate,sdch'
}


class Api(object):
    def __init__(self, cookie):
        self.headers = copy.deepcopy(HUMAN_HEADERS)
        self.headers.update({'Cookie': cookie})

    def get_all_index_html(self, all_index_url):
        r = requests.get(all_index_url, headers=self.headers)
        return r.json()

    def get_index_show_html(self, index_show_url):
        r = requests.get(index_show_url, headers=self.headers)
        content = r.json()['data']['code'][0]
        img_url = re.findall('(?is)"(/Interface/IndexShow/img/[^"]*?)"', content)
        img_url = "http://index.baidu.com%s" % img_url[0]

        regex = ('(?is)<span class="imgval" style="width:(\d+)px;">'
                 '<div class="imgtxt" style="margin-left:-(\d+)px;">')
        result = re.findall(regex, content)
        skip_info = result if result else list()
        return img_url, skip_info

    def get_value_from_url(self, img_url, index_skip_info):
        r = requests.get(img_url, headers=self.headers)
        return get_num(r.content, index_skip_info)