# browser.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import time
import traceback
import urllib
from datetime import datetime, timedelta

from selenium import webdriver
from threadpool import ThreadPool, makeRequests

from api import Api
from config import ini_config
from utils.log import logger
  13. class BaiduBrowser(object):
  14. def __init__(self, cookie_json='', check_login=True):
  15. if not ini_config.browser_driver:
  16. browser_driver_name = 'Firefox'
  17. else:
  18. browser_driver_name = ini_config.browser_driver
  19. browser_driver_class = getattr(webdriver, browser_driver_name)
  20. if ini_config.browser_driver == 'Chrome':
  21. self.browser = browser_driver_class(executable_path=ini_config.executable_path)
  22. else:
  23. self.browser = browser_driver_class()
  24. # 设置超时时间
  25. self.browser.set_page_load_timeout(50)
  26. # 设置脚本运行超时时间
  27. self.browser.set_script_timeout(10)
  28. # 百度用户名
  29. self.user_name = ini_config.user_name
  30. # 百度密码
  31. self.password = ini_config.password
  32. self.cookie_json = cookie_json
  33. self.api = None
  34. self.cookie_dict_list = []
  35. self.date_list = []
  36. self.init_api(check_login=check_login)
  37. self.init_date_list()
  38. def __del__(self):
  39. self.close()
  40. def is_login(self):
  41. # 如果初始化BaiduBrowser时传递了cookie信息,则检测一下是否登录状态
  42. self.login_with_cookie(self.cookie_json)
  43. # 访问待检测的页面
  44. self.browser.get(ini_config.user_center_url)
  45. html = self.browser.page_source
  46. # 检测是否有登录成功标记
  47. return ini_config.login_sign in html
  48. def init_api(self, check_login=True):
  49. # 判断是否需要登录
  50. need_login = False
  51. if not self.cookie_json:
  52. logger.info(u'因无历史cookie,本次执行需要登录百度')
  53. need_login = True
  54. elif check_login and not self.is_login():
  55. logger.info(u'加载历史cookie登录失败,本次执行需要登录百度')
  56. need_login = True
  57. else:
  58. logger.info(u'本次执行无需登录百度')
  59. # 执行浏览器自动填表登录,登录后获取cookie
  60. if need_login:
  61. self.login(self.user_name, self.password)
  62. self.cookie_json = self.get_cookie_json()
  63. cookie_str = self.get_cookie_str(self.cookie_json)
  64. # 获取到cookie后传给api
  65. self.api = Api(cookie_str)
  66. def get_date_info(self, start_date, end_date):
  67. # 如果start_date和end_date中带有“-”,则替换掉
  68. if start_date.find('-') != -1 and end_date.find('-') != -1:
  69. start_date = start_date.replace('-', '')
  70. end_date = end_date.replace('-', '')
  71. # start_date和end_date转换成datetime对象
  72. start_date = datetime.strptime(start_date, '%Y%m%d')
  73. end_date = datetime.strptime(end_date, '%Y%m%d')
  74. # 循环start_date和end_date的差值,获取区间内所有的日期
  75. date_list = []
  76. temp_date = start_date
  77. while temp_date <= end_date:
  78. date_list.append(temp_date.strftime("%Y-%m-%d"))
  79. temp_date += timedelta(days=1)
  80. start_date = start_date.strftime("%Y-%m-%d")
  81. end_date = end_date.strftime("%Y-%m-%d")
  82. return start_date, end_date, date_list
  83. def get_one_day_index(self, date, url, keyword, type_name, area):
  84. try_num = 0
  85. try_max_num = 5
  86. while try_num < try_max_num:
  87. try:
  88. try_num += 1
  89. # 获取图片的下载地址以及图片的切割信息
  90. img_url, val_info = self.api.get_index_show_html(url)
  91. # 下载img图片,然后根据css切割图片的信息去切割图片,组成新的图片,
  92. # 将新图片跟已经做好的图片识别库对应识别
  93. value = self.api.get_value_from_url(img_url, val_info)
  94. break
  95. except:
  96. pass
  97. logger.info(
  98. 'keyword:%s, type_name:%s, area:%s, date:%s, value:%s' % (
  99. keyword, type_name, area, date, value
  100. )
  101. )
  102. return value.replace(',', '')
  103. def get_baidu_index_by_date_range(self, keyword, start_date, end_date,
  104. type_name, area):
  105. # 根据区间获取关键词的索引值
  106. url = ini_config.time_range_trend_url.format(
  107. start_date=start_date, end_date=end_date,
  108. word=urllib.quote(keyword.encode('gbk')),
  109. area=area
  110. )
  111. self.browser.get(url)
  112. if ini_config.browser_sleep:
  113. time.sleep(float(ini_config.browser_sleep))
  114. if u'未被收录' in self.browser.page_source:
  115. return {}
  116. # 执行js获取后面所需的res和res2的值
  117. res = self.browser.execute_script('return PPval.ppt;')
  118. res2 = self.browser.execute_script('return PPval.res2;')
  119. # 获取指定区间的日期列表,方便下面循环用
  120. start_date, end_date, date_list = self.get_date_info(
  121. start_date, end_date
  122. )
  123. # 拼接api的url
  124. url = ini_config.all_index_url.format(
  125. res=res, res2=res2, start_date=start_date, end_date=end_date
  126. )
  127. # 获取api的结果信息,这里面保存了后面日期节点的一些加密值
  128. all_index_info = self.api.get_all_index_html(url)
  129. indexes_enc = all_index_info['data'][type_name][0]['userIndexes_enc']
  130. enc_list = indexes_enc.split(',')
  131. pool = ThreadPool(int(ini_config.num_of_threads))
  132. # wm = WorkManager(int(ini_config.num_of_threads))
  133. # 遍历这些enc值,这些值拼接出api的url(这个页面返回 图片信息以及css规定的切图信息)
  134. list_of_args = []
  135. for index, _ in enumerate(enc_list):
  136. url = ini_config.index_show_url.format(
  137. res=res, res2=res2, enc_index=_, t=int(time.time()) * 1000
  138. )
  139. # 根据enc在列表中的位置,获取它的日期
  140. date = date_list[index]
  141. # 将任务添加到多线程下载模型中
  142. item = (None, dict(date=date, url=url, keyword=keyword, type_name=type_name, area=area))
  143. list_of_args.append(item)
  144. baidu_index_dict = {}
  145. def callback(*args, **kwargs):
  146. req, val = args[0], args[1]
  147. baidu_index_dict[req.kwds['date']] = val
  148. req_list = makeRequests(self.get_one_day_index, list_of_args, callback)
  149. [pool.putRequest(req) for req in req_list]
  150. pool.wait()
  151. return baidu_index_dict
  152. def _get_index_period(self, keyword, area):
  153. # 拼接一周趋势的url
  154. url = ini_config.one_week_trend_url.format(
  155. area=area, word=urllib.quote(keyword.encode('gbk'))
  156. )
  157. self.browser.get(url)
  158. # 获取下方api要用到的res和res2的值
  159. res = self.browser.execute_script('return PPval.ppt;')
  160. res2 = self.browser.execute_script('return PPval.res2;')
  161. start_date, end_date = self.browser.execute_script(
  162. 'return BID.getParams.time()[0];'
  163. ).split('|')
  164. start_date, end_date, date_list = self.get_date_info(
  165. start_date, end_date
  166. )
  167. url = ini_config.all_index_url.format(
  168. res=res, res2=res2, start_date=start_date, end_date=end_date
  169. )
  170. all_index_info = self.api.get_all_index_html(url)
  171. start_date, end_date = all_index_info['data']['all'][0][
  172. 'period'].split('|')
  173. # 重置start_date, end_date,以api返回的为准
  174. start_date, end_date, date_list = self.get_date_info(
  175. start_date, end_date
  176. )
  177. logger.info(
  178. 'all_start_date:%s, all_end_date:%s' % (start_date, end_date)
  179. )
  180. return date_list
  181. def init_date_list(self):
  182. if ini_config.start_date and ini_config.end_date:
  183. _, _, date_list = self.get_date_info(
  184. start_date=ini_config.start_date, end_date=ini_config.end_date
  185. )
  186. else:
  187. # 配置文件不配置start_date和end_date,就会按照索引的最大区间来,
  188. # 目前每个关键词的最大区间是一样的,都是从2011年1月1日开始的
  189. date_list = self._get_index_period(keyword=u'test', area=0)
  190. self.date_list = date_list
  191. def get_baidu_index(self, keyword, type_name, area):
  192. baidu_index_dict = dict()
  193. start = 0
  194. skip = 180
  195. end = len(self.date_list)
  196. while start < end:
  197. try:
  198. start_date = self.date_list[start]
  199. if start + skip >= end - 1:
  200. end_date = self.date_list[-1]
  201. else:
  202. end_date = self.date_list[start + skip]
  203. result = self.get_baidu_index_by_date_range(
  204. keyword, start_date, end_date, type_name, area
  205. )
  206. baidu_index_dict.update(result)
  207. start += skip + 1
  208. except:
  209. import traceback
  210. print traceback.format_exc()
  211. return baidu_index_dict
  212. def login(self, user_name, password):
  213. login_url = ini_config.login_url
  214. # 访问登陆页
  215. self.browser.get(login_url)
  216. time.sleep(2)
  217. # 自动填写表单并提交,如果出现验证码需要手动填写
  218. while 1:
  219. try:
  220. self.browser.find_element_by_id('TANGRAM__PSP_3__userName').clear()
  221. user_name_obj = self.browser.find_element_by_id(
  222. 'TANGRAM__PSP_3__userName'
  223. )
  224. break
  225. except:
  226. logger.error(traceback.format_exc())
  227. time.sleep(1)
  228. user_name_obj.send_keys(user_name)
  229. ps_obj = self.browser.find_element_by_id('TANGRAM__PSP_3__password')
  230. ps_obj.send_keys(password)
  231. sub_obj = self.browser.find_element_by_id('TANGRAM__PSP_3__submit')
  232. sub_obj.click()
  233. # 如果页面的url没有改变,则继续等待
  234. while self.browser.current_url == login_url:
  235. time.sleep(1)
  236. def close(self):
  237. if getattr(self, 'browser'):
  238. if self.browser:
  239. try:
  240. self.browser.quit()
  241. except:
  242. pass
  243. def get_cookie_json(self):
  244. return json.dumps(self.browser.get_cookies())
  245. def get_cookie_str(self, cookie_json=''):
  246. if cookie_json:
  247. cookies = json.loads(cookie_json)
  248. else:
  249. cookies = self.browser.get_cookies()
  250. return '; '.join(['%s=%s' % (item['name'], item['value'])
  251. for item in cookies])
  252. def login_with_cookie(self, cookie_json):
  253. self.browser.get('https://www.baidu.com/')
  254. for item in json.loads(cookie_json):
  255. try:
  256. self.browser.add_cookie(item)
  257. except:
  258. continue