#!/usr/bin/env python
# -*- coding: utf-8 -*-
# main.py
  3. import os
  4. import traceback
  5. import sys
  6. import xlwt
  7. import chardet
  8. from browser import BaiduBrowser
  9. from utils.log import logger
  10. from config import ini_config
  11. from city import final_city_dict
  12. index_type_dict = {
  13. 'all': u'整体趋势', 'pc': u'PC趋势', 'wise': u'移动趋势'
  14. }
  15. if sys.platform in ['win32', 'cygwin']:
  16. FILE_NAME_ENCODING = 'gbk'
  17. else:
  18. FILE_NAME_ENCODING = 'utf-8'
  19. def save_cookie_to_file(cookie_json):
  20. with open(ini_config.cookie_file_path, 'w') as f:
  21. f.write(cookie_json)
  22. def load_cookie_from_file():
  23. cookie_json = ''
  24. if os.path.exists(ini_config.cookie_file_path):
  25. with open(ini_config.cookie_file_path, 'r') as f:
  26. cookie_json = f.read()
  27. return cookie_json
  28. def main():
  29. logger.info(u'请确保你填写的账号密码能够成功登陆百度')
  30. # 创建data目录
  31. result_folder = ini_config.out_file_path
  32. if not os.path.exists(result_folder):
  33. os.makedirs(result_folder)
  34. # 加载曾经保存的cookie文件,尽量避免重复登录
  35. cookie_json = load_cookie_from_file()
  36. baidu_browser = BaiduBrowser(cookie_json=cookie_json)
  37. # 将登陆成功后的cookie_json保存到文件
  38. save_cookie_to_file(baidu_browser.get_cookie_json())
  39. logger.info(u'登陆成功')
  40. fp = open(ini_config.keywords_task_file_path, 'rb')
  41. task_list = fp.readlines()
  42. fp.close()
  43. root = os.path.dirname(os.path.realpath(__file__))
  44. result_folder = os.path.join(root, ini_config.out_file_path)
  45. if not os.path.exists(result_folder):
  46. os.makedirs(result_folder)
  47. for keyword in task_list:
  48. try:
  49. keyword = keyword.strip()
  50. if not keyword:
  51. continue
  52. parse_one_keyword(keyword, result_folder, baidu_browser)
  53. except:
  54. print traceback.format_exc()
  55. def parse_one_keyword(keyword, result_folder,
  56. baidu_browser):
  57. area_list = ini_config.area_list.split(',')
  58. area_list = [_.strip() for _ in area_list]
  59. type_list = ini_config.index_type_list.split(',')
  60. type_list = [_.strip() for _ in type_list]
  61. detect_result = chardet.detect(keyword)
  62. encoding = detect_result['encoding'] if detect_result else 'gbk'
  63. keyword_unicode = keyword.decode(encoding, 'ignore')
  64. logger.info('%s start' % keyword_unicode)
  65. for area in area_list:
  66. for type_name in type_list:
  67. baidu_index_dict = baidu_browser.get_baidu_index(
  68. keyword_unicode, type_name, area
  69. )
  70. type_name_zh = index_type_dict.get(type_name)
  71. file_name = u'%s_%s_%s.xls' % (
  72. keyword_unicode,
  73. final_city_dict[area],
  74. type_name_zh
  75. )
  76. file_name = file_name.encode(FILE_NAME_ENCODING, 'ignore')
  77. file_path = os.path.join(result_folder, file_name)
  78. data_list = []
  79. for date in baidu_browser.date_list:
  80. value = baidu_index_dict.get(date, 0)
  81. data_list.append(
  82. (keyword_unicode, date, type_name_zh, value)
  83. )
  84. write_excel(file_path, data_list)
  85. def write_excel(excel_file, data_list):
  86. wb = xlwt.Workbook()
  87. ws = wb.add_sheet(u'工作表1')
  88. row = 0
  89. ws.write(row, 0, u'关键词')
  90. ws.write(row, 1, u'日期')
  91. ws.write(row, 2, u'类型')
  92. ws.write(row, 3, u'指数')
  93. row = 1
  94. for result in data_list:
  95. col = 0
  96. for item in result:
  97. ws.write(row, col, item)
  98. col += 1
  99. row += 1
  100. wb.save(excel_file)
  101. if __name__ == '__main__':
  102. main()