1
0

parser.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
  1. # -*- coding: UTF-8 -*-
  2. import sqlite3
  3. from collections import defaultdict, Counter
  4. import itertools
  5. from datetime import datetime
  6. import logging
  7. logger = logging.getLogger(__name__)
  8. from .msg import WeChatMsg, TYPE_SYSTEM
  9. """ tables in concern:
  10. emojiinfo
  11. imginfo2
  12. addr_upload2
  13. chatroom
  14. message
  15. rcontact
  16. img_flag
  17. """
  18. class WeChatDBParser(object):
  19. FIELDS = ["msgSvrId","type","isSend","createTime","talker","content","imgPath"]
  20. def __init__(self, db_fname):
  21. """ db_fname: a decrypted EnMicroMsg.db"""
  22. self.db_fname = db_fname
  23. self.db_conn = sqlite3.connect(self.db_fname)
  24. self.db_conn_bytes = sqlite3.connect(self.db_fname)
  25. # https://stackoverflow.com/questions/22751363/sqlite3-operationalerror-could-not-decode-to-utf-8-column
  26. self.db_conn_bytes.text_factory = lambda b: b
  27. self.cc = self.db_conn.cursor()
  28. self.contacts = {} # username -> nickname
  29. self.contacts_rev = defaultdict(list)
  30. self.msgs_by_chat = defaultdict(list)
  31. self.emoji_groups = {}
  32. self.emoji_info = {}
  33. self.emoji_encryption_key = None
  34. self.avatar_urls = {}
  35. self._parse()
  36. def _parse_contact(self):
  37. contacts = self.cc.execute(
  38. """
  39. SELECT username,conRemark,nickname FROM rcontact
  40. """)
  41. for row in contacts:
  42. username, remark, nickname = row
  43. if remark:
  44. self.contacts[username] = remark
  45. else:
  46. self.contacts[username] = nickname
  47. for k, v in self.contacts.items():
  48. self.contacts_rev[v].append(k)
  49. logger.info("Found {} names in `contact` table.".format(len(self.contacts)))
  50. def _parse_msg(self):
  51. msgs_tot_cnt = 0
  52. db_msgs = self.db_conn_bytes.cursor().execute(
  53. """
  54. SELECT {} FROM message
  55. """.format(','.join(WeChatDBParser.FIELDS)))
  56. unknown_type_cnt = Counter()
  57. for row in db_msgs:
  58. values = self._parse_msg_row(row)
  59. if not values:
  60. continue
  61. msg = WeChatMsg(values)
  62. # TODO keep system message?
  63. if not WeChatMsg.filter_type(msg.type):
  64. self.msgs_by_chat[msg.chat].append(msg)
  65. if not msg.known_type:
  66. unknown_type_cnt[msg.type] += 1
  67. logger.warning("[Parser] Unhandled messages (type->cnt): {}".format(unknown_type_cnt))
  68. for k, v in self.msgs_by_chat.items():
  69. self.msgs_by_chat[k] = sorted(v, key=lambda x: x.createTime)
  70. msgs_tot_cnt += len(v)
  71. logger.info("Found {} message records.".format(msgs_tot_cnt))
  72. def _parse_userinfo(self):
  73. userinfo_q = self.cc.execute(""" SELECT id, value FROM userinfo """)
  74. userinfo = dict(userinfo_q)
  75. self.username = userinfo.get(2, None)
  76. if self.username is None:
  77. nickname = userinfo.get(4, None)
  78. if nickname is not None:
  79. self.username = self.contacts_rev.get(nickname, [None])[0]
  80. if self.username is None:
  81. logger.error("Cannot find username in userinfo table!")
  82. self.username = input("Please enter your username:")
  83. assert isinstance(self.username, str), self.username
  84. logger.info("Your username is: {}".format(self.username))
  85. def _parse_imginfo(self):
  86. imginfo_q = self.cc.execute("""SELECT msgSvrId, bigImgPath FROM ImgInfo2""")
  87. self.imginfo = {k: v for (k, v) in imginfo_q
  88. if not v.startswith('SERVERID://')}
  89. logger.info("Found {} hd image records.".format(len(self.imginfo)))
  90. def _find_msg_by_type(self, msgs=None):
  91. ret = []
  92. if msgs is None:
  93. msgs = itertools.chain.from_iterable(self.msgs_by_chat.itervalues())
  94. for msg in msgs:
  95. if msg.type == 34:
  96. ret.append(msg)
  97. return sorted(ret)
  98. def _parse_emoji(self):
  99. # wechat provided emojis
  100. query = self.cc.execute(
  101. """ SELECT md5, groupid FROM EmojiInfoDesc """)
  102. for row in query:
  103. md5, group = row
  104. self.emoji_groups[md5] = group
  105. try:
  106. query = self.cc.execute(
  107. """ SELECT md5, catalog, name, cdnUrl, encrypturl, aeskey FROM EmojiInfo""")
  108. except: # old database does not have cdnurl
  109. pass
  110. else:
  111. for row in query:
  112. md5, catalog, name, cdnUrl, encrypturl, aeskey = row
  113. if cdnUrl or encrypturl:
  114. self.emoji_info[md5] = (catalog, cdnUrl, encrypturl, aeskey)
  115. def _parse_img_flag(self):
  116. """Parse the img_flag table which stores avatar for each id."""
  117. query = self.cc.execute(
  118. """ SELECT username, reserved1 FROM img_flag """)
  119. for row in query:
  120. username, url = row
  121. if url:
  122. self.avatar_urls[username] = url
  123. def _parse(self):
  124. self._parse_contact()
  125. self._parse_userinfo() # depend on self.contacts
  126. self._parse_msg()
  127. self._parse_imginfo()
  128. self._parse_emoji()
  129. self._parse_img_flag()
  130. def get_emoji_encryption_key(self):
  131. # obtain local encryption key in a special entry in the database
  132. # this also equals to md5(imei)
  133. query = self.cc.execute("SELECT md5 FROM EmojiInfo where catalog == 153")
  134. results = list(query)
  135. if len(results):
  136. assert len(results) == 1, "Found > 1 encryption keys in EmojiInfo. This is a bug!"
  137. return results[0][0]
  138. return None
  139. # process the values in a row
  140. def _parse_msg_row(self, row):
  141. """Parse a record of message into my format.
  142. Note that message are read in binary format.
  143. """
  144. values = dict(zip(WeChatDBParser.FIELDS, row))
  145. values['createTime'] = datetime.fromtimestamp(values['createTime']/ 1000)
  146. if values['content']:
  147. try:
  148. values['content'] = values['content'].decode()
  149. except:
  150. logger.warning(f"Invalid byte sequence in message content (type={values['type']}, createTime={values['createTime']})")
  151. values['content'] = 'FAILED TO DECODE'
  152. else:
  153. values['content'] = ''
  154. values['talker'] = values['talker'].decode()
  155. if values['imgPath']:
  156. values['imgPath'] = values['imgPath'].decode()
  157. values['chat'] = values['talker']
  158. try:
  159. if values['chat'].endswith('@chatroom'):
  160. values['chat_nickname'] = self.contacts[values['chat']]
  161. content = values['content']
  162. if values['isSend'] == 1:
  163. values['talker'] = self.username
  164. elif values['type'] == TYPE_SYSTEM:
  165. values['talker'] = 'SYSTEM'
  166. else:
  167. talker = content[:content.find(':')]
  168. values['talker'] = talker
  169. values['talker_nickname'] = self.contacts.get(talker, talker)
  170. values['content'] = content[content.find('\n') + 1:]
  171. else:
  172. tk_id = values['talker']
  173. values['chat'] = tk_id
  174. values['chat_nickname'] = self.contacts[tk_id]
  175. values['talker'] = tk_id
  176. values['talker_nickname'] = self.contacts[tk_id]
  177. except KeyError:
  178. # It's possible that messages are kept in database after contacts been deleted
  179. logger.warn("Unknown contact: {}".format(values.get('talker', '')))
  180. return None
  181. return values
  182. @property
  183. def all_chat_ids(self):
  184. return self.msgs_by_chat.keys()
  185. @property
  186. def all_chat_nicknames(self):
  187. return [self.contacts[k] for k in self.all_chat_ids if len(self.contacts[k])]
  188. def get_id_by_nickname(self, nickname):
  189. """
  190. Get chat id by nickname.
  191. """
  192. l = self.contacts_rev[nickname]
  193. if len(l) == 0:
  194. raise KeyError("No contacts have nickname {}".format(nickname))
  195. if len(l) > 1:
  196. logger.warn("More than one contacts have nickname {}! Using the first contact".format(nickname))
  197. return l[0]
  198. def get_chat_id(self, nick_name_or_id):
  199. """
  200. Get the unique chat id by either chat id itself, or the nickname of the chat.
  201. """
  202. if nick_name_or_id in self.contacts:
  203. return nick_name_or_id
  204. else:
  205. return self.get_id_by_nickname(nick_name_or_id)