libchathelper.py 3.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495
  1. # -*- coding: UTF-8 -*-
  2. import base64
  3. from pyquery import PyQuery
  4. import logging
  5. import json
  6. logger = logging.getLogger(__name__)
  7. from libchat.libchat import SqliteLibChat, ChatMsg
  8. from .msg import *
  9. from .common.timer import timing
  10. from .common.progress import ProgressReporter
  11. class LibChatHelper(object):
  12. """ Build LibChat messages from WeChat Msg"""
  13. """ Types of message whose contents are fully parsed.
  14. No need to save extra data for them. """
  15. FullyParsed = [TYPE_MSG, TYPE_SPEAK, TYPE_EMOJI,
  16. TYPE_CUSTOM_EMOJI, TYPE_IMG]
  17. def __init__(self, parser, res):
  18. """ res: a 'Resource' instance
  19. parser: a 'WeChatDBParser' instance
  20. """
  21. self.parser = parser
  22. self.res = res
  23. def _get_image(self, msg):
  24. """ get image content and type from a message"""
  25. if msg.type == TYPE_IMG:
  26. # imgPath was original THUMBNAIL_DIRPATH://th_xxxxxxxxx
  27. imgpath = msg.imgPath.split('_')[-1]
  28. if not imgpath:
  29. logger.warn(
  30. 'No imgpath in an image message. Perhaps a bug in wechat: {}'.format(msg))
  31. return '', ''
  32. bigimgpath = self.parser.imginfo.get(msg.msgSvrId)
  33. img = self.res.get_img([imgpath, bigimgpath])
  34. if not img:
  35. logger.warn("No image found for {}".format(imgpath))
  36. return img, 'jpeg'
  37. elif msg.type == TYPE_EMOJI:
  38. md5 = msg.imgPath
  39. if md5:
  40. emoji_img, format = self.res.get_emoji_by_md5(md5)
  41. return emoji_img, format
  42. else:
  43. return '', ''
  44. elif msg.type == TYPE_CUSTOM_EMOJI:
  45. pq = PyQuery(msg.content)
  46. md5 = pq('emoticonmd5').text()
  47. if md5:
  48. img, format = self.res.get_emoji(md5, None)
  49. return img, format
  50. else:
  51. return '', ''
  52. else:
  53. return '', ''
  54. def _get_sound(self, msg):
  55. if msg.type == TYPE_SPEAK:
  56. audio_str, duration = self.res.get_voice_mp3(msg.imgPath)
  57. return base64.b64decode(audio_str)
  58. return b''
  59. def _get_extra(self, msg):
  60. ret = {}
  61. ret['type'] = msg.type
  62. if msg.type not in LibChatHelper.FullyParsed:
  63. ret['content'] = msg.content
  64. return json.dumps(ret)
  65. def _convert_msg(self, msg):
  66. sender = 'me' if msg.isSend else msg.talker
  67. chatroom = msg.get_chatroom()
  68. text = msg.content if msg.type == TYPE_MSG else ''
  69. img, format = self._get_image(msg)
  70. if img:
  71. # TODO don't use b64, directly return image content
  72. img = base64.b64decode(img)
  73. # TODO do we need to save format or voice duration?
  74. sound = self._get_sound(msg)
  75. extra = self._get_extra(msg)
  76. self.prgs.trigger()
  77. return ChatMsg(
  78. 'wechat', msg.createTime, sender, chatroom,
  79. text, img, sound, extra)
  80. def convert_msgs(self, msgs):
  81. self.prgs = ProgressReporter("Parse Messages", total=len(msgs))
  82. ret = [self._convert_msg(m) for m in msgs]
  83. self.prgs.finish()
  84. return ret