598 lines
22 KiB
Python
598 lines
22 KiB
Python
"""微信数据适配层 — 封装 wechat_cli.core,提供群聊发现和增量消息查询"""
|
||
|
||
import hashlib
|
||
import logging
|
||
import os
|
||
import re
|
||
import sqlite3
|
||
import xml.etree.ElementTree as ET
|
||
from contextlib import closing
|
||
from datetime import datetime
|
||
|
||
from wechat_cli.core.context import AppContext
|
||
from wechat_cli.core.contacts import (
|
||
get_contact_names,
|
||
resolve_username,
|
||
display_name_for_username,
|
||
)
|
||
from wechat_cli.core.messages import (
|
||
find_msg_db_keys,
|
||
_find_msg_tables_for_user,
|
||
_is_safe_msg_table_name,
|
||
_build_message_filters,
|
||
decompress_content,
|
||
_parse_message_content,
|
||
_split_msg_type,
|
||
format_msg_type,
|
||
_load_name2id_maps,
|
||
)
|
||
|
||
log = logging.getLogger(__name__)
|
||
|
||
# 消息 base_type → 简化类型名
|
||
_TYPE_MAP = {
|
||
1: "text", 3: "image", 34: "voice", 42: "contact_card",
|
||
43: "video", 47: "sticker", 48: "location", 49: "link",
|
||
50: "call", 10000: "system", 10002: "revoked",
|
||
}
|
||
|
||
# 需要上传 COS 的媒体类型(image/voice 只上传已解密的可读文件,跳过 .dat)
|
||
MEDIA_TYPES = {"video", "file", "image", "voice"}
|
||
|
||
|
||
def _table_has_column(conn, table_name: str, column_name: str) -> bool:
|
||
"""检查 SQLite 表是否包含指定列"""
|
||
try:
|
||
cols = conn.execute(f"PRAGMA table_info([{table_name}])").fetchall()
|
||
return any(row[1] == column_name for row in cols)
|
||
except Exception:
|
||
return False
|
||
|
||
|
||
def _extract_appmsg_meta(content_xml: str) -> dict | None:
|
||
"""从 type=49 消息的 XML 中提取 appmsg 元信息(文件名、大小、类型等)
|
||
|
||
返回 dict: {title, des, file_size, file_ext, app_type,
|
||
refer_svrid, refer_displayname, refer_content} 或 None
|
||
"""
|
||
if not content_xml:
|
||
return None
|
||
try:
|
||
root = ET.fromstring(content_xml) if content_xml.lstrip().startswith("<") else None
|
||
if root is None:
|
||
return None
|
||
appmsg = root.find(".//appmsg")
|
||
if appmsg is None:
|
||
return None
|
||
app_type = int((appmsg.findtext("type") or "0").strip())
|
||
title = (appmsg.findtext("title") or "").strip()
|
||
des = (appmsg.findtext("des") or "").strip()
|
||
# 附件信息
|
||
attach = appmsg.find("appattach")
|
||
file_size = 0
|
||
file_ext = ""
|
||
if attach is not None:
|
||
file_size = int((attach.findtext("totallen") or "0").strip())
|
||
file_ext = (attach.findtext("fileext") or "").strip()
|
||
result = {
|
||
"app_type": app_type,
|
||
"title": title,
|
||
"des": des,
|
||
"file_size": file_size,
|
||
"file_ext": file_ext,
|
||
}
|
||
# 引用消息 (app_type=57): 提取 refermsg 中的 svrid 和被引用内容
|
||
if app_type == 57:
|
||
ref = appmsg.find(".//refermsg")
|
||
if ref is not None:
|
||
svrid_text = (ref.findtext("svrid") or "").strip()
|
||
if svrid_text:
|
||
try:
|
||
result["refer_svrid"] = int(svrid_text)
|
||
except ValueError:
|
||
pass
|
||
result["refer_displayname"] = (ref.findtext("displayname") or "").strip()
|
||
result["refer_content"] = (ref.findtext("content") or "").strip()
|
||
return result
|
||
except Exception:
|
||
return None
|
||
|
||
|
||
def _format_file_content(meta: dict) -> str:
|
||
"""将文件元信息格式化为可读的 content 字符串"""
|
||
parts = []
|
||
if meta.get("title"):
|
||
parts.append(meta["title"])
|
||
size = meta.get("file_size", 0)
|
||
if size > 0:
|
||
if size >= 1024 * 1024:
|
||
parts.append(f"({size / 1024 / 1024:.1f}MB)")
|
||
elif size >= 1024:
|
||
parts.append(f"({size / 1024:.1f}KB)")
|
||
else:
|
||
parts.append(f"({size}B)")
|
||
return " ".join(parts)
|
||
|
||
|
||
def _extract_video_meta(content_xml: str) -> str:
|
||
"""从视频消息 XML 中提取描述信息"""
|
||
if not content_xml:
|
||
return "[视频]"
|
||
try:
|
||
root = ET.fromstring(content_xml) if content_xml.lstrip().startswith("<") else None
|
||
if root is None:
|
||
return "[视频]"
|
||
video = root.find(".//videomsg")
|
||
if video is not None:
|
||
length = video.get("length", "")
|
||
raw_length = video.get("rawlength", "")
|
||
dur = length or raw_length
|
||
if dur:
|
||
return f"[视频] {dur}秒"
|
||
return "[视频]"
|
||
except Exception:
|
||
return "[视频]"
|
||
|
||
|
||
def _extract_chat_record(content_xml: str) -> str | None:
|
||
"""从 type=49, app_type=19 的聊天记录消息中提取纯文本
|
||
|
||
格式:
|
||
[聊天记录] 标题
|
||
发送者A: 消息内容
|
||
发送者B: 消息内容
|
||
...
|
||
"""
|
||
if not content_xml:
|
||
return None
|
||
try:
|
||
root = ET.fromstring(content_xml) if content_xml.lstrip().startswith("<") else None
|
||
if root is None:
|
||
return None
|
||
appmsg = root.find(".//appmsg")
|
||
if appmsg is None:
|
||
return None
|
||
|
||
title = (appmsg.findtext("title") or "聊天记录").strip()
|
||
lines = [f"[聊天记录] {title}"]
|
||
|
||
# recorditem 内嵌了一段 XML 字符串
|
||
recorditem_text = appmsg.findtext("recorditem") or ""
|
||
if not recorditem_text.strip():
|
||
return lines[0] if lines else None
|
||
|
||
rec_root = ET.fromstring(recorditem_text)
|
||
for item in rec_root.findall(".//datalist/dataitem"):
|
||
sender = (item.findtext("sourcename") or "").strip()
|
||
# datatitle 是聊天内容,datadesc 是附加描述
|
||
msg_text = (item.findtext("datatitle") or "").strip()
|
||
if not msg_text:
|
||
msg_text = (item.findtext("datadesc") or "").strip()
|
||
if not msg_text:
|
||
# 可能是图片/视频等非文本
|
||
data_type = item.get("datatype", "")
|
||
if data_type == "2":
|
||
msg_text = "[图片]"
|
||
elif data_type == "4":
|
||
msg_text = "[视频]"
|
||
elif data_type == "6":
|
||
msg_text = "[文件]"
|
||
else:
|
||
msg_text = "[其他]"
|
||
if sender:
|
||
lines.append(f"{sender}: {msg_text}")
|
||
else:
|
||
lines.append(msg_text)
|
||
|
||
return "\n".join(lines)
|
||
except Exception:
|
||
return None
|
||
|
||
|
||
def _extract_image_meta(content_xml: str) -> dict | None:
|
||
"""从图片消息 XML 中提取 md5 和 length 属性"""
|
||
if not content_xml:
|
||
return None
|
||
md5_m = re.search(r'\bmd5="([a-fA-F0-9]{32})"', content_xml)
|
||
len_m = re.search(r'\blength="(\d+)"', content_xml)
|
||
if not md5_m:
|
||
return None
|
||
return {
|
||
"md5": md5_m.group(1).lower(),
|
||
"length": int(len_m.group(1)) if len_m else 0,
|
||
}
|
||
|
||
|
||
def _file_md5(filepath: str) -> str:
|
||
"""计算文件内容的 md5"""
|
||
h = hashlib.md5()
|
||
with open(filepath, "rb") as f:
|
||
for chunk in iter(lambda: f.read(8192), b""):
|
||
h.update(chunk)
|
||
return h.hexdigest()
|
||
|
||
|
||
class WeChatAdapter:
|
||
def __init__(self):
|
||
self._app = AppContext()
|
||
self._names = get_contact_names(self._app.cache, self._app.decrypted_dir)
|
||
|
||
@property
|
||
def db_dir(self):
|
||
return self._app.db_dir
|
||
|
||
def refresh_names(self):
|
||
"""刷新联系人名称缓存(全局单例需要重置)"""
|
||
import wechat_cli.core.contacts as _c
|
||
_c._contact_names = None
|
||
_c._contact_full = None
|
||
self._names = get_contact_names(self._app.cache, self._app.decrypted_dir)
|
||
|
||
def list_group_sessions(self, limit=500) -> list[dict]:
|
||
"""列出所有群聊会话"""
|
||
path = self._app.cache.get(os.path.join("session", "session.db"))
|
||
if not path:
|
||
log.error("无法解密 session.db")
|
||
return []
|
||
|
||
with closing(sqlite3.connect(path)) as conn:
|
||
rows = conn.execute("""
|
||
SELECT username, unread_count, summary, last_timestamp,
|
||
last_msg_type, last_msg_sender, last_sender_display_name
|
||
FROM SessionTable
|
||
WHERE last_timestamp > 0
|
||
ORDER BY last_timestamp DESC
|
||
LIMIT ?
|
||
""", (limit,)).fetchall()
|
||
|
||
groups = []
|
||
for r in rows:
|
||
username, unread, summary, ts, msg_type, sender, sender_name = r
|
||
if "@chatroom" not in username:
|
||
continue
|
||
display = self._names.get(username, username)
|
||
groups.append({
|
||
"username": username,
|
||
"display_name": display,
|
||
"last_timestamp": ts,
|
||
"unread": unread or 0,
|
||
})
|
||
return groups
|
||
|
||
def resolve_group_username(self, group_name: str) -> str | None:
|
||
return resolve_username(group_name, self._app.cache, self._app.decrypted_dir)
|
||
|
||
def display_name(self, username: str) -> str:
|
||
return self._names.get(username, username)
|
||
|
||
def query_new_messages(self, username: str, after_ts: int, limit: int = 200) -> list[dict]:
|
||
"""增量查询某群新消息(create_time > after_ts),按时间升序返回结构化数据"""
|
||
tables = _find_msg_tables_for_user(
|
||
username, self._app.msg_db_keys, self._app.cache
|
||
)
|
||
if not tables:
|
||
return []
|
||
|
||
group_name = self._names.get(username, username)
|
||
is_group = "@chatroom" in username
|
||
all_messages = []
|
||
|
||
for table_info in tables:
|
||
# 优化:跳过 max_create_time <= after_ts 的表
|
||
if after_ts and table_info["max_create_time"] <= after_ts:
|
||
continue
|
||
|
||
db_path = table_info["db_path"]
|
||
table_name = table_info["table_name"]
|
||
|
||
if not _is_safe_msg_table_name(table_name):
|
||
continue
|
||
|
||
try:
|
||
conn = sqlite3.connect(db_path, timeout=5)
|
||
try:
|
||
id_to_username = _load_name2id_maps(conn)
|
||
|
||
has_server_id = _table_has_column(conn, table_name, "server_id")
|
||
|
||
# 自定义查询:ORDER BY create_time ASC
|
||
clauses, params = _build_message_filters(start_ts=after_ts)
|
||
# 改为严格大于(排除已入库的那条)
|
||
if clauses:
|
||
clauses[0] = "create_time > ?"
|
||
where_sql = f"WHERE {' AND '.join(clauses)}" if clauses else ""
|
||
extra_col = ", server_id" if has_server_id else ""
|
||
sql = f"""
|
||
SELECT local_id, local_type, create_time, real_sender_id,
|
||
message_content, WCDB_CT_message_content{extra_col}
|
||
FROM [{table_name}]
|
||
{where_sql}
|
||
ORDER BY create_time ASC
|
||
LIMIT ?
|
||
"""
|
||
rows = conn.execute(sql, (*params, limit)).fetchall()
|
||
|
||
for row in rows:
|
||
msg = self._parse_row(
|
||
row, username, group_name, is_group,
|
||
id_to_username, db_path, has_server_id
|
||
)
|
||
if msg:
|
||
all_messages.append(msg)
|
||
finally:
|
||
conn.close()
|
||
except Exception as e:
|
||
log.warning("查询 %s 的 %s 失败: %s", username, db_path, e)
|
||
|
||
# 跨表合并后按时间排序,截断到 limit
|
||
all_messages.sort(key=lambda m: m["create_time"])
|
||
return all_messages[:limit]
|
||
|
||
def _parse_row(self, row, username, group_name, is_group, id_to_username, db_path, has_server_id=False):
|
||
"""解析单条消息原始行为结构化 dict"""
|
||
local_id, local_type, create_time, real_sender_id, content_raw, ct = row[:6]
|
||
server_id = row[6] if has_server_id and len(row) > 6 else None
|
||
|
||
content_raw = decompress_content(content_raw, ct)
|
||
if content_raw is None:
|
||
content_raw = ""
|
||
|
||
# 解析发送者和消息内容
|
||
sender_from_content, text = _parse_message_content(content_raw, local_type, is_group)
|
||
|
||
# 解析发送者
|
||
sender_username = id_to_username.get(real_sender_id, "")
|
||
if not sender_username and sender_from_content:
|
||
sender_username = sender_from_content
|
||
sender_name = self._names.get(sender_username, sender_username)
|
||
|
||
# 解析消息类型
|
||
base_type, sub_type = _split_msg_type(local_type)
|
||
msg_type = _TYPE_MAP.get(base_type, "other")
|
||
if base_type == 49 and sub_type == 6:
|
||
msg_type = "file"
|
||
|
||
# content 处理:文本消息存原文,非文本消息提取元信息
|
||
# 注意:群聊 content_raw 格式为 "wxid:\n<msg>...",用 text(剥离发送者后的部分)解析 XML
|
||
xml_content = text if text else content_raw
|
||
refer_msg_svrid = None
|
||
if base_type == 1:
|
||
final_content = text
|
||
elif base_type == 49:
|
||
# appmsg 类型:文件(6)、链接(5)、小程序(33/36)、聊天记录(19)、引用(57) 等
|
||
meta = _extract_appmsg_meta(xml_content)
|
||
if meta and meta["app_type"] == 19:
|
||
# 聊天记录合并转发
|
||
final_content = _extract_chat_record(xml_content) or meta.get("title", "")
|
||
elif meta and meta["app_type"] == 57:
|
||
# 引用/回复消息
|
||
refer_msg_svrid = meta.get("refer_svrid")
|
||
quote_text = meta.get("title") or "[引用消息]"
|
||
ref_name = meta.get("refer_displayname", "")
|
||
ref_content = meta.get("refer_content", "")
|
||
if len(ref_content) > 160:
|
||
ref_content = ref_content[:160] + "..."
|
||
if ref_content:
|
||
prefix = f"回复 {ref_name}: " if ref_name else "回复: "
|
||
quote_text += f"\n ↳ {prefix}{ref_content}"
|
||
final_content = quote_text
|
||
log.debug("引用消息: refer_svrid=%s, title=%s", refer_msg_svrid, meta.get("title", ""))
|
||
elif meta:
|
||
if meta["app_type"] == 6:
|
||
final_content = _format_file_content(meta)
|
||
elif meta.get("title"):
|
||
final_content = meta["title"]
|
||
if meta.get("des"):
|
||
final_content += f" - {meta['des']}"
|
||
else:
|
||
final_content = ""
|
||
log.debug("appmsg 元信息: type=%d, title=%s", meta["app_type"], meta.get("title", ""))
|
||
else:
|
||
final_content = ""
|
||
elif base_type == 43:
|
||
final_content = _extract_video_meta(xml_content)
|
||
elif base_type == 34:
|
||
final_content = "[语音]"
|
||
elif base_type == 3:
|
||
img_meta = _extract_image_meta(xml_content)
|
||
if img_meta:
|
||
final_content = f"[图片] {img_meta['md5']}"
|
||
if img_meta["length"]:
|
||
final_content += f" size:{img_meta['length']}"
|
||
else:
|
||
final_content = "[图片]"
|
||
elif base_type == 47:
|
||
final_content = "[表情]"
|
||
elif base_type == 48:
|
||
final_content = "[位置]"
|
||
elif base_type == 42:
|
||
final_content = "[名片]"
|
||
else:
|
||
final_content = text if text else ""
|
||
|
||
# 解析媒体路径
|
||
media_path = None
|
||
if msg_type in MEDIA_TYPES and self._app.db_dir:
|
||
try:
|
||
if msg_type == "image":
|
||
media_path = self._resolve_readable_media(
|
||
base_type, content_raw, create_time, username
|
||
)
|
||
if media_path:
|
||
log.debug("图片路径解析成功: %s", media_path)
|
||
else:
|
||
log.info("图片文件未找到 (local_id=%d, ts=%d), 可能未点开",
|
||
local_id, create_time)
|
||
elif msg_type == "video":
|
||
media_path = self._resolve_video_path(content_raw, create_time)
|
||
if media_path:
|
||
log.debug("视频路径解析成功: %s", media_path)
|
||
else:
|
||
log.info("视频文件未找到 (local_id=%d, ts=%d), 可能未下载",
|
||
local_id, create_time)
|
||
elif msg_type == "file":
|
||
media_path = self._resolve_msg_file(final_content, create_time)
|
||
if media_path:
|
||
log.debug("文件路径解析成功: %s", media_path)
|
||
else:
|
||
log.info("文件未找到 (local_id=%d, ts=%d), 可能未下载",
|
||
local_id, create_time)
|
||
elif msg_type == "voice":
|
||
media_path = self._resolve_readable_media(
|
||
base_type, content_raw, create_time, username
|
||
)
|
||
if media_path:
|
||
log.debug("语音路径解析成功: %s", media_path)
|
||
else:
|
||
log.info("语音文件未找到 (local_id=%d, ts=%d)",
|
||
local_id, create_time)
|
||
except Exception as e:
|
||
log.warning("媒体路径解析异常 (local_id=%d, type=%s): %s",
|
||
local_id, msg_type, e)
|
||
|
||
return {
|
||
"group_username": username,
|
||
"group_name": group_name,
|
||
"local_id": local_id,
|
||
"local_type": local_type,
|
||
"create_time": create_time,
|
||
"sender_username": sender_username,
|
||
"sender_name": sender_name,
|
||
"msg_type": msg_type,
|
||
"content": final_content,
|
||
"media_path": media_path,
|
||
"source_db": os.path.basename(db_path),
|
||
"svr_msg_id": server_id,
|
||
"refer_msg_svrid": refer_msg_svrid,
|
||
}
|
||
|
||
def _resolve_msg_file(self, content: str, create_time: int) -> str | None:
|
||
"""在 msg/file/YYYY-MM/ 中按文件名查找(文件/视频/音频等都在此目录)"""
|
||
wechat_base = os.path.dirname(self._app.db_dir)
|
||
dt = datetime.fromtimestamp(create_time)
|
||
date_prefix = dt.strftime("%Y-%m")
|
||
|
||
file_dir = os.path.join(wechat_base, "msg", "file", date_prefix)
|
||
if not os.path.isdir(file_dir):
|
||
return None
|
||
|
||
# 从 content 提取文件名: "filename.ext (1.2MB)" 或 "[视频] 30秒" 等
|
||
title = (content or "").split(" (")[0].strip()
|
||
# 去掉前缀标记如 [视频]、[语音]
|
||
for prefix in ("[视频]", "[语音]"):
|
||
if title.startswith(prefix):
|
||
title = title[len(prefix):].strip()
|
||
break
|
||
|
||
if not title:
|
||
return None
|
||
|
||
# 精确匹配
|
||
target = os.path.join(file_dir, title)
|
||
if os.path.isfile(target):
|
||
return target
|
||
|
||
# 模糊匹配
|
||
for f in os.listdir(file_dir):
|
||
fp = os.path.join(file_dir, f)
|
||
if not os.path.isfile(fp):
|
||
continue
|
||
if title in f or f in title:
|
||
return fp
|
||
|
||
return None
|
||
|
||
def _resolve_readable_media(self, base_type: int, content: str, create_time: int, chat_username: str) -> str | None:
|
||
"""解析图片/语音的本地文件路径。
|
||
|
||
图片: 从 temp/RWTemp/YYYY-MM/ 中按 md5 查找原始图片。
|
||
语音: 只返回已解密的可读文件。
|
||
"""
|
||
wechat_base = os.path.dirname(self._app.db_dir)
|
||
|
||
dt = datetime.fromtimestamp(create_time)
|
||
date_prefix = dt.strftime("%Y-%m")
|
||
|
||
if base_type == 3:
|
||
img_meta = _extract_image_meta(content)
|
||
if not img_meta:
|
||
return None
|
||
rwtemp_dir = os.path.join(wechat_base, "temp", "RWTemp", date_prefix)
|
||
if not os.path.isdir(rwtemp_dir):
|
||
return None
|
||
target_size = img_meta["length"]
|
||
for f in os.listdir(rwtemp_dir):
|
||
fp = os.path.join(rwtemp_dir, f)
|
||
ext = os.path.splitext(f)[1].lower()
|
||
if ext not in (".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"):
|
||
continue
|
||
if not os.path.isfile(fp):
|
||
continue
|
||
if target_size and os.path.getsize(fp) == target_size:
|
||
return fp
|
||
if _file_md5(fp) == img_meta["md5"]:
|
||
return fp
|
||
return None
|
||
else:
|
||
# 语音:只返回已解密的可读文件
|
||
msg_dir = os.path.join(wechat_base, "msg")
|
||
attach_dir = os.path.join(msg_dir, "attach")
|
||
readable_exts = (".mp3", ".wav", ".amr", ".silk", ".m4a", ".ogg")
|
||
|
||
search_hashes = []
|
||
if chat_username:
|
||
h = hashlib.md5(chat_username.encode()).hexdigest()
|
||
candidate = os.path.join(attach_dir, h)
|
||
if os.path.isdir(candidate):
|
||
search_hashes.append(h)
|
||
if not search_hashes and os.path.isdir(attach_dir):
|
||
search_hashes = [
|
||
d for d in os.listdir(attach_dir)
|
||
if os.path.isdir(os.path.join(attach_dir, d))
|
||
]
|
||
|
||
for h in search_hashes:
|
||
sub = os.path.join(attach_dir, h, date_prefix, "Voice")
|
||
if not os.path.isdir(sub):
|
||
continue
|
||
for f in os.listdir(sub):
|
||
fp = os.path.join(sub, f)
|
||
if not os.path.isfile(fp):
|
||
continue
|
||
if f.endswith(".dat"):
|
||
continue
|
||
if f.lower().endswith(readable_exts):
|
||
return fp
|
||
|
||
return None
|
||
|
||
def _resolve_video_path(self, content: str, create_time: int) -> str | None:
|
||
"""在 msg/video/YYYY-MM/ 中按 rawmd5 查找 mp4"""
|
||
wechat_base = os.path.dirname(self._app.db_dir)
|
||
video_dir = os.path.join(wechat_base, "msg", "video")
|
||
if not os.path.isdir(video_dir):
|
||
return None
|
||
|
||
rawmd5 = None
|
||
md5_m = re.search(r'rawmd5="([a-f0-9]+)"', content or "")
|
||
if md5_m:
|
||
rawmd5 = md5_m.group(1)
|
||
if not rawmd5:
|
||
return None
|
||
|
||
dt = datetime.fromtimestamp(create_time)
|
||
date_prefix = dt.strftime("%Y-%m")
|
||
|
||
month_dir = os.path.join(video_dir, date_prefix)
|
||
if not os.path.isdir(month_dir):
|
||
return None
|
||
|
||
for f in os.listdir(month_dir):
|
||
if rawmd5 not in f:
|
||
continue
|
||
fp = os.path.join(month_dir, f)
|
||
if f.endswith(".mp4") and os.path.isfile(fp):
|
||
return fp
|
||
|
||
return None
|