wechat_msg_crawler/collector/wechat_adapter.py

598 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""微信数据适配层 — 封装 wechat_cli.core提供群聊发现和增量消息查询"""
import hashlib
import logging
import os
import re
import sqlite3
import xml.etree.ElementTree as ET
from contextlib import closing
from datetime import datetime
from wechat_cli.core.context import AppContext
from wechat_cli.core.contacts import (
get_contact_names,
resolve_username,
display_name_for_username,
)
from wechat_cli.core.messages import (
find_msg_db_keys,
_find_msg_tables_for_user,
_is_safe_msg_table_name,
_build_message_filters,
decompress_content,
_parse_message_content,
_split_msg_type,
format_msg_type,
_load_name2id_maps,
)
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Maps a message base_type code to a simplified type name.
# Codes that are not listed here are reported as "other" by the row parser.
_TYPE_MAP = {
1: "text", 3: "image", 34: "voice", 42: "contact_card",
43: "video", 47: "sticker", 48: "location", 49: "link",
50: "call", 10000: "system", 10002: "revoked",
}
# Media types that need to be uploaded to COS (image/voice: only readable,
# already-decrypted files are uploaded; .dat files are skipped).
MEDIA_TYPES = {"video", "file", "image", "voice"}
def _table_has_column(conn, table_name: str, column_name: str) -> bool:
"""检查 SQLite 表是否包含指定列"""
try:
cols = conn.execute(f"PRAGMA table_info([{table_name}])").fetchall()
return any(row[1] == column_name for row in cols)
except Exception:
return False
def _extract_appmsg_meta(content_xml: str) -> dict | None:
"""从 type=49 消息的 XML 中提取 appmsg 元信息(文件名、大小、类型等)
返回 dict: {title, des, file_size, file_ext, app_type,
refer_svrid, refer_displayname, refer_content} 或 None
"""
if not content_xml:
return None
try:
root = ET.fromstring(content_xml) if content_xml.lstrip().startswith("<") else None
if root is None:
return None
appmsg = root.find(".//appmsg")
if appmsg is None:
return None
app_type = int((appmsg.findtext("type") or "0").strip())
title = (appmsg.findtext("title") or "").strip()
des = (appmsg.findtext("des") or "").strip()
# 附件信息
attach = appmsg.find("appattach")
file_size = 0
file_ext = ""
if attach is not None:
file_size = int((attach.findtext("totallen") or "0").strip())
file_ext = (attach.findtext("fileext") or "").strip()
result = {
"app_type": app_type,
"title": title,
"des": des,
"file_size": file_size,
"file_ext": file_ext,
}
# 引用消息 (app_type=57): 提取 refermsg 中的 svrid 和被引用内容
if app_type == 57:
ref = appmsg.find(".//refermsg")
if ref is not None:
svrid_text = (ref.findtext("svrid") or "").strip()
if svrid_text:
try:
result["refer_svrid"] = int(svrid_text)
except ValueError:
pass
result["refer_displayname"] = (ref.findtext("displayname") or "").strip()
result["refer_content"] = (ref.findtext("content") or "").strip()
return result
except Exception:
return None
def _format_file_content(meta: dict) -> str:
"""将文件元信息格式化为可读的 content 字符串"""
parts = []
if meta.get("title"):
parts.append(meta["title"])
size = meta.get("file_size", 0)
if size > 0:
if size >= 1024 * 1024:
parts.append(f"({size / 1024 / 1024:.1f}MB)")
elif size >= 1024:
parts.append(f"({size / 1024:.1f}KB)")
else:
parts.append(f"({size}B)")
return " ".join(parts)
def _extract_video_meta(content_xml: str) -> str:
"""从视频消息 XML 中提取描述信息"""
if not content_xml:
return "[视频]"
try:
root = ET.fromstring(content_xml) if content_xml.lstrip().startswith("<") else None
if root is None:
return "[视频]"
video = root.find(".//videomsg")
if video is not None:
length = video.get("length", "")
raw_length = video.get("rawlength", "")
dur = length or raw_length
if dur:
return f"[视频] {dur}"
return "[视频]"
except Exception:
return "[视频]"
def _extract_chat_record(content_xml: str) -> str | None:
"""从 type=49, app_type=19 的聊天记录消息中提取纯文本
格式:
[聊天记录] 标题
发送者A: 消息内容
发送者B: 消息内容
...
"""
if not content_xml:
return None
try:
root = ET.fromstring(content_xml) if content_xml.lstrip().startswith("<") else None
if root is None:
return None
appmsg = root.find(".//appmsg")
if appmsg is None:
return None
title = (appmsg.findtext("title") or "聊天记录").strip()
lines = [f"[聊天记录] {title}"]
# recorditem 内嵌了一段 XML 字符串
recorditem_text = appmsg.findtext("recorditem") or ""
if not recorditem_text.strip():
return lines[0] if lines else None
rec_root = ET.fromstring(recorditem_text)
for item in rec_root.findall(".//datalist/dataitem"):
sender = (item.findtext("sourcename") or "").strip()
# datatitle 是聊天内容datadesc 是附加描述
msg_text = (item.findtext("datatitle") or "").strip()
if not msg_text:
msg_text = (item.findtext("datadesc") or "").strip()
if not msg_text:
# 可能是图片/视频等非文本
data_type = item.get("datatype", "")
if data_type == "2":
msg_text = "[图片]"
elif data_type == "4":
msg_text = "[视频]"
elif data_type == "6":
msg_text = "[文件]"
else:
msg_text = "[其他]"
if sender:
lines.append(f"{sender}: {msg_text}")
else:
lines.append(msg_text)
return "\n".join(lines)
except Exception:
return None
def _extract_image_meta(content_xml: str) -> dict | None:
"""从图片消息 XML 中提取 md5 和 length 属性"""
if not content_xml:
return None
md5_m = re.search(r'\bmd5="([a-fA-F0-9]{32})"', content_xml)
len_m = re.search(r'\blength="(\d+)"', content_xml)
if not md5_m:
return None
return {
"md5": md5_m.group(1).lower(),
"length": int(len_m.group(1)) if len_m else 0,
}
def _file_md5(filepath: str) -> str:
"""计算文件内容的 md5"""
h = hashlib.md5()
with open(filepath, "rb") as f:
for chunk in iter(lambda: f.read(8192), b""):
h.update(chunk)
return h.hexdigest()
class WeChatAdapter:
def __init__(self):
self._app = AppContext()
self._names = get_contact_names(self._app.cache, self._app.decrypted_dir)
@property
def db_dir(self):
return self._app.db_dir
def refresh_names(self):
"""刷新联系人名称缓存(全局单例需要重置)"""
import wechat_cli.core.contacts as _c
_c._contact_names = None
_c._contact_full = None
self._names = get_contact_names(self._app.cache, self._app.decrypted_dir)
def list_group_sessions(self, limit=500) -> list[dict]:
"""列出所有群聊会话"""
path = self._app.cache.get(os.path.join("session", "session.db"))
if not path:
log.error("无法解密 session.db")
return []
with closing(sqlite3.connect(path)) as conn:
rows = conn.execute("""
SELECT username, unread_count, summary, last_timestamp,
last_msg_type, last_msg_sender, last_sender_display_name
FROM SessionTable
WHERE last_timestamp > 0
ORDER BY last_timestamp DESC
LIMIT ?
""", (limit,)).fetchall()
groups = []
for r in rows:
username, unread, summary, ts, msg_type, sender, sender_name = r
if "@chatroom" not in username:
continue
display = self._names.get(username, username)
groups.append({
"username": username,
"display_name": display,
"last_timestamp": ts,
"unread": unread or 0,
})
return groups
def resolve_group_username(self, group_name: str) -> str | None:
return resolve_username(group_name, self._app.cache, self._app.decrypted_dir)
def display_name(self, username: str) -> str:
return self._names.get(username, username)
def query_new_messages(self, username: str, after_ts: int, limit: int = 200) -> list[dict]:
"""增量查询某群新消息create_time > after_ts按时间升序返回结构化数据"""
tables = _find_msg_tables_for_user(
username, self._app.msg_db_keys, self._app.cache
)
if not tables:
return []
group_name = self._names.get(username, username)
is_group = "@chatroom" in username
all_messages = []
for table_info in tables:
# 优化:跳过 max_create_time <= after_ts 的表
if after_ts and table_info["max_create_time"] <= after_ts:
continue
db_path = table_info["db_path"]
table_name = table_info["table_name"]
if not _is_safe_msg_table_name(table_name):
continue
try:
conn = sqlite3.connect(db_path, timeout=5)
try:
id_to_username = _load_name2id_maps(conn)
has_server_id = _table_has_column(conn, table_name, "server_id")
# 自定义查询ORDER BY create_time ASC
clauses, params = _build_message_filters(start_ts=after_ts)
# 改为严格大于(排除已入库的那条)
if clauses:
clauses[0] = "create_time > ?"
where_sql = f"WHERE {' AND '.join(clauses)}" if clauses else ""
extra_col = ", server_id" if has_server_id else ""
sql = f"""
SELECT local_id, local_type, create_time, real_sender_id,
message_content, WCDB_CT_message_content{extra_col}
FROM [{table_name}]
{where_sql}
ORDER BY create_time ASC
LIMIT ?
"""
rows = conn.execute(sql, (*params, limit)).fetchall()
for row in rows:
msg = self._parse_row(
row, username, group_name, is_group,
id_to_username, db_path, has_server_id
)
if msg:
all_messages.append(msg)
finally:
conn.close()
except Exception as e:
log.warning("查询 %s%s 失败: %s", username, db_path, e)
# 跨表合并后按时间排序,截断到 limit
all_messages.sort(key=lambda m: m["create_time"])
return all_messages[:limit]
def _parse_row(self, row, username, group_name, is_group, id_to_username, db_path, has_server_id=False):
"""解析单条消息原始行为结构化 dict"""
local_id, local_type, create_time, real_sender_id, content_raw, ct = row[:6]
server_id = row[6] if has_server_id and len(row) > 6 else None
content_raw = decompress_content(content_raw, ct)
if content_raw is None:
content_raw = ""
# 解析发送者和消息内容
sender_from_content, text = _parse_message_content(content_raw, local_type, is_group)
# 解析发送者
sender_username = id_to_username.get(real_sender_id, "")
if not sender_username and sender_from_content:
sender_username = sender_from_content
sender_name = self._names.get(sender_username, sender_username)
# 解析消息类型
base_type, sub_type = _split_msg_type(local_type)
msg_type = _TYPE_MAP.get(base_type, "other")
if base_type == 49 and sub_type == 6:
msg_type = "file"
# content 处理:文本消息存原文,非文本消息提取元信息
# 注意:群聊 content_raw 格式为 "wxid:\n<msg>...",用 text剥离发送者后的部分解析 XML
xml_content = text if text else content_raw
refer_msg_svrid = None
if base_type == 1:
final_content = text
elif base_type == 49:
# appmsg 类型:文件(6)、链接(5)、小程序(33/36)、聊天记录(19)、引用(57) 等
meta = _extract_appmsg_meta(xml_content)
if meta and meta["app_type"] == 19:
# 聊天记录合并转发
final_content = _extract_chat_record(xml_content) or meta.get("title", "")
elif meta and meta["app_type"] == 57:
# 引用/回复消息
refer_msg_svrid = meta.get("refer_svrid")
quote_text = meta.get("title") or "[引用消息]"
ref_name = meta.get("refer_displayname", "")
ref_content = meta.get("refer_content", "")
if len(ref_content) > 160:
ref_content = ref_content[:160] + "..."
if ref_content:
prefix = f"回复 {ref_name}: " if ref_name else "回复: "
quote_text += f"\n{prefix}{ref_content}"
final_content = quote_text
log.debug("引用消息: refer_svrid=%s, title=%s", refer_msg_svrid, meta.get("title", ""))
elif meta:
if meta["app_type"] == 6:
final_content = _format_file_content(meta)
elif meta.get("title"):
final_content = meta["title"]
if meta.get("des"):
final_content += f" - {meta['des']}"
else:
final_content = ""
log.debug("appmsg 元信息: type=%d, title=%s", meta["app_type"], meta.get("title", ""))
else:
final_content = ""
elif base_type == 43:
final_content = _extract_video_meta(xml_content)
elif base_type == 34:
final_content = "[语音]"
elif base_type == 3:
img_meta = _extract_image_meta(xml_content)
if img_meta:
final_content = f"[图片] {img_meta['md5']}"
if img_meta["length"]:
final_content += f" size:{img_meta['length']}"
else:
final_content = "[图片]"
elif base_type == 47:
final_content = "[表情]"
elif base_type == 48:
final_content = "[位置]"
elif base_type == 42:
final_content = "[名片]"
else:
final_content = text if text else ""
# 解析媒体路径
media_path = None
if msg_type in MEDIA_TYPES and self._app.db_dir:
try:
if msg_type == "image":
media_path = self._resolve_readable_media(
base_type, content_raw, create_time, username
)
if media_path:
log.debug("图片路径解析成功: %s", media_path)
else:
log.info("图片文件未找到 (local_id=%d, ts=%d), 可能未点开",
local_id, create_time)
elif msg_type == "video":
media_path = self._resolve_video_path(content_raw, create_time)
if media_path:
log.debug("视频路径解析成功: %s", media_path)
else:
log.info("视频文件未找到 (local_id=%d, ts=%d), 可能未下载",
local_id, create_time)
elif msg_type == "file":
media_path = self._resolve_msg_file(final_content, create_time)
if media_path:
log.debug("文件路径解析成功: %s", media_path)
else:
log.info("文件未找到 (local_id=%d, ts=%d), 可能未下载",
local_id, create_time)
elif msg_type == "voice":
media_path = self._resolve_readable_media(
base_type, content_raw, create_time, username
)
if media_path:
log.debug("语音路径解析成功: %s", media_path)
else:
log.info("语音文件未找到 (local_id=%d, ts=%d)",
local_id, create_time)
except Exception as e:
log.warning("媒体路径解析异常 (local_id=%d, type=%s): %s",
local_id, msg_type, e)
return {
"group_username": username,
"group_name": group_name,
"local_id": local_id,
"local_type": local_type,
"create_time": create_time,
"sender_username": sender_username,
"sender_name": sender_name,
"msg_type": msg_type,
"content": final_content,
"media_path": media_path,
"source_db": os.path.basename(db_path),
"svr_msg_id": server_id,
"refer_msg_svrid": refer_msg_svrid,
}
def _resolve_msg_file(self, content: str, create_time: int) -> str | None:
"""在 msg/file/YYYY-MM/ 中按文件名查找(文件/视频/音频等都在此目录)"""
wechat_base = os.path.dirname(self._app.db_dir)
dt = datetime.fromtimestamp(create_time)
date_prefix = dt.strftime("%Y-%m")
file_dir = os.path.join(wechat_base, "msg", "file", date_prefix)
if not os.path.isdir(file_dir):
return None
# 从 content 提取文件名: "filename.ext (1.2MB)" 或 "[视频] 30秒" 等
title = (content or "").split(" (")[0].strip()
# 去掉前缀标记如 [视频]、[语音]
for prefix in ("[视频]", "[语音]"):
if title.startswith(prefix):
title = title[len(prefix):].strip()
break
if not title:
return None
# 精确匹配
target = os.path.join(file_dir, title)
if os.path.isfile(target):
return target
# 模糊匹配
for f in os.listdir(file_dir):
fp = os.path.join(file_dir, f)
if not os.path.isfile(fp):
continue
if title in f or f in title:
return fp
return None
def _resolve_readable_media(self, base_type: int, content: str, create_time: int, chat_username: str) -> str | None:
"""解析图片/语音的本地文件路径。
图片: 从 temp/RWTemp/YYYY-MM/ 中按 md5 查找原始图片。
语音: 只返回已解密的可读文件。
"""
wechat_base = os.path.dirname(self._app.db_dir)
dt = datetime.fromtimestamp(create_time)
date_prefix = dt.strftime("%Y-%m")
if base_type == 3:
img_meta = _extract_image_meta(content)
if not img_meta:
return None
rwtemp_dir = os.path.join(wechat_base, "temp", "RWTemp", date_prefix)
if not os.path.isdir(rwtemp_dir):
return None
target_size = img_meta["length"]
for f in os.listdir(rwtemp_dir):
fp = os.path.join(rwtemp_dir, f)
ext = os.path.splitext(f)[1].lower()
if ext not in (".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"):
continue
if not os.path.isfile(fp):
continue
if target_size and os.path.getsize(fp) == target_size:
return fp
if _file_md5(fp) == img_meta["md5"]:
return fp
return None
else:
# 语音:只返回已解密的可读文件
msg_dir = os.path.join(wechat_base, "msg")
attach_dir = os.path.join(msg_dir, "attach")
readable_exts = (".mp3", ".wav", ".amr", ".silk", ".m4a", ".ogg")
search_hashes = []
if chat_username:
h = hashlib.md5(chat_username.encode()).hexdigest()
candidate = os.path.join(attach_dir, h)
if os.path.isdir(candidate):
search_hashes.append(h)
if not search_hashes and os.path.isdir(attach_dir):
search_hashes = [
d for d in os.listdir(attach_dir)
if os.path.isdir(os.path.join(attach_dir, d))
]
for h in search_hashes:
sub = os.path.join(attach_dir, h, date_prefix, "Voice")
if not os.path.isdir(sub):
continue
for f in os.listdir(sub):
fp = os.path.join(sub, f)
if not os.path.isfile(fp):
continue
if f.endswith(".dat"):
continue
if f.lower().endswith(readable_exts):
return fp
return None
def _resolve_video_path(self, content: str, create_time: int) -> str | None:
"""在 msg/video/YYYY-MM/ 中按 rawmd5 查找 mp4"""
wechat_base = os.path.dirname(self._app.db_dir)
video_dir = os.path.join(wechat_base, "msg", "video")
if not os.path.isdir(video_dir):
return None
rawmd5 = None
md5_m = re.search(r'rawmd5="([a-f0-9]+)"', content or "")
if md5_m:
rawmd5 = md5_m.group(1)
if not rawmd5:
return None
dt = datetime.fromtimestamp(create_time)
date_prefix = dt.strftime("%Y-%m")
month_dir = os.path.join(video_dir, date_prefix)
if not os.path.isdir(month_dir):
return None
for f in os.listdir(month_dir):
if rawmd5 not in f:
continue
fp = os.path.join(month_dir, f)
if f.endswith(".mp4") and os.path.isfile(fp):
return fp
return None