"""微信数据适配层 — 封装 wechat_cli.core,提供群聊发现和增量消息查询""" import hashlib import logging import os import re import sqlite3 import xml.etree.ElementTree as ET from contextlib import closing from datetime import datetime from wechat_cli.core.context import AppContext from wechat_cli.core.contacts import ( get_contact_names, resolve_username, display_name_for_username, ) from wechat_cli.core.messages import ( find_msg_db_keys, _find_msg_tables_for_user, _is_safe_msg_table_name, _build_message_filters, decompress_content, _parse_message_content, _split_msg_type, format_msg_type, _load_name2id_maps, ) log = logging.getLogger(__name__) # 消息 base_type → 简化类型名 _TYPE_MAP = { 1: "text", 3: "image", 34: "voice", 42: "contact_card", 43: "video", 47: "sticker", 48: "location", 49: "link", 50: "call", 10000: "system", 10002: "revoked", } # 需要上传 COS 的媒体类型(image/voice 只上传已解密的可读文件,跳过 .dat) MEDIA_TYPES = {"video", "file", "image", "voice"} def _table_has_column(conn, table_name: str, column_name: str) -> bool: """检查 SQLite 表是否包含指定列""" try: cols = conn.execute(f"PRAGMA table_info([{table_name}])").fetchall() return any(row[1] == column_name for row in cols) except Exception: return False def _extract_appmsg_meta(content_xml: str) -> dict | None: """从 type=49 消息的 XML 中提取 appmsg 元信息(文件名、大小、类型等) 返回 dict: {title, des, file_size, file_ext, app_type, refer_svrid, refer_displayname, refer_content} 或 None """ if not content_xml: return None try: root = ET.fromstring(content_xml) if content_xml.lstrip().startswith("<") else None if root is None: return None appmsg = root.find(".//appmsg") if appmsg is None: return None app_type = int((appmsg.findtext("type") or "0").strip()) title = (appmsg.findtext("title") or "").strip() des = (appmsg.findtext("des") or "").strip() # 附件信息 attach = appmsg.find("appattach") file_size = 0 file_ext = "" if attach is not None: file_size = int((attach.findtext("totallen") or "0").strip()) file_ext = (attach.findtext("fileext") or "").strip() result = { "app_type": app_type, "title": title, "des": des, "file_size": file_size, "file_ext": file_ext, } # 引用消息 (app_type=57): 提取 refermsg 中的 svrid 和被引用内容 if app_type == 57: ref = appmsg.find(".//refermsg") if ref is not None: svrid_text = (ref.findtext("svrid") or "").strip() if svrid_text: try: result["refer_svrid"] = int(svrid_text) except ValueError: pass result["refer_displayname"] = (ref.findtext("displayname") or "").strip() result["refer_content"] = (ref.findtext("content") or "").strip() return result except Exception: return None def _format_file_content(meta: dict) -> str: """将文件元信息格式化为可读的 content 字符串""" parts = [] if meta.get("title"): parts.append(meta["title"]) size = meta.get("file_size", 0) if size > 0: if size >= 1024 * 1024: parts.append(f"({size / 1024 / 1024:.1f}MB)") elif size >= 1024: parts.append(f"({size / 1024:.1f}KB)") else: parts.append(f"({size}B)") parts.append(f"size:{size}") return " ".join(parts) def _extract_video_meta(content_xml: str) -> str: """从视频消息 XML 中提取描述信息,包含匹配所需的 rawmd5 和 size 直接发送的视频: rawmd5 为空, rawlength=0, 但 length 为实际文件大小 文件方式发送的视频: rawmd5 和 rawlength 有值 """ if not content_xml: return "[视频]" try: root = ET.fromstring(content_xml) if content_xml.lstrip().startswith("<") else None if root is None: return "[视频]" video = root.find(".//videomsg") if video is not None: playlength = video.get("playlength", "") rawmd5 = video.get("rawmd5", "") rawlength = video.get("rawlength", "0") length = video.get("length", "0") parts = ["[视频]"] if playlength: parts.append(f"{playlength}秒") if rawmd5: parts.append(f"rawmd5:{rawmd5}") # 优先使用 rawlength(文件发送),回退到 length(直接发送) effective_size = rawlength if rawlength and rawlength != "0" else length if effective_size and effective_size != "0": parts.append(f"size:{effective_size}") return " ".join(parts) return "[视频]" except Exception: return "[视频]" def _extract_voice_meta(content_xml: str) -> str: """从语音消息 XML 中提取时长和数据大小""" if not content_xml: return "[语音]" try: root = ET.fromstring(content_xml) if content_xml.lstrip().startswith("<") else None if root is None: return "[语音]" voice = root.find(".//voicemsg") if voice is not None: voicelength = voice.get("voicelength", "") length = voice.get("length", "") parts = ["[语音]"] if voicelength: dur_ms = int(voicelength) parts.append(f"{(dur_ms + 500) // 1000}秒") if length: parts.append(f"size:{length}") return " ".join(parts) return "[语音]" except Exception: return "[语音]" def _extract_chat_record(content_xml: str) -> str | None: """从 type=49, app_type=19 的聊天记录消息中提取纯文本 格式: [聊天记录] 标题 发送者A: 消息内容 发送者B: 消息内容 ... """ if not content_xml: return None try: root = ET.fromstring(content_xml) if content_xml.lstrip().startswith("<") else None if root is None: return None appmsg = root.find(".//appmsg") if appmsg is None: return None title = (appmsg.findtext("title") or "聊天记录").strip() lines = [f"[聊天记录] {title}"] # recorditem 内嵌了一段 XML 字符串 recorditem_text = appmsg.findtext("recorditem") or "" if not recorditem_text.strip(): return lines[0] if lines else None rec_root = ET.fromstring(recorditem_text) for item in rec_root.findall(".//datalist/dataitem"): sender = (item.findtext("sourcename") or "").strip() # datatitle 是聊天内容,datadesc 是附加描述 msg_text = (item.findtext("datatitle") or "").strip() if not msg_text: msg_text = (item.findtext("datadesc") or "").strip() if not msg_text: # 可能是图片/视频等非文本 data_type = item.get("datatype", "") if data_type == "2": msg_text = "[图片]" elif data_type == "4": msg_text = "[视频]" elif data_type == "6": msg_text = "[文件]" else: msg_text = "[其他]" if sender: lines.append(f"{sender}: {msg_text}") else: lines.append(msg_text) return "\n".join(lines) except Exception: return None def _extract_image_meta(content_xml: str) -> dict | None: """从图片消息 XML 中提取 md5 和 length 属性""" if not content_xml: return None md5_m = re.search(r'\bmd5="([a-fA-F0-9]{32})"', content_xml) len_m = re.search(r'\blength="(\d+)"', content_xml) if not md5_m: return None return { "md5": md5_m.group(1).lower(), "length": int(len_m.group(1)) if len_m else 0, } def _file_md5(filepath: str) -> str: """计算文件内容的 md5""" h = hashlib.md5() with open(filepath, "rb") as f: for chunk in iter(lambda: f.read(8192), b""): h.update(chunk) return h.hexdigest() class WeChatAdapter: def __init__(self): self._app = AppContext() self._names = get_contact_names(self._app.cache, self._app.decrypted_dir) @property def db_dir(self): return self._app.db_dir def refresh_names(self): """刷新联系人名称缓存(全局单例需要重置)""" import wechat_cli.core.contacts as _c _c._contact_names = None _c._contact_full = None self._names = get_contact_names(self._app.cache, self._app.decrypted_dir) def list_group_sessions(self, limit=500) -> list[dict]: """列出所有群聊会话""" path = self._app.cache.get(os.path.join("session", "session.db")) if not path: log.error("无法解密 session.db") return [] with closing(sqlite3.connect(path)) as conn: rows = conn.execute(""" SELECT username, unread_count, summary, last_timestamp, last_msg_type, last_msg_sender, last_sender_display_name FROM SessionTable WHERE last_timestamp > 0 ORDER BY last_timestamp DESC LIMIT ? """, (limit,)).fetchall() groups = [] for r in rows: username, unread, summary, ts, msg_type, sender, sender_name = r if "@chatroom" not in username: continue display = self._names.get(username, username) groups.append({ "username": username, "display_name": display, "last_timestamp": ts, "unread": unread or 0, }) return groups def resolve_group_username(self, group_name: str) -> str | None: return resolve_username(group_name, self._app.cache, self._app.decrypted_dir) def display_name(self, username: str) -> str: return self._names.get(username, username) def query_new_messages(self, username: str, after_ts: int, limit: int = 200) -> list[dict]: """增量查询某群新消息(create_time > after_ts),按时间升序返回结构化数据""" tables = _find_msg_tables_for_user( username, self._app.msg_db_keys, self._app.cache ) if not tables: return [] group_name = self._names.get(username, username) is_group = "@chatroom" in username all_messages = [] for table_info in tables: # 优化:跳过 max_create_time <= after_ts 的表 if after_ts and table_info["max_create_time"] <= after_ts: continue db_path = table_info["db_path"] table_name = table_info["table_name"] if not _is_safe_msg_table_name(table_name): continue try: conn = sqlite3.connect(db_path, timeout=5) try: id_to_username = _load_name2id_maps(conn) has_server_id = _table_has_column(conn, table_name, "server_id") # 自定义查询:ORDER BY create_time ASC clauses, params = _build_message_filters(start_ts=after_ts) # 改为严格大于(排除已入库的那条) if clauses: clauses[0] = "create_time > ?" where_sql = f"WHERE {' AND '.join(clauses)}" if clauses else "" extra_col = ", server_id" if has_server_id else "" sql = f""" SELECT local_id, local_type, create_time, real_sender_id, message_content, WCDB_CT_message_content{extra_col} FROM [{table_name}] {where_sql} ORDER BY create_time ASC LIMIT ? """ rows = conn.execute(sql, (*params, limit)).fetchall() for row in rows: msg = self._parse_row( row, username, group_name, is_group, id_to_username, db_path, has_server_id ) if msg: all_messages.append(msg) finally: conn.close() except Exception as e: log.warning("查询 %s 的 %s 失败: %s", username, db_path, e) # 跨表合并后按时间排序,截断到 limit all_messages.sort(key=lambda m: m["create_time"]) return all_messages[:limit] def _parse_row(self, row, username, group_name, is_group, id_to_username, db_path, has_server_id=False): """解析单条消息原始行为结构化 dict""" local_id, local_type, create_time, real_sender_id, content_raw, ct = row[:6] server_id = row[6] if has_server_id and len(row) > 6 else None content_raw = decompress_content(content_raw, ct) if content_raw is None: content_raw = "" # 解析发送者和消息内容 sender_from_content, text = _parse_message_content(content_raw, local_type, is_group) # 解析发送者 sender_username = id_to_username.get(real_sender_id, "") if not sender_username and sender_from_content: sender_username = sender_from_content sender_name = self._names.get(sender_username, sender_username) # 解析消息类型 base_type, sub_type = _split_msg_type(local_type) msg_type = _TYPE_MAP.get(base_type, "other") if base_type == 49 and sub_type == 6: msg_type = "file" # content 处理:文本消息存原文,非文本消息提取元信息 # 注意:群聊 content_raw 格式为 "wxid:\n...",用 text(剥离发送者后的部分)解析 XML xml_content = text if text else content_raw refer_msg_svrid = None if base_type == 1: final_content = text elif base_type == 49: # appmsg 类型:文件(6)、链接(5)、小程序(33/36)、聊天记录(19)、引用(57) 等 meta = _extract_appmsg_meta(xml_content) if meta and meta["app_type"] == 19: # 聊天记录合并转发 final_content = _extract_chat_record(xml_content) or meta.get("title", "") elif meta and meta["app_type"] == 57: # 引用/回复消息 refer_msg_svrid = meta.get("refer_svrid") quote_text = meta.get("title") or "[引用消息]" ref_name = meta.get("refer_displayname", "") ref_content = meta.get("refer_content", "") if len(ref_content) > 160: ref_content = ref_content[:160] + "..." if ref_content: prefix = f"回复 {ref_name}: " if ref_name else "回复: " quote_text += f"\n ↳ {prefix}{ref_content}" final_content = quote_text log.debug("引用消息: refer_svrid=%s, title=%s", refer_msg_svrid, meta.get("title", "")) elif meta: if meta["app_type"] == 6: final_content = _format_file_content(meta) elif meta.get("title"): final_content = meta["title"] if meta.get("des"): final_content += f" - {meta['des']}" else: final_content = "" log.debug("appmsg 元信息: type=%d, title=%s", meta["app_type"], meta.get("title", "")) else: final_content = "" elif base_type == 43: final_content = _extract_video_meta(xml_content) elif base_type == 34: final_content = _extract_voice_meta(xml_content) elif base_type == 3: img_meta = _extract_image_meta(xml_content) if img_meta: final_content = f"[图片] {img_meta['md5']}" if img_meta["length"]: final_content += f" size:{img_meta['length']}" else: final_content = "[图片]" elif base_type == 47: final_content = "[表情]" elif base_type == 48: final_content = "[位置]" elif base_type == 42: final_content = "[名片]" else: final_content = text if text else "" # 解析媒体路径 media_path = None if msg_type in MEDIA_TYPES and self._app.db_dir: try: if msg_type == "image": media_path = self._resolve_readable_media( base_type, content_raw, create_time, username ) if media_path: log.debug("图片路径解析成功: %s", media_path) else: log.info("图片文件未找到 (local_id=%d, ts=%d), 可能未点开", local_id, create_time) elif msg_type == "video": media_path = self._resolve_video_path(content_raw, create_time) if media_path: log.debug("视频路径解析成功: %s", media_path) else: log.info("视频文件未找到 (local_id=%d, ts=%d), 可能未下载", local_id, create_time) elif msg_type == "file": media_path = self._resolve_msg_file(final_content, create_time) if media_path: log.debug("文件路径解析成功: %s", media_path) else: log.info("文件未找到 (local_id=%d, ts=%d), 可能未下载", local_id, create_time) elif msg_type == "voice": media_path = self._resolve_readable_media( base_type, content_raw, create_time, username ) if media_path: log.debug("语音路径解析成功: %s", media_path) else: log.info("语音文件未找到 (local_id=%d, ts=%d)", local_id, create_time) except Exception as e: log.warning("媒体路径解析异常 (local_id=%d, type=%s): %s", local_id, msg_type, e) return { "group_username": username, "group_name": group_name, "local_id": local_id, "local_type": local_type, "create_time": create_time, "sender_username": sender_username, "sender_name": sender_name, "msg_type": msg_type, "content": final_content, "media_path": media_path, "source_db": os.path.basename(db_path), "svr_msg_id": server_id, "refer_msg_svrid": refer_msg_svrid, } def _resolve_msg_file(self, content: str, create_time: int) -> str | None: """在 msg/file/YYYY-MM/ 中按文件名查找(文件/视频/音频等都在此目录)""" wechat_base = os.path.dirname(self._app.db_dir) dt = datetime.fromtimestamp(create_time) date_prefix = dt.strftime("%Y-%m") file_dir = os.path.join(wechat_base, "msg", "file", date_prefix) if not os.path.isdir(file_dir): return None # 从 content 提取文件名: "filename.ext (1.2MB)" 或 "[视频] 30秒" 等 title = (content or "").split(" (")[0].strip() # 去掉前缀标记如 [视频]、[语音] for prefix in ("[视频]", "[语音]"): if title.startswith(prefix): title = title[len(prefix):].strip() break if not title: return None # 精确匹配 target = os.path.join(file_dir, title) if os.path.isfile(target): return target # 模糊匹配 for f in os.listdir(file_dir): fp = os.path.join(file_dir, f) if not os.path.isfile(fp): continue if title in f or f in title: return fp return None def _resolve_readable_media(self, base_type: int, content: str, create_time: int, chat_username: str) -> str | None: """解析图片/语音的本地文件路径。 图片: 从 temp/RWTemp/YYYY-MM/ 中按 md5 查找原始图片。 语音: 只返回已解密的可读文件。 """ wechat_base = os.path.dirname(self._app.db_dir) dt = datetime.fromtimestamp(create_time) date_prefix = dt.strftime("%Y-%m") if base_type == 3: img_meta = _extract_image_meta(content) if not img_meta: return None rwtemp_dir = os.path.join(wechat_base, "temp", "RWTemp", date_prefix) if not os.path.isdir(rwtemp_dir): return None target_size = img_meta["length"] for f in os.listdir(rwtemp_dir): fp = os.path.join(rwtemp_dir, f) ext = os.path.splitext(f)[1].lower() if ext not in (".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"): continue if not os.path.isfile(fp): continue if target_size and os.path.getsize(fp) == target_size: return fp if _file_md5(fp) == img_meta["md5"]: return fp return None else: # 语音:按文件大小匹配已解密的可读文件 voice_meta = _extract_voice_meta(content) target_size = None m = re.search(r'size:(\d+)', voice_meta) if m: target_size = int(m.group(1)) msg_dir = os.path.join(wechat_base, "msg") attach_dir = os.path.join(msg_dir, "attach") readable_exts = (".mp3", ".wav", ".amr", ".silk", ".m4a", ".ogg") search_hashes = [] if chat_username: h = hashlib.md5(chat_username.encode()).hexdigest() candidate = os.path.join(attach_dir, h) if os.path.isdir(candidate): search_hashes.append(h) if not search_hashes and os.path.isdir(attach_dir): search_hashes = [ d for d in os.listdir(attach_dir) if os.path.isdir(os.path.join(attach_dir, d)) ] size_match = None for h in search_hashes: sub = os.path.join(attach_dir, h, date_prefix, "Voice") if not os.path.isdir(sub): continue for f in os.listdir(sub): fp = os.path.join(sub, f) if not os.path.isfile(fp): continue if f.endswith(".dat"): continue if not f.lower().endswith(readable_exts): continue if target_size and os.path.getsize(fp) == target_size: return fp if size_match is None: size_match = fp return size_match def _resolve_video_path(self, content: str, create_time: int) -> str | None: """在 msg/video/YYYY-MM/ 和 temp/RWTemp/YYYY-MM/ 中按 rawmd5 或 size 查找视频""" wechat_base = os.path.dirname(self._app.db_dir) rawmd5 = None md5_m = re.search(r'rawmd5="([a-f0-9]+)"', content or "") if md5_m: rawmd5 = md5_m.group(1) rawlength = None len_m = re.search(r'rawlength="(\d+)"', content or "") if len_m: rawlength = int(len_m.group(1)) if not rawmd5 and not rawlength: return None dt = datetime.fromtimestamp(create_time) date_prefix = dt.strftime("%Y-%m") video_exts = (".mp4", ".mov", ".avi") search_dirs = [] video_dir = os.path.join(wechat_base, "msg", "video", date_prefix) if os.path.isdir(video_dir): search_dirs.append(video_dir) rwtemp_dir = os.path.join(wechat_base, "temp", "RWTemp", date_prefix) if os.path.isdir(rwtemp_dir): search_dirs.append(rwtemp_dir) if not search_dirs: return None for d in search_dirs: if rawmd5: for f in os.listdir(d): ext = os.path.splitext(f)[1].lower() if ext not in video_exts: continue if rawmd5 in f: fp = os.path.join(d, f) if os.path.isfile(fp): return fp for d in search_dirs: if rawlength: for f in os.listdir(d): ext = os.path.splitext(f)[1].lower() if ext not in video_exts: continue fp = os.path.join(d, f) if os.path.isfile(fp) and os.path.getsize(fp) == rawlength: return fp return None