基于文本大小匹配图片

This commit is contained in:
astro 2026-04-23 12:12:16 +08:00
parent 96f31bb549
commit b1f02cceab
5 changed files with 100 additions and 41 deletions

View File

@ -34,12 +34,12 @@
- 聊天记录合并转发app_type=19解析 recorditem XML格式化为多行纯文本 - 聊天记录合并转发app_type=19解析 recorditem XML格式化为多行纯文本
- 引用/回复消息app_type=57提取 refermsg 中的 svrid 建立消息关联content 格式化为 "正文\n ↳ 回复 发送者: 被引用内容" - 引用/回复消息app_type=57提取 refermsg 中的 svrid 建立消息关联content 格式化为 "正文\n ↳ 回复 发送者: 被引用内容"
- 数据库通过 `svr_msg_id`(微信 server_id`refer_msg_svrid`refermsg/svrid实现引用消息的关联查询 - 数据库通过 `svr_msg_id`(微信 server_id`refer_msg_svrid`refermsg/svrid实现引用消息的关联查询
- 采集器扫描策略: hot=15s, warm=30s, cold 退避至 120sbackoff 1.2),保证 ≤2min 入库 - 采集器扫描策略: hot=5s, warm=10s, cold 退避至 60sbackoff 1.2),保证 ≤1min 入库
- 媒体文件只采集原始可读文件,不做 .dat 解密 - 媒体文件只采集原始可读文件,不做 .dat 解密
- 图片: 从 `temp/RWTemp/YYYY-MM/{md5}.ext` 获取(微信查看图片后自动生成) - 图片: 从 `temp/RWTemp/YYYY-MM/{md5}.ext` 获取(微信查看图片后自动生成)
- 文件: 从 `msg/file/YYYY-MM/filename` 获取 - 文件: 从 `msg/file/YYYY-MM/filename` 获取
- 视频: 从 `msg/video/YYYY-MM/{rawmd5}.mp4` 获取 - 视频: 从 `msg/video/YYYY-MM/{rawmd5}.mp4` 获取
- 图片消息 content 格式为 `[图片] {md5}`md5 是图片原始内容的 hash来自消息 XML 的 md5 属性) - 图片消息 content 格式为 `[图片] {md5} size:{length}`md5 来自消息 XML 的 md5 属性length 为 XML 的 length 属性(= RWTemp 中文件的精确字节数
- **重要**: RWTemp 中的图片文件名是派生 hash与消息 XML 中的 md5/aeskey 等属性均不对应。匹配图片必须通过 `hashlib.md5(文件内容)` 计算后与 XML md5 属性比对,不能用文件名匹配 - **重要**: XML md5 属性并非文件内容的 md5可能是加密前或服务端 hash文件名也是派生 hash。匹配图片的可靠方式是通过 XML length 与文件大小精确比对md5 仅作为兼容旧记录的回退
- 回溯补录: watchdog 监听 temp/RWTemp + msg/file + msg/video 目录 + 每 10 分钟定时扫描 media_url=NULL 记录7天内 - 回溯补录: PollingObserver5s轮询监听 temp/RWTemp + msg/file + msg/video 目录,匹配失败 30s 后自动重试;每 1 分钟定时扫描 media_url=NULL 记录7天内兜底
- 默认日志级别 DEBUG可通过配置文件 `~/.wechat-cli/collect-chats/config.json` 修改 - 默认日志级别 DEBUG可通过配置文件 `~/.wechat-cli/collect-chats/config.json` 修改

View File

@ -15,7 +15,7 @@ import threading
import time import time
from datetime import datetime from datetime import datetime
from watchdog.observers import Observer from watchdog.observers.polling import PollingObserver
from watchdog.events import FileSystemEventHandler, FileCreatedEvent from watchdog.events import FileSystemEventHandler, FileCreatedEvent
from .config import CollectorConfig from .config import CollectorConfig
@ -67,6 +67,14 @@ def _extract_md5_from_content(content: str) -> str | None:
return m.group(1) if m else None return m.group(1) if m else None
def _extract_size_from_content(content: str) -> int | None:
"""从 DB content 字段提取图片文件大小: '[图片] {md5} size:110348'"""
if not content:
return None
m = re.search(r'size:(\d+)', content)
return int(m.group(1)) if m else None
class MediaFileHandler(FileSystemEventHandler): class MediaFileHandler(FileSystemEventHandler):
"""watchdog 事件处理器 — 新文件出现时触发回溯匹配""" """watchdog 事件处理器 — 新文件出现时触发回溯匹配"""
@ -94,10 +102,25 @@ class MediaFileHandler(FileSystemEventHandler):
if _is_in_rwtemp(filepath): if _is_in_rwtemp(filepath):
media_type = "image" media_type = "image"
log.info("[watchdog] 检测到新媒体文件: %s (type=%s)", filepath, media_type) log.info("[watchdog] 检测到新媒体文件: %s (type=%s)", filepath, media_type)
self._backfiller.try_backfill_file(filepath, media_type) matched = self._backfiller.try_backfill_file(filepath, media_type)
if not matched:
log.info("[watchdog] 未匹配到DB记录, 30s后重试: %s", os.path.basename(filepath))
threading.Timer(30.0, self._retry_backfill, args=[filepath, media_type]).start()
except Exception as e: except Exception as e:
log.warning("[watchdog] 处理文件异常 %s: %s", filepath, e) log.warning("[watchdog] 处理文件异常 %s: %s", filepath, e)
def _retry_backfill(self, filepath: str, media_type: str):
try:
if not os.path.isfile(filepath):
return
matched = self._backfiller.try_backfill_file(filepath, media_type)
if matched:
log.info("[watchdog] 重试匹配成功: %s", os.path.basename(filepath))
else:
log.info("[watchdog] 重试仍未匹配: %s, 等待定时扫描兜底", os.path.basename(filepath))
except Exception as e:
log.warning("[watchdog] 重试异常 %s: %s", filepath, e)
class MediaBackfiller: class MediaBackfiller:
"""媒体回溯补录器 """媒体回溯补录器
@ -113,7 +136,7 @@ class MediaBackfiller:
self._cfg = config self._cfg = config
self._cos = cos_uploader self._cos = cos_uploader
self._wechat_base = wechat_base_dir self._wechat_base = wechat_base_dir
self._observer: Observer | None = None self._observer: PollingObserver | None = None
self._scan_thread: threading.Thread | None = None self._scan_thread: threading.Thread | None = None
self._running = False self._running = False
self._upload_lock = threading.Lock() self._upload_lock = threading.Lock()
@ -126,13 +149,13 @@ class MediaBackfiller:
watch_dirs = self._get_watch_dirs() watch_dirs = self._get_watch_dirs()
if watch_dirs: if watch_dirs:
self._observer = Observer() self._observer = PollingObserver(timeout=5)
handler = MediaFileHandler(self) handler = MediaFileHandler(self)
for d in watch_dirs: for d in watch_dirs:
self._observer.schedule(handler, d, recursive=True) self._observer.schedule(handler, d, recursive=True)
log.info("[backfill] 监听目录: %s", d) log.info("[backfill] 监听目录: %s", d)
self._observer.start() self._observer.start()
log.info("[backfill] watchdog 已启动, 监听 %d 个目录", len(watch_dirs)) log.info("[backfill] PollingObserver 已启动, 监听 %d 个目录, 轮询间隔 5s", len(watch_dirs))
else: else:
log.warning("[backfill] 未找到可监听的微信媒体目录") log.warning("[backfill] 未找到可监听的微信媒体目录")
@ -178,11 +201,11 @@ class MediaBackfiller:
return dirs return dirs
def try_backfill_file(self, filepath: str, media_type: str): def try_backfill_file(self, filepath: str, media_type: str) -> bool:
"""尝试将新文件匹配到 DB 中的 pending 记录并上传""" """尝试将新文件匹配到 DB 中的 pending 记录并上传,返回是否匹配成功"""
if not self._cos: if not self._cos:
log.debug("[backfill] COS 未配置, 跳过上传") log.debug("[backfill] COS 未配置, 跳过上传")
return return False
with self._upload_lock: with self._upload_lock:
pending = self._storage.get_pending_media_messages( pending = self._storage.get_pending_media_messages(
@ -191,6 +214,8 @@ class MediaBackfiller:
matched = self._match_file_to_record(filepath, media_type, pending) matched = self._match_file_to_record(filepath, media_type, pending)
if matched: if matched:
self._do_upload_and_update(filepath, matched) self._do_upload_and_update(filepath, matched)
return True
return False
def _match_file_to_record(self, filepath: str, media_type: str, def _match_file_to_record(self, filepath: str, media_type: str,
records: list[dict]) -> dict | None: records: list[dict]) -> dict | None:
@ -203,18 +228,34 @@ class MediaBackfiller:
return self._match_file(filepath, records) return self._match_file(filepath, records)
def _match_image(self, filepath: str, records: list[dict]) -> dict | None: def _match_image(self, filepath: str, records: list[dict]) -> dict | None:
"""图片匹配: 计算文件内容 md5匹配 DB content 中的 md5""" """图片匹配: 优先按文件大小匹配,回退到内容 md5"""
try:
file_size = os.path.getsize(filepath)
except Exception:
return None
image_records = [r for r in records if r["msg_type"] == "image"]
# 优先: 按文件大小匹配(新格式 content 含 size:NNN
for rec in image_records:
rec_size = _extract_size_from_content(rec.get("content", ""))
if rec_size and rec_size == file_size:
log.info("[backfill] 图片文件大小匹配: %d bytes → record id=%d", file_size, rec["id"])
return rec
# 回退: 按内容 md5 匹配(兼容旧记录)
try: try:
file_hash = _file_md5(filepath) file_hash = _file_md5(filepath)
except Exception: except Exception:
return None file_hash = None
for rec in records: if file_hash:
if rec["msg_type"] != "image": for rec in image_records:
continue
rec_md5 = _extract_md5_from_content(rec.get("content", "")) rec_md5 = _extract_md5_from_content(rec.get("content", ""))
if rec_md5 and rec_md5 == file_hash: if rec_md5 and rec_md5 == file_hash:
log.info("[backfill] 图片内容md5匹配: %s → record id=%d", file_hash, rec["id"]) log.info("[backfill] 图片内容md5匹配: %s → record id=%d", file_hash, rec["id"])
return rec return rec
log.info("[backfill] 图片未匹配: size=%d, 文件=%s, DB中%d条image待匹配",
file_size, os.path.basename(filepath), len(image_records))
return None return None
def _match_file(self, filepath: str, records: list[dict]) -> dict | None: def _match_file(self, filepath: str, records: list[dict]) -> dict | None:
@ -338,9 +379,11 @@ class MediaBackfiller:
return None return None
def _resolve_image_path(self, date_prefix: str, record: dict) -> str | None: def _resolve_image_path(self, date_prefix: str, record: dict) -> str | None:
"""在 temp/RWTemp/YYYY-MM/ 中按文件内容 md5 查找图片""" """在 temp/RWTemp/YYYY-MM/ 中按文件大小或内容 md5 查找图片"""
img_md5 = _extract_md5_from_content(record.get("content", "")) content = record.get("content", "")
if not img_md5: target_size = _extract_size_from_content(content)
img_md5 = _extract_md5_from_content(content)
if not target_size and not img_md5:
return None return None
rwtemp_dir = os.path.join(self._wechat_base, "temp", "RWTemp", date_prefix) rwtemp_dir = os.path.join(self._wechat_base, "temp", "RWTemp", date_prefix)
@ -354,8 +397,10 @@ class MediaBackfiller:
continue continue
if not os.path.isfile(fp): if not os.path.isfile(fp):
continue continue
if target_size and os.path.getsize(fp) == target_size:
return fp
try: try:
if _file_md5(fp) == img_md5: if img_md5 and _file_md5(fp) == img_md5:
return fp return fp
except Exception: except Exception:
continue continue

View File

@ -29,14 +29,14 @@ class CollectorConfig:
cos_base_path: str = os.environ.get("COS_BASE_PATH", "") cos_base_path: str = os.environ.get("COS_BASE_PATH", "")
# ---- 扫描策略 ---- # ---- 扫描策略 ----
min_interval: float = 15.0 # hot 群扫描间隔(秒) min_interval: float = 5.0 # hot 群扫描间隔(秒)
base_interval: float = 30.0 # warm 群扫描间隔 base_interval: float = 10.0 # warm 群扫描间隔
max_interval: float = 120.0 # cold 群最大间隔(保证 ≤2min 入库) max_interval: float = 60.0 # cold 群最大间隔(保证 ≤1min 入库)
backoff_factor: float = 1.2 # cold 退避系数 backoff_factor: float = 1.2 # cold 退避系数
batch_size: int = 10 # 每轮最多扫描群数 batch_size: int = 10 # 每轮最多扫描群数
messages_per_scan: int = 200 # 每群每次最多拉取消息数 messages_per_scan: int = 200 # 每群每次最多拉取消息数
jitter_max: float = 1.0 # 群间随机延迟上限(秒) jitter_max: float = 1.0 # 群间随机延迟上限(秒)
cycle_sleep: float = 5.0 # 轮次间休眠(秒) cycle_sleep: float = 1.0 # 轮次间休眠(秒)
discovery_interval: float = 180.0 # 群聊发现间隔(秒) discovery_interval: float = 180.0 # 群聊发现间隔(秒)
hot_threshold: int = 300 # 最新消息 < N秒 算 hot hot_threshold: int = 300 # 最新消息 < N秒 算 hot
warm_threshold: int = 3600 # 最新消息 < N秒 算 warm warm_threshold: int = 3600 # 最新消息 < N秒 算 warm
@ -46,7 +46,7 @@ class CollectorConfig:
blacklist: list = field(default_factory=list) # 跳过的群(支持正则) blacklist: list = field(default_factory=list) # 跳过的群(支持正则)
# ---- 回溯补录 ---- # ---- 回溯补录 ----
backfill_interval: float = 600.0 # 定时扫描间隔(秒), 默认 10 分钟 backfill_interval: float = 60.0 # 定时扫描间隔(秒), 默认 1 分钟
backfill_lookback_days: int = 7 # 回溯天数 backfill_lookback_days: int = 7 # 回溯天数
backfill_enabled: bool = True # 是否启用回溯补录 backfill_enabled: bool = True # 是否启用回溯补录

View File

@ -189,12 +189,18 @@ def _extract_chat_record(content_xml: str) -> str | None:
return None return None
def _extract_image_md5(content_xml: str) -> str | None: def _extract_image_meta(content_xml: str) -> dict | None:
"""从图片消息 XML 中提取 md5 属性(图片内容 hash""" """从图片消息 XML 中提取 md5 和 length 属性"""
if not content_xml: if not content_xml:
return None return None
m = re.search(r'\bmd5="([a-fA-F0-9]{32})"', content_xml) md5_m = re.search(r'\bmd5="([a-fA-F0-9]{32})"', content_xml)
return m.group(1).lower() if m else None len_m = re.search(r'\blength="(\d+)"', content_xml)
if not md5_m:
return None
return {
"md5": md5_m.group(1).lower(),
"length": int(len_m.group(1)) if len_m else 0,
}
def _file_md5(filepath: str) -> str: def _file_md5(filepath: str) -> str:
@ -388,8 +394,13 @@ class WeChatAdapter:
elif base_type == 34: elif base_type == 34:
final_content = "[语音]" final_content = "[语音]"
elif base_type == 3: elif base_type == 3:
img_md5 = _extract_image_md5(xml_content) img_meta = _extract_image_meta(xml_content)
final_content = f"[图片] {img_md5}" if img_md5 else "[图片]" if img_meta:
final_content = f"[图片] {img_meta['md5']}"
if img_meta["length"]:
final_content += f" size:{img_meta['length']}"
else:
final_content = "[图片]"
elif base_type == 47: elif base_type == 47:
final_content = "[表情]" final_content = "[表情]"
elif base_type == 48: elif base_type == 48:
@ -503,12 +514,13 @@ class WeChatAdapter:
date_prefix = dt.strftime("%Y-%m") date_prefix = dt.strftime("%Y-%m")
if base_type == 3: if base_type == 3:
img_md5 = _extract_image_md5(content) img_meta = _extract_image_meta(content)
if not img_md5: if not img_meta:
return None return None
rwtemp_dir = os.path.join(wechat_base, "temp", "RWTemp", date_prefix) rwtemp_dir = os.path.join(wechat_base, "temp", "RWTemp", date_prefix)
if not os.path.isdir(rwtemp_dir): if not os.path.isdir(rwtemp_dir):
return None return None
target_size = img_meta["length"]
for f in os.listdir(rwtemp_dir): for f in os.listdir(rwtemp_dir):
fp = os.path.join(rwtemp_dir, f) fp = os.path.join(rwtemp_dir, f)
ext = os.path.splitext(f)[1].lower() ext = os.path.splitext(f)[1].lower()
@ -516,7 +528,9 @@ class WeChatAdapter:
continue continue
if not os.path.isfile(fp): if not os.path.isfile(fp):
continue continue
if _file_md5(fp) == img_md5: if target_size and os.path.getsize(fp) == target_size:
return fp
if _file_md5(fp) == img_meta["md5"]:
return fp return fp
return None return None
else: else:

View File

@ -35,9 +35,9 @@ v0.2.4
- [x] 详细扫描日志(每条消息类型/发送者/媒体状态) - [x] 详细扫描日志(每条消息类型/发送者/媒体状态)
- [x] 聊天记录合并转发解析app_type=19 → 多行纯文本) - [x] 聊天记录合并转发解析app_type=19 → 多行纯文本)
- [x] 引用/回复消息关联app_type=57 → svr_msg_id + refer_msg_svrid支持消息间关联查询 - [x] 引用/回复消息关联app_type=57 → svr_msg_id + refer_msg_svrid支持消息间关联查询
- [x] 扫描频率优化hot=15s, warm=30s, cold≤120s保证 ≤2min 入库) - [x] 扫描频率优化hot=5s, warm=10s, cold≤60s保证 ≤1min 入库)
- [x] 默认 DEBUG 日志级别 - [x] 默认 DEBUG 日志级别
- [x] 媒体回溯补录(watchdog 监听 RWTemp + msg/file + 10min 定时兜底扫描,补录 7 天内记录) - [x] 媒体回溯补录(PollingObserver 5s轮询监听 RWTemp + msg/file + msg/video匹配失败30s自动重试 + 1min 定时兜底扫描,补录 7 天内记录)
## 待规划功能 ## 待规划功能
- [ ] 采集数据 Web 查询界面 - [ ] 采集数据 Web 查询界面