update video msg
This commit is contained in:
parent
b1f02cceab
commit
c3ccab77d6
@ -38,8 +38,12 @@
|
|||||||
- 媒体文件只采集原始可读文件,不做 .dat 解密
|
- 媒体文件只采集原始可读文件,不做 .dat 解密
|
||||||
- 图片: 从 `temp/RWTemp/YYYY-MM/{md5}.ext` 获取(微信查看图片后自动生成)
|
- 图片: 从 `temp/RWTemp/YYYY-MM/{md5}.ext` 获取(微信查看图片后自动生成)
|
||||||
- 文件: 从 `msg/file/YYYY-MM/filename` 获取
|
- 文件: 从 `msg/file/YYYY-MM/filename` 获取
|
||||||
- 视频: 从 `msg/video/YYYY-MM/{rawmd5}.mp4` 获取
|
- 视频: 从 `msg/video/YYYY-MM/{rawmd5}.mp4` 获取,或从 `temp/RWTemp/YYYY-MM/{rawmd5}.mp4` 获取(直接发送的视频点击预览后缓存于此)
|
||||||
- 图片消息 content 格式为 `[图片] {md5} size:{length}`,md5 来自消息 XML 的 md5 属性,length 为 XML 的 length 属性(= RWTemp 中文件的精确字节数)
|
- 图片消息 content 格式为 `[图片] {md5} size:{length}`,md5 来自消息 XML 的 md5 属性,length 为 XML 的 length 属性(= RWTemp 中文件的精确字节数)
|
||||||
- **重要**: XML md5 属性并非文件内容的 md5(可能是加密前或服务端 hash),文件名也是派生 hash。匹配图片的可靠方式是通过 XML length 与文件大小精确比对,md5 仅作为兼容旧记录的回退
|
- **重要**: XML md5 属性并非文件内容的 md5(可能是加密前或服务端 hash),文件名也是派生 hash。匹配图片的可靠方式是通过 XML length 与文件大小精确比对,md5 仅作为兼容旧记录的回退
|
||||||
|
- 视频消息 content 格式为 `[视频] {playlength}秒 rawmd5:{rawmd5} size:{rawlength}`,rawmd5 是本地 mp4 文件名,rawlength 是文件字节数。匹配优先 rawmd5(文件名精确匹配),回退 rawlength 与文件大小比对
|
||||||
|
- **注意**: 直接发送的视频 rawmd5 和 rawlength 为空/0,此时使用 XML 的 `length` 属性作为 size(即缓存到 RWTemp 中文件的精确字节数),content 为 `[视频] {playlength}秒 size:{length}`
|
||||||
|
- 语音消息 content 格式为 `[语音] {N}秒 size:{length}`,voicelength 为语音时长(ms→s四舍五入),length 为语音数据字节数。匹配通过文件大小精确比对
|
||||||
|
- 文件消息 content 格式为 `{filename} ({human_size}) size:{exact_bytes}`,匹配优先文件名+精确size双重校验,回退纯文件名子串匹配
|
||||||
- 回溯补录: PollingObserver(5s轮询)监听 temp/RWTemp + msg/file + msg/video 目录,匹配失败 30s 后自动重试;每 1 分钟定时扫描 media_url=NULL 记录(7天内)兜底
|
- 回溯补录: PollingObserver(5s轮询)监听 temp/RWTemp + msg/file + msg/video 目录,匹配失败 30s 后自动重试;每 1 分钟定时扫描 media_url=NULL 记录(7天内)兜底
|
||||||
- 默认日志级别 DEBUG,可通过配置文件 `~/.wechat-cli/collect-chats/config.json` 修改
|
- 默认日志级别 DEBUG,可通过配置文件 `~/.wechat-cli/collect-chats/config.json` 修改
|
||||||
|
|||||||
@ -75,6 +75,22 @@ def _extract_size_from_content(content: str) -> int | None:
|
|||||||
return int(m.group(1)) if m else None
|
return int(m.group(1)) if m else None
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_video_rawmd5(content: str) -> str | None:
|
||||||
|
"""从 DB content 字段提取视频 rawmd5: '[视频] 15秒 rawmd5:abc123...'"""
|
||||||
|
if not content:
|
||||||
|
return None
|
||||||
|
m = re.search(r'rawmd5:([a-f0-9]+)', content)
|
||||||
|
return m.group(1) if m else None
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_video_size(content: str) -> int | None:
|
||||||
|
"""从 DB content 字段提取视频文件大小: '[视频] 15秒 rawmd5:xxx size:12345'"""
|
||||||
|
if not content:
|
||||||
|
return None
|
||||||
|
m = re.search(r'size:(\d+)', content)
|
||||||
|
return int(m.group(1)) if m else None
|
||||||
|
|
||||||
|
|
||||||
class MediaFileHandler(FileSystemEventHandler):
|
class MediaFileHandler(FileSystemEventHandler):
|
||||||
"""watchdog 事件处理器 — 新文件出现时触发回溯匹配"""
|
"""watchdog 事件处理器 — 新文件出现时触发回溯匹配"""
|
||||||
|
|
||||||
@ -98,9 +114,11 @@ class MediaFileHandler(FileSystemEventHandler):
|
|||||||
media_type = _is_readable_media(filepath)
|
media_type = _is_readable_media(filepath)
|
||||||
if not media_type:
|
if not media_type:
|
||||||
return
|
return
|
||||||
# RWTemp 中的都是图片
|
# RWTemp 中:视频格式保持 video,其余为 image
|
||||||
if _is_in_rwtemp(filepath):
|
if _is_in_rwtemp(filepath):
|
||||||
media_type = "image"
|
ext = os.path.splitext(filepath)[1].lower()
|
||||||
|
if ext not in VIDEO_EXTS:
|
||||||
|
media_type = "image"
|
||||||
log.info("[watchdog] 检测到新媒体文件: %s (type=%s)", filepath, media_type)
|
log.info("[watchdog] 检测到新媒体文件: %s (type=%s)", filepath, media_type)
|
||||||
matched = self._backfiller.try_backfill_file(filepath, media_type)
|
matched = self._backfiller.try_backfill_file(filepath, media_type)
|
||||||
if not matched:
|
if not matched:
|
||||||
@ -259,10 +277,12 @@ class MediaBackfiller:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def _match_file(self, filepath: str, records: list[dict]) -> dict | None:
|
def _match_file(self, filepath: str, records: list[dict]) -> dict | None:
|
||||||
"""文件匹配: 通过文件名匹配 DB content 中的标题"""
|
"""文件匹配: 文件名 + size 双重校验,回退纯文件名"""
|
||||||
filename = os.path.basename(filepath)
|
filename = os.path.basename(filepath)
|
||||||
date_prefix = self._extract_date_from_path(filepath)
|
date_prefix = self._extract_date_from_path(filepath)
|
||||||
|
file_size = os.path.getsize(filepath)
|
||||||
|
|
||||||
|
name_matched = None
|
||||||
for rec in records:
|
for rec in records:
|
||||||
if rec["msg_type"] != "file":
|
if rec["msg_type"] != "file":
|
||||||
continue
|
continue
|
||||||
@ -272,15 +292,24 @@ class MediaBackfiller:
|
|||||||
continue
|
continue
|
||||||
content = rec.get("content", "") or ""
|
content = rec.get("content", "") or ""
|
||||||
title = content.split(" (")[0].strip() if content else ""
|
title = content.split(" (")[0].strip() if content else ""
|
||||||
if title and (title in filename or filename in title):
|
if not title or (title not in filename and filename not in title):
|
||||||
log.info("[backfill] 文件匹配: %s → record id=%d", filename, rec["id"])
|
continue
|
||||||
|
rec_size = _extract_video_size(content)
|
||||||
|
if rec_size and rec_size == file_size:
|
||||||
|
log.info("[backfill] 文件匹配(name+size): %s → record id=%d", filename, rec["id"])
|
||||||
return rec
|
return rec
|
||||||
return None
|
if name_matched is None:
|
||||||
|
name_matched = rec
|
||||||
|
|
||||||
|
if name_matched:
|
||||||
|
log.info("[backfill] 文件匹配(name): %s → record id=%d", filename, name_matched["id"])
|
||||||
|
return name_matched
|
||||||
|
|
||||||
def _match_video(self, filepath: str, records: list[dict]) -> dict | None:
|
def _match_video(self, filepath: str, records: list[dict]) -> dict | None:
|
||||||
"""视频匹配: 通过文件名中的 rawmd5 匹配同月的 video 记录"""
|
"""视频匹配: 优先 rawmd5 精确匹配,回退 size 匹配(支持 msg/video 和 RWTemp)"""
|
||||||
filename = os.path.splitext(os.path.basename(filepath))[0].lower()
|
filename = os.path.splitext(os.path.basename(filepath))[0].lower()
|
||||||
date_prefix = self._extract_date_from_path(filepath)
|
date_prefix = self._extract_date_from_path(filepath)
|
||||||
|
file_size = os.path.getsize(filepath)
|
||||||
|
|
||||||
for rec in records:
|
for rec in records:
|
||||||
if rec["msg_type"] != "video":
|
if rec["msg_type"] != "video":
|
||||||
@ -289,9 +318,26 @@ class MediaBackfiller:
|
|||||||
rec_date = datetime.fromtimestamp(rec["msg_timestamp"]).strftime("%Y-%m")
|
rec_date = datetime.fromtimestamp(rec["msg_timestamp"]).strftime("%Y-%m")
|
||||||
if rec_date != date_prefix:
|
if rec_date != date_prefix:
|
||||||
continue
|
continue
|
||||||
if re.fullmatch(r'[a-f0-9]{32}', filename):
|
content = rec.get("content", "") or ""
|
||||||
log.info("[backfill] 视频匹配: %s → record id=%d", filename, rec["id"])
|
rec_rawmd5 = _extract_video_rawmd5(content)
|
||||||
|
if rec_rawmd5 and rec_rawmd5 in filename:
|
||||||
|
log.info("[backfill] 视频匹配(rawmd5): %s → record id=%d", filename, rec["id"])
|
||||||
return rec
|
return rec
|
||||||
|
|
||||||
|
for rec in records:
|
||||||
|
if rec["msg_type"] != "video":
|
||||||
|
continue
|
||||||
|
if date_prefix:
|
||||||
|
rec_date = datetime.fromtimestamp(rec["msg_timestamp"]).strftime("%Y-%m")
|
||||||
|
if rec_date != date_prefix:
|
||||||
|
continue
|
||||||
|
content = rec.get("content", "") or ""
|
||||||
|
rec_size = _extract_video_size(content)
|
||||||
|
if rec_size and rec_size == file_size:
|
||||||
|
log.info("[backfill] 视频匹配(size): %s (%d bytes) → record id=%d",
|
||||||
|
filename, file_size, rec["id"])
|
||||||
|
return rec
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _do_upload_and_update(self, filepath: str, record: dict):
|
def _do_upload_and_update(self, filepath: str, record: dict):
|
||||||
@ -373,9 +419,11 @@ class MediaBackfiller:
|
|||||||
if msg_type == "image":
|
if msg_type == "image":
|
||||||
return self._resolve_image_path(date_prefix, record)
|
return self._resolve_image_path(date_prefix, record)
|
||||||
elif msg_type == "video":
|
elif msg_type == "video":
|
||||||
return self._resolve_video_path(date_prefix)
|
return self._resolve_video_path(date_prefix, record)
|
||||||
elif msg_type == "file":
|
elif msg_type == "file":
|
||||||
return self._resolve_file_path(date_prefix, record)
|
return self._resolve_file_path(date_prefix, record)
|
||||||
|
elif msg_type == "voice":
|
||||||
|
return self._resolve_voice_path(date_prefix, record)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _resolve_image_path(self, date_prefix: str, record: dict) -> str | None:
|
def _resolve_image_path(self, date_prefix: str, record: dict) -> str | None:
|
||||||
@ -406,35 +454,113 @@ class MediaBackfiller:
|
|||||||
continue
|
continue
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _resolve_video_path(self, date_prefix: str) -> str | None:
|
def _resolve_video_path(self, date_prefix: str, record: dict) -> str | None:
|
||||||
"""在 msg/video/YYYY-MM/ 中查找 mp4"""
|
"""在 msg/video/YYYY-MM/ 和 temp/RWTemp/YYYY-MM/ 中按 rawmd5 或 size 查找视频"""
|
||||||
|
content = record.get("content", "") or ""
|
||||||
|
rawmd5 = _extract_video_rawmd5(content)
|
||||||
|
target_size = _extract_video_size(content)
|
||||||
|
|
||||||
|
search_dirs = []
|
||||||
video_dir = os.path.join(self._wechat_base, "msg", "video", date_prefix)
|
video_dir = os.path.join(self._wechat_base, "msg", "video", date_prefix)
|
||||||
if not os.path.isdir(video_dir):
|
if os.path.isdir(video_dir):
|
||||||
|
search_dirs.append(video_dir)
|
||||||
|
rwtemp_dir = os.path.join(self._wechat_base, "temp", "RWTemp", date_prefix)
|
||||||
|
if os.path.isdir(rwtemp_dir):
|
||||||
|
search_dirs.append(rwtemp_dir)
|
||||||
|
|
||||||
|
if not search_dirs:
|
||||||
return None
|
return None
|
||||||
for f in os.listdir(video_dir):
|
|
||||||
if f.endswith(".mp4") and os.path.isfile(os.path.join(video_dir, f)):
|
for d in search_dirs:
|
||||||
return os.path.join(video_dir, f)
|
if rawmd5:
|
||||||
|
for f in os.listdir(d):
|
||||||
|
ext = os.path.splitext(f)[1].lower()
|
||||||
|
if ext not in VIDEO_EXTS:
|
||||||
|
continue
|
||||||
|
if rawmd5 in f:
|
||||||
|
fp = os.path.join(d, f)
|
||||||
|
if os.path.isfile(fp):
|
||||||
|
return fp
|
||||||
|
|
||||||
|
for d in search_dirs:
|
||||||
|
if target_size:
|
||||||
|
for f in os.listdir(d):
|
||||||
|
ext = os.path.splitext(f)[1].lower()
|
||||||
|
if ext not in VIDEO_EXTS:
|
||||||
|
continue
|
||||||
|
fp = os.path.join(d, f)
|
||||||
|
if os.path.isfile(fp) and os.path.getsize(fp) == target_size:
|
||||||
|
return fp
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _resolve_file_path(self, date_prefix: str, record: dict) -> str | None:
|
def _resolve_file_path(self, date_prefix: str, record: dict) -> str | None:
|
||||||
"""在 msg/file/YYYY-MM/ 中按文件名查找"""
|
"""在 msg/file/YYYY-MM/ 中按文件名+size查找"""
|
||||||
content = record.get("content", "") or ""
|
content = record.get("content", "") or ""
|
||||||
title = content.split(" (")[0].strip() if content else ""
|
title = content.split(" (")[0].strip() if content else ""
|
||||||
if not title:
|
if not title:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
target_size = _extract_video_size(content)
|
||||||
|
|
||||||
file_dir = os.path.join(self._wechat_base, "msg", "file", date_prefix)
|
file_dir = os.path.join(self._wechat_base, "msg", "file", date_prefix)
|
||||||
if not os.path.isdir(file_dir):
|
if not os.path.isdir(file_dir):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
target = os.path.join(file_dir, title)
|
target = os.path.join(file_dir, title)
|
||||||
if os.path.isfile(target):
|
if os.path.isfile(target):
|
||||||
return target
|
if target_size and os.path.getsize(target) == target_size:
|
||||||
|
return target
|
||||||
|
if not target_size:
|
||||||
|
return target
|
||||||
|
|
||||||
|
name_match = None
|
||||||
for f in os.listdir(file_dir):
|
for f in os.listdir(file_dir):
|
||||||
if title in f or f in title:
|
if title not in f and f not in title:
|
||||||
fp = os.path.join(file_dir, f)
|
continue
|
||||||
if os.path.isfile(fp):
|
fp = os.path.join(file_dir, f)
|
||||||
|
if not os.path.isfile(fp):
|
||||||
|
continue
|
||||||
|
if target_size and os.path.getsize(fp) == target_size:
|
||||||
|
return fp
|
||||||
|
if name_match is None:
|
||||||
|
name_match = fp
|
||||||
|
return name_match
|
||||||
|
|
||||||
|
def _resolve_voice_path(self, date_prefix: str, record: dict) -> str | None:
|
||||||
|
"""在 msg/attach/{hash}/YYYY-MM/Voice/ 中按 size 查找语音"""
|
||||||
|
content = record.get("content", "") or ""
|
||||||
|
target_size = _extract_video_size(content)
|
||||||
|
|
||||||
|
attach_dir = os.path.join(self._wechat_base, "msg", "attach")
|
||||||
|
if not os.path.isdir(attach_dir):
|
||||||
|
return None
|
||||||
|
|
||||||
|
readable_exts = (".mp3", ".wav", ".amr", ".silk", ".m4a", ".ogg")
|
||||||
|
group_username = record.get("group_username", "")
|
||||||
|
search_hashes = []
|
||||||
|
if group_username:
|
||||||
|
h = hashlib.md5(group_username.encode()).hexdigest()
|
||||||
|
candidate = os.path.join(attach_dir, h)
|
||||||
|
if os.path.isdir(candidate):
|
||||||
|
search_hashes.append(h)
|
||||||
|
if not search_hashes:
|
||||||
|
search_hashes = [
|
||||||
|
d for d in os.listdir(attach_dir)
|
||||||
|
if os.path.isdir(os.path.join(attach_dir, d))
|
||||||
|
]
|
||||||
|
|
||||||
|
for h in search_hashes:
|
||||||
|
voice_dir = os.path.join(attach_dir, h, date_prefix, "Voice")
|
||||||
|
if not os.path.isdir(voice_dir):
|
||||||
|
continue
|
||||||
|
for f in os.listdir(voice_dir):
|
||||||
|
fp = os.path.join(voice_dir, f)
|
||||||
|
if not os.path.isfile(fp) or f.endswith(".dat"):
|
||||||
|
continue
|
||||||
|
if not f.lower().endswith(readable_exts):
|
||||||
|
continue
|
||||||
|
if target_size and os.path.getsize(fp) == target_size:
|
||||||
return fp
|
return fp
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
@ -111,11 +111,16 @@ def _format_file_content(meta: dict) -> str:
|
|||||||
parts.append(f"({size / 1024:.1f}KB)")
|
parts.append(f"({size / 1024:.1f}KB)")
|
||||||
else:
|
else:
|
||||||
parts.append(f"({size}B)")
|
parts.append(f"({size}B)")
|
||||||
|
parts.append(f"size:{size}")
|
||||||
return " ".join(parts)
|
return " ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
def _extract_video_meta(content_xml: str) -> str:
|
def _extract_video_meta(content_xml: str) -> str:
|
||||||
"""从视频消息 XML 中提取描述信息"""
|
"""从视频消息 XML 中提取描述信息,包含匹配所需的 rawmd5 和 size
|
||||||
|
|
||||||
|
直接发送的视频: rawmd5 为空, rawlength=0, 但 length 为实际文件大小
|
||||||
|
文件方式发送的视频: rawmd5 和 rawlength 有值
|
||||||
|
"""
|
||||||
if not content_xml:
|
if not content_xml:
|
||||||
return "[视频]"
|
return "[视频]"
|
||||||
try:
|
try:
|
||||||
@ -124,16 +129,49 @@ def _extract_video_meta(content_xml: str) -> str:
|
|||||||
return "[视频]"
|
return "[视频]"
|
||||||
video = root.find(".//videomsg")
|
video = root.find(".//videomsg")
|
||||||
if video is not None:
|
if video is not None:
|
||||||
length = video.get("length", "")
|
playlength = video.get("playlength", "")
|
||||||
raw_length = video.get("rawlength", "")
|
rawmd5 = video.get("rawmd5", "")
|
||||||
dur = length or raw_length
|
rawlength = video.get("rawlength", "0")
|
||||||
if dur:
|
length = video.get("length", "0")
|
||||||
return f"[视频] {dur}秒"
|
parts = ["[视频]"]
|
||||||
|
if playlength:
|
||||||
|
parts.append(f"{playlength}秒")
|
||||||
|
if rawmd5:
|
||||||
|
parts.append(f"rawmd5:{rawmd5}")
|
||||||
|
# 优先使用 rawlength(文件发送),回退到 length(直接发送)
|
||||||
|
effective_size = rawlength if rawlength and rawlength != "0" else length
|
||||||
|
if effective_size and effective_size != "0":
|
||||||
|
parts.append(f"size:{effective_size}")
|
||||||
|
return " ".join(parts)
|
||||||
return "[视频]"
|
return "[视频]"
|
||||||
except Exception:
|
except Exception:
|
||||||
return "[视频]"
|
return "[视频]"
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_voice_meta(content_xml: str) -> str:
|
||||||
|
"""从语音消息 XML 中提取时长和数据大小"""
|
||||||
|
if not content_xml:
|
||||||
|
return "[语音]"
|
||||||
|
try:
|
||||||
|
root = ET.fromstring(content_xml) if content_xml.lstrip().startswith("<") else None
|
||||||
|
if root is None:
|
||||||
|
return "[语音]"
|
||||||
|
voice = root.find(".//voicemsg")
|
||||||
|
if voice is not None:
|
||||||
|
voicelength = voice.get("voicelength", "")
|
||||||
|
length = voice.get("length", "")
|
||||||
|
parts = ["[语音]"]
|
||||||
|
if voicelength:
|
||||||
|
dur_ms = int(voicelength)
|
||||||
|
parts.append(f"{(dur_ms + 500) // 1000}秒")
|
||||||
|
if length:
|
||||||
|
parts.append(f"size:{length}")
|
||||||
|
return " ".join(parts)
|
||||||
|
return "[语音]"
|
||||||
|
except Exception:
|
||||||
|
return "[语音]"
|
||||||
|
|
||||||
|
|
||||||
def _extract_chat_record(content_xml: str) -> str | None:
|
def _extract_chat_record(content_xml: str) -> str | None:
|
||||||
"""从 type=49, app_type=19 的聊天记录消息中提取纯文本
|
"""从 type=49, app_type=19 的聊天记录消息中提取纯文本
|
||||||
|
|
||||||
@ -392,7 +430,7 @@ class WeChatAdapter:
|
|||||||
elif base_type == 43:
|
elif base_type == 43:
|
||||||
final_content = _extract_video_meta(xml_content)
|
final_content = _extract_video_meta(xml_content)
|
||||||
elif base_type == 34:
|
elif base_type == 34:
|
||||||
final_content = "[语音]"
|
final_content = _extract_voice_meta(xml_content)
|
||||||
elif base_type == 3:
|
elif base_type == 3:
|
||||||
img_meta = _extract_image_meta(xml_content)
|
img_meta = _extract_image_meta(xml_content)
|
||||||
if img_meta:
|
if img_meta:
|
||||||
@ -534,7 +572,13 @@ class WeChatAdapter:
|
|||||||
return fp
|
return fp
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
# 语音:只返回已解密的可读文件
|
# 语音:按文件大小匹配已解密的可读文件
|
||||||
|
voice_meta = _extract_voice_meta(content)
|
||||||
|
target_size = None
|
||||||
|
m = re.search(r'size:(\d+)', voice_meta)
|
||||||
|
if m:
|
||||||
|
target_size = int(m.group(1))
|
||||||
|
|
||||||
msg_dir = os.path.join(wechat_base, "msg")
|
msg_dir = os.path.join(wechat_base, "msg")
|
||||||
attach_dir = os.path.join(msg_dir, "attach")
|
attach_dir = os.path.join(msg_dir, "attach")
|
||||||
readable_exts = (".mp3", ".wav", ".amr", ".silk", ".m4a", ".ogg")
|
readable_exts = (".mp3", ".wav", ".amr", ".silk", ".m4a", ".ogg")
|
||||||
@ -551,6 +595,7 @@ class WeChatAdapter:
|
|||||||
if os.path.isdir(os.path.join(attach_dir, d))
|
if os.path.isdir(os.path.join(attach_dir, d))
|
||||||
]
|
]
|
||||||
|
|
||||||
|
size_match = None
|
||||||
for h in search_hashes:
|
for h in search_hashes:
|
||||||
sub = os.path.join(attach_dir, h, date_prefix, "Voice")
|
sub = os.path.join(attach_dir, h, date_prefix, "Voice")
|
||||||
if not os.path.isdir(sub):
|
if not os.path.isdir(sub):
|
||||||
@ -561,37 +606,66 @@ class WeChatAdapter:
|
|||||||
continue
|
continue
|
||||||
if f.endswith(".dat"):
|
if f.endswith(".dat"):
|
||||||
continue
|
continue
|
||||||
if f.lower().endswith(readable_exts):
|
if not f.lower().endswith(readable_exts):
|
||||||
|
continue
|
||||||
|
if target_size and os.path.getsize(fp) == target_size:
|
||||||
return fp
|
return fp
|
||||||
|
if size_match is None:
|
||||||
|
size_match = fp
|
||||||
|
|
||||||
return None
|
return size_match
|
||||||
|
|
||||||
def _resolve_video_path(self, content: str, create_time: int) -> str | None:
|
def _resolve_video_path(self, content: str, create_time: int) -> str | None:
|
||||||
"""在 msg/video/YYYY-MM/ 中按 rawmd5 查找 mp4"""
|
"""在 msg/video/YYYY-MM/ 和 temp/RWTemp/YYYY-MM/ 中按 rawmd5 或 size 查找视频"""
|
||||||
wechat_base = os.path.dirname(self._app.db_dir)
|
wechat_base = os.path.dirname(self._app.db_dir)
|
||||||
video_dir = os.path.join(wechat_base, "msg", "video")
|
|
||||||
if not os.path.isdir(video_dir):
|
|
||||||
return None
|
|
||||||
|
|
||||||
rawmd5 = None
|
rawmd5 = None
|
||||||
md5_m = re.search(r'rawmd5="([a-f0-9]+)"', content or "")
|
md5_m = re.search(r'rawmd5="([a-f0-9]+)"', content or "")
|
||||||
if md5_m:
|
if md5_m:
|
||||||
rawmd5 = md5_m.group(1)
|
rawmd5 = md5_m.group(1)
|
||||||
if not rawmd5:
|
|
||||||
|
rawlength = None
|
||||||
|
len_m = re.search(r'rawlength="(\d+)"', content or "")
|
||||||
|
if len_m:
|
||||||
|
rawlength = int(len_m.group(1))
|
||||||
|
|
||||||
|
if not rawmd5 and not rawlength:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
dt = datetime.fromtimestamp(create_time)
|
dt = datetime.fromtimestamp(create_time)
|
||||||
date_prefix = dt.strftime("%Y-%m")
|
date_prefix = dt.strftime("%Y-%m")
|
||||||
|
|
||||||
month_dir = os.path.join(video_dir, date_prefix)
|
video_exts = (".mp4", ".mov", ".avi")
|
||||||
if not os.path.isdir(month_dir):
|
search_dirs = []
|
||||||
|
video_dir = os.path.join(wechat_base, "msg", "video", date_prefix)
|
||||||
|
if os.path.isdir(video_dir):
|
||||||
|
search_dirs.append(video_dir)
|
||||||
|
rwtemp_dir = os.path.join(wechat_base, "temp", "RWTemp", date_prefix)
|
||||||
|
if os.path.isdir(rwtemp_dir):
|
||||||
|
search_dirs.append(rwtemp_dir)
|
||||||
|
|
||||||
|
if not search_dirs:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
for f in os.listdir(month_dir):
|
for d in search_dirs:
|
||||||
if rawmd5 not in f:
|
if rawmd5:
|
||||||
continue
|
for f in os.listdir(d):
|
||||||
fp = os.path.join(month_dir, f)
|
ext = os.path.splitext(f)[1].lower()
|
||||||
if f.endswith(".mp4") and os.path.isfile(fp):
|
if ext not in video_exts:
|
||||||
return fp
|
continue
|
||||||
|
if rawmd5 in f:
|
||||||
|
fp = os.path.join(d, f)
|
||||||
|
if os.path.isfile(fp):
|
||||||
|
return fp
|
||||||
|
|
||||||
|
for d in search_dirs:
|
||||||
|
if rawlength:
|
||||||
|
for f in os.listdir(d):
|
||||||
|
ext = os.path.splitext(f)[1].lower()
|
||||||
|
if ext not in video_exts:
|
||||||
|
continue
|
||||||
|
fp = os.path.join(d, f)
|
||||||
|
if os.path.isfile(fp) and os.path.getsize(fp) == rawlength:
|
||||||
|
return fp
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|||||||
@ -26,8 +26,10 @@ v0.2.4
|
|||||||
- [x] 增量消息采集(基于 create_time 水位线)
|
- [x] 增量消息采集(基于 create_time 水位线)
|
||||||
- [x] MySQL 持久化(wechat_group_message 表)
|
- [x] MySQL 持久化(wechat_group_message 表)
|
||||||
- [x] 媒体文件上传腾讯 COS(只采集原始可读文件,不做 .dat 解密)
|
- [x] 媒体文件上传腾讯 COS(只采集原始可读文件,不做 .dat 解密)
|
||||||
- 图片: temp/RWTemp/YYYY-MM/{md5}.ext(查看后自动生成)
|
- 图片: temp/RWTemp/YYYY-MM/{md5}.ext(查看后自动生成),通过 size/md5 精确匹配
|
||||||
- 文件/视频/音频: msg/file/YYYY-MM/filename
|
- 视频: msg/video/YYYY-MM/{rawmd5}.mp4(文件方式发送)或 temp/RWTemp/YYYY-MM/{rawmd5}.mp4(直接发送后预览缓存),通过 rawmd5/size 精确匹配
|
||||||
|
- 语音: msg/attach/{hash}/YYYY-MM/Voice/(自动生成),通过 size 精确匹配
|
||||||
|
- 文件: msg/file/YYYY-MM/filename,通过文件名+精确size双重校验
|
||||||
- [x] 自适应扫描频率(hot/warm/cold 三级,优先级队列调度)
|
- [x] 自适应扫描频率(hot/warm/cold 三级,优先级队列调度)
|
||||||
- [x] 白名单/黑名单过滤(支持正则)
|
- [x] 白名单/黑名单过滤(支持正则)
|
||||||
- [x] 守护模式 + 一次性扫描模式
|
- [x] 守护模式 + 一次性扫描模式
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user