diff --git a/.vala_skill_hashes b/.vala_skill_hashes index cdf422c..9d9a6f7 100644 --- a/.vala_skill_hashes +++ b/.vala_skill_hashes @@ -9,9 +9,9 @@ lark_wiki_operate_as_bot f84c308bcb69280520dadf9458177d9c4af192d60cf409528bd65e1 pua f6a38fdd39c22c81370abd6b979b58c767e41738d43a26fbbc23d1e933cdc701 smart-auto-model-switch bfb3547dcd6029622c7062b49ae7922614a366b6dfe88c7d0fae9dcd85fc2eb3 vala_git_workspace_backup.vala 4cf352bec88fe84af065ba1ffcbb06647b77df0e01860faaf0bca9fd64b968ec -tencent-cos-upload 172517ed41d06c48425cd961ec5972a48495cfd62ec588bc1c2912ddf31b3a06 user-feedback-collector c0320451bf7ea0ce3d8ceaa603ae0a7b55c373c048363a5142258a4c23f45e81 user-feedback-data-source a95eb9142f3019fd193c46f89147dc7e0bf01dfe250202565a86f8bc52f37b13 user-feedback-processor 61783a8e9f03a973c187b359a87749ad1993dc71f8364b0a853d8b3ff64c75e8 -feishu-group-msg-sync 1b581de76d419e6a33db0836125efc16ef2c972013fcae6f08c03aa7e2276445 feishu-feedback-sync 9c9ca1f0c42a289e037289cd394299b7debf7e240d3b30429899da42b601d953 +feishu-group-msg-sync 953534cc3d1cf4489060afe1cf10cad75faedd3f2d699ffa7478d178d528dc51 +tencent-cos-upload 2505dbc8c7acdaf95f2228598ae12010e09599a25df4319587c4a3109d828053 diff --git a/MEMORY.md b/MEMORY.md index 960a480..4393a94 100644 --- a/MEMORY.md +++ b/MEMORY.md @@ -49,6 +49,16 @@ **不要在 MEMORY.md 中维护静态分类映射表,所有规则变更直接修改 priority_classifier.py。** +### Python 脚本修改后需清理 __pycache__(2026-05-27) +- 修改 Python 脚本(尤其是新增/删除 import)后,旧 `.pyc` 缓存可能导致 `NameError`(模块名未定义) +- 症状:源码中有 `import subprocess`,运行时却报 `NameError: name 'subprocess' is not defined` +- 修复:`find -name "__pycache__" -type d | xargs rm -rf && find -name "*.pyc" -delete` +### P0 实时检测去重:内容语义指纹替代消息ID精确匹配(2026-05-27) +- 原方案用 `sorted(message_ids)` MD5 做去重,但同一话题每次扫描聚类结果不同,签名失效导致重复推送 +- 修复:增加内容语义去重层 — 拼接簇内前5条消息内容 + 发送人集合 + 小时窗口,用 Jaccard 相似度比较 +- 阈值:同小时 + 发送人交集 + 相似度 > 0.20;跨小时 + 发送人 ≥2 重叠 + 相似度 > 0.35 +- 影响文件:`detect_p0_wechat.py`、`detect_p0_realtime.py` + ## 经验教训 ### 微信反馈全链路(2026-05-22 刘新玉确认) diff --git 
a/data/last_wechat_sync_id b/data/last_wechat_sync_id index f4236d2..adb88b7 100644 --- a/data/last_wechat_sync_id +++ b/data/last_wechat_sync_id @@ -1 +1 @@ -1674 +1772 diff --git a/memory/.dreams/events.jsonl b/memory/.dreams/events.jsonl index 38e49ae..54593b0 100644 --- a/memory/.dreams/events.jsonl +++ b/memory/.dreams/events.jsonl @@ -21,3 +21,5 @@ {"type":"memory.recall.recorded","timestamp":"2026-05-25T07:11:25.527Z","query":"微信反馈 全链路 cron 采集 整理 归纳 分发","resultCount":3,"results":[{"path":"memory/2026-05-25.md","startLine":21,"endLine":42,"score":1},{"path":"memory/2026-05-25.md","startLine":38,"endLine":44,"score":1},{"path":"memory/2026-05-22.md","startLine":129,"endLine":158,"score":1}]} {"type":"memory.recall.recorded","timestamp":"2026-05-26T03:40:13.634Z","query":"微信飞书问题汇总 小葵小葵群 发送汇总","resultCount":3,"results":[{"path":"memory/2026-05-25.md","startLine":21,"endLine":42,"score":1},{"path":"memory/2026-05-25.md","startLine":38,"endLine":44,"score":1},{"path":"memory/2026-05-22.md","startLine":129,"endLine":158,"score":1}]} {"type":"memory.recall.recorded","timestamp":"2026-05-26T03:40:24.708Z","query":"小葵小葵群 chat_id 分发消息群","resultCount":1,"results":[{"path":"memory/2026-04-18.md","startLine":1,"endLine":5,"score":1}]} +{"type":"memory.recall.recorded","timestamp":"2026-05-27T02:15:55.251Z","query":"小葵小葵群 chat_id 反馈同步","resultCount":1,"results":[{"path":"memory/2026-04-18.md","startLine":1,"endLine":5,"score":1}]} +{"type":"memory.recall.recorded","timestamp":"2026-05-27T02:50:23.510Z","query":"pycache python 缓存 导入错误","resultCount":3,"results":[{"path":"memory/2026-05-27.md","startLine":22,"endLine":32,"score":1},{"path":"memory/2026-05-27.md","startLine":1,"endLine":27,"score":1},{"path":"memory/2026-04-10.md","startLine":20,"endLine":52,"score":1}]} diff --git a/memory/.dreams/short-term-recall.json b/memory/.dreams/short-term-recall.json index 94a4cb6..fc0510f 100644 --- a/memory/.dreams/short-term-recall.json +++ b/memory/.dreams/short-term-recall.json 
@@ -1,6 +1,6 @@ { "version": 1, - "updatedAt": "2026-05-26T03:40:24.708Z", + "updatedAt": "2026-05-27T02:50:23.510Z", "entries": { "memory:memory/2026-04-18.md:1:5": { "key": "memory:memory/2026-04-18.md:1:5", @@ -9,13 +9,13 @@ "endLine": 5, "source": "memory", "snippet": "# 2026-04-18 工作日志 ## 术语共识 [李若松确认] 术语「飞书反馈消息数据库」默认指代用户反馈收集技能中的飞书内部测试反馈MySQL数据表 `vala_test.lark_group_message`,存储「内容测试问题反馈」群(oc_fabff7672e62a9ced7b326ee4a286c26)的同步消息数据。", - "recallCount": 7, + "recallCount": 8, "dailyCount": 0, "groundedCount": 0, - "totalScore": 7, + "totalScore": 8, "maxScore": 1, "firstRecalledAt": "2026-04-30T03:47:21.989Z", - "lastRecalledAt": "2026-05-26T03:40:24.708Z", + "lastRecalledAt": "2026-05-27T02:15:55.251Z", "queryHashes": [ "353f9765c086", "a6b740c99377", @@ -23,14 +23,16 @@ "f865295b9ac7", "42fe8210f22c", "81f7a2647922", - "261597c52d5b" + "261597c52d5b", + "3fe44d618bf6" ], "recallDays": [ "2026-04-30", "2026-05-06", "2026-05-07", "2026-05-25", - "2026-05-26" + "2026-05-26", + "2026-05-27" ], "conceptTags": [ "vala-test.lark-group-message", @@ -980,6 +982,99 @@ "姓氏", "中文" ] + }, + "memory:memory/2026-05-27.md:22:32": { + "key": "memory:memory/2026-05-27.md:22:32", + "path": "memory/2026-05-27.md", + "startLine": 22, + "endLine": 32, + "source": "memory", + "snippet": "| 微信 | `--skip-dispatch` → 不分发 | `--apply-ai` → 替换占位符+分发 | 微信依赖 AI 归纳成功才能分发,飞书双重分发(占位符+AI)。 ### 5月26日反馈概况 - 飞书:17条消息,3个问题簇(2个有效:录音识别率低、飞船音乐不保存) - 微信:27条消息,14个问题簇(4个有效:飞船音乐、录音识别、音频无法播放、AI回复无关语句) ### 后续注意 - 删除/修改 `ai_summarize_feedback.py` 后需清理 `__pycache__`,否则缓存版本可能落后于源码", + "recallCount": 1, + "dailyCount": 0, + "groundedCount": 0, + "totalScore": 1, + "maxScore": 1, + "firstRecalledAt": "2026-05-27T02:50:23.510Z", + "lastRecalledAt": "2026-05-27T02:50:23.510Z", + "queryHashes": [ + "b15d92b2dda8" + ], + "recallDays": [ + "2026-05-27" + ], + "conceptTags": [ + "skip-dispatch", + "apply-ai", + "删除/修改", + "ai-summarize-feedback.py", + "skip", + "dispatch", + "不分", + "apply" + ] + }, + 
"memory:memory/2026-05-27.md:1:27": { + "key": "memory:memory/2026-05-27.md:1:27", + "path": "memory/2026-05-27.md", + "startLine": 1, + "endLine": 27, + "source": "memory", + "snippet": "# 2026-05-27 工作日志 ## 用户反馈同步故障排查与修复 [刘新玉反馈] ### 问题 5月26日飞书/微信用户反馈定时任务出现回写失败。 ### 根因 1. **Python 缓存过期** — `ai_summarize_feedback.py` 中 `subprocess` 模块导入失败(`NameError: name 'subprocess' is not defined`),实际源码有 `import subprocess`,但 `__pycache__` 中的旧 `.pyc` 未包含此导入 2. **sync_wechat_feedback.py** — 同样因缓存问题导致 `with open(tmp_md)` 写入失败 ### 修复 - 清理了工作区内所有 `__pycache__` 和 `.pyc` 文件 - 重新执行飞书 AI 归纳回写:`sync_feishu_feedback.py --date 2026-05-26 --apply-ai` - 重新执行微信 AI 归纳回写:`sync_wechat_feedback.py --date 2026-05-26 --apply-ai` - 两个渠道均成功回写并分发到「小葵小葵」群 ### 分发架构确认 | 渠道 | 10:00/10:02 Wrapper | 10:05/10:07 AI 归纳 | |------|---------------------|---------------------| | 飞书 | `--dispatch-mode all` → 分发占位符 | `--", + "recallCount": 1, + "dailyCount": 0, + "groundedCount": 0, + "totalScore": 1, + "maxScore": 1, + "firstRecalledAt": "2026-05-27T02:50:23.510Z", + "lastRecalledAt": "2026-05-27T02:50:23.510Z", + "queryHashes": [ + "b15d92b2dda8" + ], + "recallDays": [ + "2026-05-27" + ], + "conceptTags": [ + "5月26日飞书/微信用户反馈定时任务出现回写失败", + "ai-summarize-feedback.py", + "sync-wechat-feedback.py", + "tmp-md", + "sync-feishu-feedback.py", + "apply-ai", + "00/10", + "05/10" + ] + }, + "memory:memory/2026-04-10.md:20:52": { + "key": "memory:memory/2026-04-10.md:20:52", + "path": "memory/2026-04-10.md", + "startLine": 20, + "endLine": 52, + "source": "memory", + "snippet": "- 图片先下载到工作目录(相对路径),再用 `docs +media-insert` 插入文档 - 去掉 `set -e`,改为手动错误处理避免单条消息失败导致整个脚本退出 - 时间用 ISO 8601 格式存储和传递 ### 验证结果(第一版 → Wiki文档) - 全量同步成功:49 条消息 + 5 张图片写入 Wiki 文档 - Wiki文档:DfUqddItXoDsnNxPypncbinknxh ## 迭代:改为电子表格 + 腾讯COS **来源:** [李若松] 要求改用表格存储,媒体文件上传COS ### 方案 - 脚本改为 Python:`scripts/sync_feedback_group.py` - 记录写入飞书电子表格:`E8vFsCmPBhT4SCtNmnJchqeJnJe`,sheet_id `7bce8f` - 列:时间 | 反馈人 | 信息类型 | 信息内容(或地址) - 非文本消息(图片/视频/音频/文件)下载后上传到腾讯COS - 
COS桶:`static-1317843270`,区域:`ap-beijing` - COS路径结构:`vala_llm/user_feedback/{type}/{date}/{filename}` - type: image / video / audio / file - date: YYYY-MM-DD - 访问域名:`https://static.valavala.com/vala_llm/user_feedback/...` - COS凭证已存入 `secrets.md` ### 验证结", + "recallCount": 1, + "dailyCount": 0, + "groundedCount": 0, + "totalScore": 1, + "maxScore": 1, + "firstRecalledAt": "2026-05-27T02:50:23.510Z", + "lastRecalledAt": "2026-05-27T02:50:23.510Z", + "queryHashes": [ + "b15d92b2dda8" + ], + "recallDays": [ + "2026-05-27" + ], + "conceptTags": [ + "media-insert", + "scripts/sync-feedback-group.py", + "sheet-id", + "图片/视频/音频/文件", + "static-1317843270", + "ap-beijing", + "vala-llm/user-feedback", + "yyyy-mm-dd" + ] } } } diff --git a/memory/2026-05-27.md b/memory/2026-05-27.md new file mode 100644 index 0000000..6c1f2f0 --- /dev/null +++ b/memory/2026-05-27.md @@ -0,0 +1,50 @@ +# 2026-05-27 工作日志 + +## 用户反馈同步故障排查与修复 [刘新玉反馈] + +### 问题 +5月26日飞书/微信用户反馈定时任务出现回写失败。 + +### 根因 +1. **Python 缓存过期** — `ai_summarize_feedback.py` 中 `subprocess` 模块导入失败(`NameError: name 'subprocess' is not defined`),实际源码有 `import subprocess`,但 `__pycache__` 中的旧 `.pyc` 未包含此导入 +2. 
**sync_wechat_feedback.py** — 同样因缓存问题导致 `with open(tmp_md)` 写入失败 + +### 修复 +- 清理了工作区内所有 `__pycache__` 和 `.pyc` 文件 +- 重新执行飞书 AI 归纳回写:`sync_feishu_feedback.py --date 2026-05-26 --apply-ai` +- 重新执行微信 AI 归纳回写:`sync_wechat_feedback.py --date 2026-05-26 --apply-ai` +- 两个渠道均成功回写并分发到「小葵小葵」群 + +### 分发架构确认 +| 渠道 | 10:00/10:02 Wrapper | 10:05/10:07 AI 归纳 | +|------|---------------------|---------------------| +| 飞书 | `--dispatch-mode all` → 分发占位符 | `--apply-ai` → 替换占位符+重新分发 | +| 微信 | `--skip-dispatch` → 不分发 | `--apply-ai` → 替换占位符+分发 | + +微信依赖 AI 归纳成功才能分发,飞书双重分发(占位符+AI)。 + +### 5月26日反馈概况 +- 飞书:17条消息,3个问题簇(2个有效:录音识别率低、飞船音乐不保存) +- 微信:27条消息,14个问题簇(4个有效:飞船音乐、录音识别、音频无法播放、AI回复无关语句) + +### 后续注意 +- 删除/修改 `ai_summarize_feedback.py` 后需清理 `__pycache__`,否则缓存版本可能落后于源码 + +## P0 实时检测去重修复 [刘新玉反馈] + +### 问题 +微信 `detect_p0_wechat.py` 每分钟扫描最近120分钟消息,同一个问题因讨论线程持续生长,`sort_threads` 聚类每次产生不同消息集合,导致: +- 不同次的聚类有不同的 `cluster_signature`(基于 `sorted(message_ids)` MD5) +- 去重完全失效,同一问题被重复推送(今天2个真实问题各推了3次 = 6次) + +### 修复 +在 `detect_p0_wechat.py` 和 `detect_p0_realtime.py` 中增加**内容语义去重**: +1. 新增 `cluster_content_fingerprint()`:拼接簇内前5条有意义消息作为内容指纹 + 发送人集合 + 小时粒度时间窗口 +2. 新增 `is_duplicate_p0()`:基于内容相似度(Jaccard)+ 发送人重叠 + 时间窗口三层判断 + - 同小时 + 发送人交集 + 内容相似度 > 0.20 → 重复 + - 发送人高度重叠(≥2) + 内容相似度 > 0.35 → 跨小时重复 +3. 状态文件改为 `{"time": ..., "fp": {...}}` 格式存储指纹信息 +4. 飞书 P0 检测器同步修复 + +### 测试验证 +360分钟窗口测试:同一话题在不同扫描窗口下签名不同(c69d... 
vs 70a4...),但内容指纹正确识别为重复(相似度 0.462,is_duplicate=True) diff --git a/output/daily_feedback/ai_descriptions_feishu_2026-05-26.json b/output/daily_feedback/ai_descriptions_feishu_2026-05-26.json new file mode 100644 index 0000000..20ca5ca --- /dev/null +++ b/output/daily_feedback/ai_descriptions_feishu_2026-05-26.json @@ -0,0 +1,17 @@ +{ + "date": "2026-05-26", + "descriptions": [ + { + "index": 1, + "description": "在应用录音环节,背景音乐音量过大导致用户人声微弱,造成语音识别率低,且麦克风界面动画卡顿。" + }, + { + "index": 2, + "description": "在飞船系统中,用户更换音乐后重新进入时音乐自动恢复为默认,无法保存切换设置。" + }, + { + "index": 3, + "description": "无明确问题" + } + ] +} \ No newline at end of file diff --git a/output/daily_feedback/ai_descriptions_wechat_2026-05-26.json b/output/daily_feedback/ai_descriptions_wechat_2026-05-26.json new file mode 100644 index 0000000..f6253fa --- /dev/null +++ b/output/daily_feedback/ai_descriptions_wechat_2026-05-26.json @@ -0,0 +1,61 @@ +{ + "date": "2026-05-26", + "descriptions": [ + { + "index": 1, + "description": "无明确问题" + }, + { + "index": 2, + "description": "飞船音乐切换后,重新进出时音乐会恢复为默认歌曲,无法保存切换设置。" + }, + { + "index": 3, + "description": "无明确问题" + }, + { + "index": 4, + "description": "在移动端瓦拉英语应用中,录音识别功能存在识别率低且录音时麦克风动画卡顿的问题。" + }, + { + "index": 5, + "description": "无明确问题" + }, + { + "index": 6, + "description": "无明确问题" + }, + { + "index": 7, + "description": "无明确问题" + }, + { + "index": 8, + "description": "无明确问题" + }, + { + "index": 9, + "description": "无明确问题" + }, + { + "index": 10, + "description": "英语音频无法播放声音" + }, + { + "index": 11, + "description": "无明确问题" + }, + { + "index": 12, + "description": "无明确问题" + }, + { + "index": 13, + "description": "在强化练习环节中,AI语音回复出现了与当前练习内容无关的语句。" + }, + { + "index": 14, + "description": "无明确问题" + } + ] +} \ No newline at end of file diff --git a/output/daily_feedback/cluster_context_2026-05-26.json b/output/daily_feedback/cluster_context_2026-05-26.json new file mode 100644 index 0000000..438869a --- /dev/null +++ 
b/output/daily_feedback/cluster_context_2026-05-26.json @@ -0,0 +1,166 @@ +{ + "date": "2026-05-26", + "total_clusters": 3, + "clusters": [ + { + "index": 1, + "cluster_id": "4702315644940596587", + "location": { + "端": "未知", + "环节": "未知", + "课程": "", + "角色/组件": "" + }, + "priority": "P2", + "priority_detail": "", + "category": "其他问题", + "conclusion": "**当前问题排查结论:** 暂无结论排查中", + "messages": [ + { + "sender": "瓦拉英语-萌萌老师(早10晚7)", + "content": "麻烦帮看下吧,我的飞船-音乐切换了歌曲,再重新进出的话音乐又变成了默认的,切换歌曲不能保存吗?", + "msg_type": "text", + "media_url": "", + "time": "2026-05-26 09:28:15" + }, + { + "sender": "嘿哈", + "content": "这个目前设计就是如此,不会保存\n ↳ 回复 瓦拉英语-萌萌老师(早10晚7): 麻烦帮看下吧,我的飞船-音乐切换了歌曲,再重新进出的话音乐又变成了默认的,切换歌曲不能保存吗?", + "msg_type": "link", + "media_url": "", + "time": "2026-05-26 10:58:11" + } + ] + }, + { + "index": 2, + "cluster_id": "6332894499636566314", + "location": { + "端": "移动端", + "环节": "未知", + "课程": "", + "角色/组件": "音频" + }, + "priority": "P2", + "priority_detail": "", + "category": "声音/音频类", + "conclusion": "**当前问题排查结论:** 暂无结论排查中", + "messages": [ + { + "sender": "胡陈辰🦉", + "content": "@许悦 我们可以加个需求 ", + "msg_type": "text", + "media_url": "", + "time": "2026-05-26 12:57:57" + }, + { + "sender": "瓦拉英语-Tom老师", + "content": "老师,这种录音识别率比较低,是正常的么?辛苦帮忙看看", + "msg_type": "text", + "media_url": "", + "time": "2026-05-26 13:07:45" + }, + { + "sender": "嘿哈", + "content": "@瓦拉英语-Tom老师 这个可以咨询一下用户,是有连接音响什么的不?\n ↳ 回复 瓦拉英语-Tom老师: 老师,这种录音识别率比较低,是正常的么?辛苦帮忙看看", + "msg_type": "link", + "media_url": "", + "time": "2026-05-26 14:42:09" + }, + { + "sender": "瓦拉英语-Tom老师", + "content": "@八哥-16619720408好的\n ↳ 回复 嘿哈: @瓦拉英语-Tom老师 这个可以咨询一下用户,是有连接音响什么的不?", + "msg_type": "link", + "media_url": "", + "time": "2026-05-26 14:44:20" + }, + { + "sender": "嘿哈", + "content": "@瓦拉英语-Tom老师 
这个可能需要分两步走,我获取了一下用户的音频信息确实用户的声音很小,先让用户尝试通过设置降低一点音乐音量,然后我们这边也和产品老师说一下再优化一下,录音的时候没有其他的声音https://static.valavala.com/vala_user_audio/c6125977134c_134242247464915130.wav\nhttps://static.valavala.com/vala_user_audio/71bec4fc33a2_134242247584791300.wav\nhttps://static.valavala.com/vala_user_audio/b4c7d8eff63c_134242247700853530.wav", + "msg_type": "text", + "media_url": "", + "time": "2026-05-26 14:50:07" + }, + { + "sender": "瓦拉英语-Tom老师", + "content": "[聊天记录] 雷鸣和瓦拉英语-Tom老师\n雷鸣: [视频]\n雷鸣: [视频]\n雷鸣: [图片]", + "msg_type": "link", + "media_url": "", + "time": "2026-05-26 13:07:44" + }, + { + "sender": "嘿哈", + "content": "发一下用户手机号吧", + "msg_type": "text", + "media_url": "", + "time": "2026-05-26 13:08:10" + }, + { + "sender": "瓦拉英语-Tom老师", + "content": "手机号:13617153553", + "msg_type": "text", + "media_url": "", + "time": "2026-05-26 13:08:53" + }, + { + "sender": "瓦拉英语-Tom老师", + "content": "[视频] 17秒 size:2553865", + "msg_type": "video", + "media_url": "", + "time": "2026-05-26 13:47:26" + }, + { + "sender": "Ariel", + "content": "@kevin 而且它这个麦克风的动画看起来超级卡,一秒动一帧的感觉\n ↳ 回复 瓦拉英语-Tom老师: 25984984606212559@openim:\n\n\n\t cutoff} - return state + cleaned = {} + for k, v in state.items(): + ts = v if isinstance(v, str) else v.get("time", "") + if ts > cutoff: + cleaned[k] = v + return cleaned def save_dispatched_state(state): @@ -72,6 +74,47 @@ def cluster_signature(cluster_msgs): return hashlib.md5(joined.encode()).hexdigest() +def cluster_content_fingerprint(cluster_msgs): + """生成基于内容语义的簇指纹,用于跨扫描去重(不依赖消息ID集合)""" + all_contents = [] + for m in cluster_msgs: + c = str(m[3]).strip() if m[3] else "" + if c and len(c) > 8: + all_contents.append(c[:300]) + aggregated = " | ".join(all_contents[:5]) + senders = sorted(set(m[1] for m in cluster_msgs if m[1])) + times = [m[6] for m in cluster_msgs if m[6]] + hour = times[0][:13] if times else "unknown" + return { + "content": aggregated, + "senders": senders, + "hour": hour, + "msg_count": len(cluster_msgs), + } + + +def 
is_duplicate_p0(new_fp, dispatched_entries): + """ + 基于内容语义判断新 P0 是否与已推送 P0 重复。 + dispatched_entries: {sig: {"time": str, "fp": dict}} + """ + for entry in dispatched_entries.values(): + old_fp = entry.get("fp") + if not old_fp: + continue + same_hour = new_fp["hour"] == old_fp["hour"] + sender_overlap = len(set(new_fp["senders"]) & set(old_fp["senders"])) + if same_hour and sender_overlap >= 1: + sim = content_similarity(new_fp["content"], old_fp["content"]) + if sim > 0.20: + return True + if sender_overlap >= 2: + sim = content_similarity(new_fp["content"], old_fp["content"]) + if sim > 0.35: + return True + return False + + def is_probably_p0(cluster_msgs): """ 快速判断一个簇是否是 P0 级别问题。 @@ -229,7 +272,7 @@ def main(): # 加载已推送状态 state = load_dispatched_state() - print(f"[P0-detect] 已记录 {len(state)} 个已推送簇签名") + print(f"[P0-detect] 已记录 {len(state)} 个已推送簇") # 遍历簇,找出 P0 且未推送的 new_p0_count = 0 @@ -241,7 +284,13 @@ def main(): sig = cluster_signature(cmsgs) if sig in state: - print(f"[P0-detect] 已推送过,跳过: sig={sig[:8]}...") + print(f"[P0-detect] 已推送过(精确匹配),跳过: sig={sig[:8]}...") + continue + + # 内容语义去重 + fp = cluster_content_fingerprint(cmsgs) + if is_duplicate_p0(fp, state): + print(f"[P0-detect] 已推送过(内容匹配),跳过: senders={fp['senders'][:2]}... hour={fp['hour']}") continue print(f"[P0-detect] 🚨 发现新 P0! sig={sig[:8]}... 
{len(cmsgs)}条消息") @@ -249,14 +298,14 @@ def main(): if args.dry_run: alert = generate_p0_alert_text(cmsgs, info) print(f"[DRY-RUN] 将发送:\n{alert}") - state[sig] = datetime.now().isoformat() + state[sig] = {"time": datetime.now().isoformat(), "fp": fp} new_p0_count += 1 else: alert = generate_p0_alert_text(cmsgs, info) success = dispatch_p0_alert(alert) if success: print(f"[P0-detect] ✅ P0 已实时推送") - state[sig] = datetime.now().isoformat() + state[sig] = {"time": datetime.now().isoformat(), "fp": fp} new_p0_count += 1 else: print(f"[P0-detect] ❌ 推送失败") diff --git a/scripts/detect_p0_wechat.py b/scripts/detect_p0_wechat.py index b4c531e..46c2499 100755 --- a/scripts/detect_p0_wechat.py +++ b/scripts/detect_p0_wechat.py @@ -25,7 +25,7 @@ SKILL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "skil sys.path.insert(0, SKILL_DIR) from sync_feishu_feedback import ( - sort_threads, get_tenant_token, + sort_threads, get_tenant_token, content_similarity, DISPATCH_CHAT_ID, DISPATCH_CRED_DIR, P0_NOTIFY_USERS, MYSQL_HOST, MYSQL_PORT, MYSQL_USER, MYSQL_PASS, MYSQL_DB, ) @@ -44,7 +44,13 @@ def load_dispatched_state(): except (FileNotFoundError, json.JSONDecodeError): state = {} cutoff = (datetime.now() - timedelta(hours=24)).isoformat() - return {k: v for k, v in state.items() if v > cutoff} + # 兼容新旧格式:新格式 value 是 {"time": ..., "fp": ...},旧格式是纯时间字符串 + cleaned = {} + for k, v in state.items(): + ts = v if isinstance(v, str) else v.get("time", "") + if ts > cutoff: + cleaned[k] = v + return cleaned def save_dispatched_state(state): @@ -60,6 +66,53 @@ def cluster_signature(cluster_msgs): return hashlib.md5(",".join(ids).encode()).hexdigest() +def cluster_content_fingerprint(cluster_msgs): + """生成基于内容语义的簇指纹,用于跨扫描去重(不依赖消息ID集合)""" + # 拼接簇内所有有意义的消息内容(跳过纯图片/文件/表情) + all_contents = [] + for m in cluster_msgs: + c = str(m[3]).strip() if m[3] else "" + if c and len(c) > 8: + all_contents.append(c[:300]) + # 取前5条聚合,保证核心问题描述稳定 + aggregated = " | ".join(all_contents[:5]) + # 
提取发送人集合(排序保证一致性) + senders = sorted(set(m[1] for m in cluster_msgs if m[1])) + # 提取小时粒度的时间窗口 + times = [m[6] for m in cluster_msgs if m[6]] + hour = times[0][:13] if times else "unknown" + return { + "content": aggregated, + "senders": senders, + "hour": hour, + "msg_count": len(cluster_msgs), + } + + +def is_duplicate_p0(new_fp, dispatched_entries): + """ + 基于内容语义判断新 P0 是否与已推送 P0 重复。 + dispatched_entries: {sig: {"time": str, "fp": dict}} + """ + for entry in dispatched_entries.values(): + old_fp = entry.get("fp") + if not old_fp: + continue + same_hour = new_fp["hour"] == old_fp["hour"] + sender_overlap = len(set(new_fp["senders"]) & set(old_fp["senders"])) + # 条件1: 同一小时 + 发送人有交集 + 内容相似度 > 0.20(聚合内容稳定,宽松阈值足够区分) + if same_hour and sender_overlap >= 1: + sim = content_similarity(new_fp["content"], old_fp["content"]) + if sim > 0.20: + return True + # 条件2: 发送人高度重叠 + 内容相似度 > 0.35(跨小时场景) + if sender_overlap >= 2: + sim = content_similarity(new_fp["content"], old_fp["content"]) + if sim > 0.35: + return True + return False + + def is_probably_p0(cluster_msgs): if len(cluster_msgs) < CLUSTER_MIN_SIZE: return False, None @@ -200,7 +253,7 @@ def main(): print(f"[P0-wechat] 聚类完成:{len(clusters)} 个簇") state = load_dispatched_state() - print(f"[P0-wechat] 已记录 {len(state)} 个已推送簇签名") + print(f"[P0-wechat] 已记录 {len(state)} 个已推送簇") new_p0_count = 0 for cid in cluster_order: @@ -211,7 +264,13 @@ def main(): sig = cluster_signature(cmsgs) if sig in state: - print(f"[P0-wechat] 已推送过,跳过: sig={sig[:8]}...") + print(f"[P0-wechat] 已推送过(精确匹配),跳过: sig={sig[:8]}...") + continue + + # 内容语义去重 + fp = cluster_content_fingerprint(cmsgs) + if is_duplicate_p0(fp, state): + print(f"[P0-wechat] 已推送过(内容匹配),跳过: senders={fp['senders'][:2]}... hour={fp['hour']}") continue print(f"[P0-wechat] 🚨 发现新 P0! sig={sig[:8]}... 
{len(cmsgs)}条消息") @@ -219,13 +278,13 @@ def main(): if args.dry_run: alert = generate_p0_alert_text(cmsgs, info) print(f"[DRY-RUN] 将发送:\n{alert}") - state[sig] = datetime.now().isoformat() + state[sig] = {"time": datetime.now().isoformat(), "fp": fp} new_p0_count += 1 else: alert = generate_p0_alert_text(cmsgs, info) if dispatch_p0_alert(alert): print(f"[P0-wechat] ✅ P0 已实时推送") - state[sig] = datetime.now().isoformat() + state[sig] = {"time": datetime.now().isoformat(), "fp": fp} new_p0_count += 1 else: print(f"[P0-wechat] ❌ 推送失败") diff --git a/skills/feishu-group-msg-sync/scripts/__pycache__/sync_group_to_mysql.cpython-312.pyc b/skills/feishu-group-msg-sync/scripts/__pycache__/sync_group_to_mysql.cpython-312.pyc deleted file mode 100644 index 7c495a4..0000000 Binary files a/skills/feishu-group-msg-sync/scripts/__pycache__/sync_group_to_mysql.cpython-312.pyc and /dev/null differ diff --git a/skills/feishu-group-msg-sync/scripts/__pycache__/sync_group_to_sheet.cpython-312.pyc b/skills/feishu-group-msg-sync/scripts/__pycache__/sync_group_to_sheet.cpython-312.pyc deleted file mode 100644 index 023ca38..0000000 Binary files a/skills/feishu-group-msg-sync/scripts/__pycache__/sync_group_to_sheet.cpython-312.pyc and /dev/null differ diff --git a/skills/tencent-cos-upload/scripts/__pycache__/cos_upload.cpython-312.pyc b/skills/tencent-cos-upload/scripts/__pycache__/cos_upload.cpython-312.pyc index 0eff279..9bae3b2 100644 Binary files a/skills/tencent-cos-upload/scripts/__pycache__/cos_upload.cpython-312.pyc and b/skills/tencent-cos-upload/scripts/__pycache__/cos_upload.cpython-312.pyc differ