auto backup: 2026-06-02 08:10:01

2026-06-02 08:10:01 +08:00 · 2026-06-02 08:10:01 +08:00 · 9365f60be9
commit 9365f60be9
parent 3da7fc67ac
11 changed files with 15317 additions and 34 deletions
--- a/business_knowledge/新知识库初版/全包词汇/三级单词表_已标注词性.xlsx
+++ b/business_knowledge/新知识库初版/全包词汇/三级单词表_已标注词性.xlsx
--- a/business_production/组件配置/interactive-component-json/db/components.db
+++ b/business_production/组件配置/interactive-component-json/db/components.db
--- a/business_production/组件配置/interactive-component-json/outputs/L1-S2-U18-L4
+++ b/business_production/组件配置/interactive-component-json/outputs/L1-S2-U18-L4
--- a/business_production/组件配置/interactive-component-json/outputs/L1-S2-U22-L1
+++ b/business_production/组件配置/interactive-component-json/outputs/L1-S2-U22-L1
--- a/business_production/组件配置/interactive-component-json/outputs/L1-S2-U22-L1
+++ b/business_production/组件配置/interactive-component-json/outputs/L1-S2-U22-L1
--- a/business_production/组件配置/interactive-component-json/scripts/pycache/feishu_client.cpython-312.pyc
+++ b/business_production/组件配置/interactive-component-json/scripts/pycache/feishu_client.cpython-312.pyc
--- a/business_production/组件配置/interactive-component-json/scripts/pycache/pipeline.cpython-312.pyc
+++ b/business_production/组件配置/interactive-component-json/scripts/pycache/pipeline.cpython-312.pyc
--- a/business_production/组件配置/interactive-component-json/scripts/feishu_client.py
+++ b/business_production/组件配置/interactive-component-json/scripts/feishu_client.py
@ -209,10 +209,87 @@ def read_sheet_data(spreadsheet_token, sheet_id, cell_range=None):
    return rows
 def _parse_lark_table(table_html):
    """
    解析飞书 lark-table HTML 为二维数组。
    与 parse_script.parse_lark_table 功能相同，在此独立实现以避免循环导入。
    注意：保留单元格内的换行（<br> 转为 \n），因为组件类型列依赖换行分隔类型名和 cId。
    """
    rows = []
    tr_pattern = re.compile(r'<lark-tr>(.*?)</lark-tr>', re.DOTALL)
    td_pattern = re.compile(r'<lark-td(?:\s+[^>]*)?>(.*?)</lark-td>', re.DOTALL)
    for tr_match in tr_pattern.finditer(table_html):
        tr_content = tr_match.group(1)
        cells = []
        for td_match in td_pattern.finditer(tr_content):
            cell_content = td_match.group(1).strip()
            # <br> 标签转为换行符（保留多行结构）
            cell_text = re.sub(r'<br\s*/?>', '\n', cell_content)
            # 清理其他 HTML 标签
            cell_text = re.sub(r'<[^>]+>', '', cell_text).strip()
            # 清理多余空白，但保留换行符
            cell_text = re.sub(r'[ \t]+', ' ', cell_text).strip()
            cell_text = re.sub(r'\n{3,}', '\n\n', cell_text)
            cells.append(cell_text)
        rows.append(cells)
    return rows
 def _extract_lark_tables_from_markdown(markdown):
    """
    Collect every script-like ``lark-table`` found in a document's markdown.

    Serves as the fallback data source when the document has no embedded
    ``<sheet>`` tag. Each ``<lark-table>`` element is parsed with
    ``_parse_lark_table`` and kept only if it passes all filters:

    - it has at least two rows (a header plus at least one data row);
    - its header does not contain both "角色" and "section"
      (case-insensitive), i.e. it is not a role/section mapping table;
    - its header contains at least one script keyword
      (组件配置 / 台词 / 剧情描述 / 角色名 / 配置信息).

    Args:
        markdown: the document markdown text.

    Returns:
        list[list[list]]: the accepted tables, each as a 2-D list of rows.
    """
    script_keywords = ('组件配置', '台词', '剧情描述', '角色名', '配置信息')
    accepted = []

    for table_match in re.finditer(r'<lark-table[^>]*>(.*?)</lark-table>', markdown, re.DOTALL):
        # The whole element (tags included) is handed to the row parser.
        rows = _parse_lark_table(table_match.group(0))
        if len(rows) < 2:
            # Nothing, or a header without any data row.
            continue

        header_text = ' '.join(str(cell) for cell in filter(None, rows[0]))

        # Role/section mapping tables are not script tables.
        is_role_section_table = '角色' in header_text and 'section' in header_text.lower()
        if is_role_section_table:
            logger.debug(f"跳过角色-section对应表: {header_text[:60]}")
            continue

        # Pure summary tables carry none of the script columns.
        looks_like_script = False
        for keyword in script_keywords:
            if keyword in header_text:
                looks_like_script = True
                break
        if not looks_like_script:
            logger.debug(f"跳过非剧本表格（无组件配置/台词列）: {header_text[:60]}")
            continue

        accepted.append(rows)
        logger.info(f"从 markdown lark-table 提取表格: {len(rows)} 行, 表头: {header_text[:80]}")

    return accepted
 def read_wiki_doc_with_sheet(wiki_url_or_token):
    """
    一站式读取: wiki URL/token → 文档markdown + 内嵌sheet数据
    数据源优先级：
    1. 内嵌 <sheet> 标签（飞书电子表格）
    2. markdown 中的 <lark-table>（内联表格，fallback）
    Args:
        wiki_url_or_token: 飞书 wiki URL 或 wiki_token
@ -224,6 +301,9 @@ def read_wiki_doc_with_sheet(wiki_url_or_token):
            "markdown": str,
            "sheet_token": (spreadsheet_token, sheet_id) or None,
            "sheet_rows": list[list] or None,
            "all_sheets": list[list[list]],
            "all_sheet_tokens": list,
            "data_source": "sheet" | "lark_table" | None,
        }
    """
    # 解析 wiki_token
@ -244,6 +324,8 @@ def read_wiki_doc_with_sheet(wiki_url_or_token):
    sheet_infos = extract_sheet_tokens(markdown)
    all_sheet_rows = []
    sheet_tokens = []
    data_source = None
    for sheet_info in sheet_infos:
        spreadsheet_token, sheet_id = sheet_info
        if sheet_id:
@ -257,6 +339,19 @@ def read_wiki_doc_with_sheet(wiki_url_or_token):
        else:
            logger.warning(f"sheet_token 中未包含 sheet_id: {sheet_info}")
    if all_sheet_rows:
        data_source = "sheet"
    else:
        # Fallback: 尝试从 markdown lark-table 中提取剧本表格
        logger.info("未找到内嵌 sheet，尝试从 markdown lark-table 解析...")
        lark_tables = _extract_lark_tables_from_markdown(markdown)
        if lark_tables:
            all_sheet_rows = lark_tables
            data_source = "lark_table"
            logger.info(f"从 markdown lark-table 成功提取 {len(lark_tables)} 个表格作为 fallback 数据源")
        else:
            logger.warning("markdown 中也未找到有效的剧本 lark-table")
    # 兼容旧接口：sheet_rows 取第一个（向后兼容），新增 all_sheets
    sheet_rows = all_sheet_rows[0] if all_sheet_rows else None
    sheet_token = sheet_tokens[0] if sheet_tokens else None
@ -270,6 +365,7 @@ def read_wiki_doc_with_sheet(wiki_url_or_token):
        "sheet_rows": sheet_rows,
        "all_sheets": all_sheet_rows,
        "all_sheet_tokens": sheet_tokens,
        "data_source": data_source,
    }
--- a/business_production/组件配置/interactive-component-json/scripts/pipeline.py
+++ b/business_production/组件配置/interactive-component-json/scripts/pipeline.py
@ -219,6 +219,7 @@ def process_script(wiki_url_or_token, db_path=None, dry_run=False, target_cids=N
        "skipped": 0,
        "results": [],
        "errors": [],
        "data_source": None,
    }
    # 设置文件日志（早期设置，后续用标题重命名）
@ -229,15 +230,20 @@ def process_script(wiki_url_or_token, db_path=None, dry_run=False, target_cids=N
    try:
        doc_data = read_wiki_doc_with_sheet(wiki_url_or_token)
        report["title"] = doc_data["title"]
-        logger.info(f"文档: {doc_data['title']}, obj_token={doc_data['obj_token']}")
+        report["data_source"] = doc_data.get("data_source")
        logger.info(f"文档: {doc_data['title']}, obj_token={doc_data['obj_token']}, data_source={doc_data.get('data_source')}")
    except Exception as e:
        report["errors"].append(f"读取文档失败: {e}")
        logger.error(f"读取文档失败: {e}")
        return report
    data_source = doc_data.get("data_source")
    if data_source == "lark_table":
        logger.info("数据源: markdown lark-table (fallback)")
    if not doc_data["sheet_rows"] and not doc_data.get("all_sheets"):
-        report["errors"].append("文档中未找到内嵌sheet数据")
+        report["errors"].append("文档中未找到内嵌sheet数据，也未找到有效的lark-table剧本表格")
-        logger.error("文档中未找到内嵌sheet数据")
+        logger.error("文档中未找到任何有效数据源（sheet + lark-table 均为空）")
        return report
    # Step 2: 解析剧本（尝试所有sheet，取组件数最多的结果）
--- a/memory/.dreams/events.jsonl
+++ b/memory/.dreams/events.jsonl
@ -79,3 +79,6 @@
 {"type":"memory.recall.recorded","timestamp":"2026-05-29T00:23:17.391Z","query":"输出文本和JSON 题型 单元挑战 questionSet 看图组词 单词释义","resultCount":1,"results":[{"path":"memory/2026-05-28.md","startLine":46,"endLine":62,"score":1}]}
 {"type":"memory.recall.recorded","timestamp":"2026-05-29T00:23:17.392Z","query":"reading_pic_makeWord reading_word_definition listening_listenWrite reading_select_cloze 修正","resultCount":5,"results":[{"path":"memory/2026-05-07.md","startLine":1,"endLine":20,"score":1},{"path":"memory/2026-05-11.md","startLine":1,"endLine":25,"score":1},{"path":"memory/2026-05-17.md","startLine":19,"endLine":41,"score":1},{"path":"memory/2026-05-12.md","startLine":170,"endLine":193,"score":1},{"path":"memory/2026-05-21.md","startLine":52,"endLine":67,"score":1}]}
 {"type":"memory.recall.recorded","timestamp":"2026-05-29T02:33:30.526Z","query":"梁晨 l1_pedagogy_rules pedagogy rules JS文件","resultCount":6,"results":[{"path":"memory/2026-05-26.md","startLine":130,"endLine":166,"score":1},{"path":"memory/2026-05-28.md","startLine":46,"endLine":62,"score":1},{"path":"memory/2026-05-25.md","startLine":283,"endLine":302,"score":1},{"path":"memory/2026-05-25.md","startLine":118,"endLine":137,"score":1},{"path":"memory/2026-05-28.md","startLine":60,"endLine":66,"score":1},{"path":"memory/2026-05-20.md","startLine":1,"endLine":31,"score":1}]}
 {"type":"memory.recall.recorded","timestamp":"2026-06-01T03:18:26.408Z","query":"句型库 L1 L2 pattern get home talk to","resultCount":2,"results":[{"path":"memory/2026-05-28.md","startLine":60,"endLine":66,"score":1},{"path":"memory/2026-05-28.md","startLine":46,"endLine":62,"score":1}]}
 {"type":"memory.recall.recorded","timestamp":"2026-06-01T10:44:19.887Z","query":"互动组件生产 中互动 核心互动 剧本表格 对话类","resultCount":3,"results":[{"path":"memory/2026-05-07.md","startLine":354,"endLine":368,"score":1},{"path":"memory/2026-05-18.md","startLine":793,"endLine":810,"score":1},{"path":"memory/2026-04-22.md","startLine":1,"endLine":8,"score":1}]}
 {"type":"memory.recall.recorded","timestamp":"2026-06-01T10:44:47.489Z","query":"王璐辰 反馈 组件配置 中互动 核心互动 问题","resultCount":2,"results":[{"path":"memory/2026-04-30.md","startLine":50,"endLine":63,"score":1},{"path":"memory/2026-05-07.md","startLine":354,"endLine":368,"score":1}]}
--- a/memory/.dreams/short-term-recall.json
+++ b/memory/.dreams/short-term-recall.json
@ -1,6 +1,6 @@
 {
  "version": 1,
-  "updatedAt": "2026-05-29T02:33:30.526Z",
+  "updatedAt": "2026-06-01T10:44:47.489Z",
  "entries": {
    "memory:memory/2026-05-07.md:57:74": {
      "key": "memory:memory/2026-05-07.md:57:74",
@ -439,20 +439,22 @@
      "endLine": 63,
      "source": "memory",
      "snippet": "格式: 【任务标题】【情境引入】【互动内容】【互动反馈】【后置对话】。目标词用$...$包裹，选项为图片区域编号(00/01/02)。 #### 5. 对话选择 (1条) — 规则: 参考 teaching_config 示例格式 + mid_dialog_choose prompt | ID | 任务标题 | 选项 | 知识点 | |---|---------|------|--------| | 1115514 | 判断蕃茄的好坏 | It is a good one. / It is a bad one. | one | 格式: 【任务标题】【资源配置】【情境引入】【互动内容】【后置对话】。互动内容含要求+选项+反馈。 ### 关键经验 - **组件配置列位置变更**: 本轮操作中发现sheet新增了「配置信息」列(C列)，导致组件配置从F列移至G列。后续操作需先确认列结构。 - **Bot身份操作**: 所有飞书 sheet 写入均使用 Bot 身份（App ID: cli_a931175d41799cc7），Feishu API Token: t-g1044ubUXXMPFXBW75RILL7YZKRGAKQGWWDV2R3D - **对话选择组件**: 无独立 SKILL.md，需参照 interactive-component-json 的 prompt_registry.py 中 mid_dialog_choose 模板 + bitable 中已有的 teaching_config 示例格式",
-      "recallCount": 2,
+      "recallCount": 3,
      "dailyCount": 0,
      "groundedCount": 0,
-      "totalScore": 2,
+      "totalScore": 3,
      "maxScore": 1,
      "firstRecalledAt": "2026-05-13T03:09:54.362Z",
-      "lastRecalledAt": "2026-05-28T10:04:34.024Z",
+      "lastRecalledAt": "2026-06-01T10:44:47.489Z",
      "queryHashes": [
        "f151bc633ad1",
-        "71e44ea68b09"
+        "71e44ea68b09",
        "f53c9769e692"
      ],
      "recallDays": [
        "2026-05-13",
-        "2026-05-28"
+        "2026-05-28",
        "2026-06-01"
      ],
      "conceptTags": [
        "00/01/02",
@ -872,13 +874,13 @@
      "endLine": 368,
      "source": "memory",
      "snippet": "- **需求：** 将 020102（I am...）和 020103（I am ready / Thank you）两套题合并为一个 `{first:..., second:...}` JSON，统一 questionSetID=0000001 - **状态：** ✅ 已完成 - **核心考点分析（用户强调）：** 需分析每个句型的核心考点（孩子最容易犯错的地方），挖空对准核心考点 - I am/from 组：am（系动词第一人称）、from（介词选择）、student（a+名词结构） - Thank you for 组：for（介词选择，非 you）、helping（for+动名词，非 help/to help） - **输出文件：** `output/writing_pic_qa_combined.json` ### 刘彦江 — 组件配置-json 请求（L1-S2-U13-L4 沙漠之花） - **时间：** 16:45 ~ 17:51 - **文档：** `https://makee-interactive.feishu.cn/wiki/K5E1wzwk7it9t7kXvcbc6Xugnhc` - **状态：** ⚠️ 未完成 — pipeline 识别到 0 组件 - **根因：** 剧本文档的13个组件数据存储在 markdown 内联表格中（lark-table，5列×36行），而非内嵌 Sheet。当前 pipeline 的 parse_script 只从内嵌 Sheet 读取组件数据，不支持 markdown 表格组件解析 - **已识别组件（ma",
-      "recallCount": 10,
+      "recallCount": 12,
      "dailyCount": 0,
      "groundedCount": 0,
-      "totalScore": 10,
+      "totalScore": 12,
      "maxScore": 1,
      "firstRecalledAt": "2026-05-15T07:13:08.147Z",
-      "lastRecalledAt": "2026-05-28T02:24:09.414Z",
+      "lastRecalledAt": "2026-06-01T10:44:47.489Z",
      "queryHashes": [
        "08364c8746ab",
        "4f08741ab4fd",
@ -889,7 +891,9 @@
        "612fa3b04b06",
        "d6a04b711fd9",
        "8742c0bf4e2b",
-        "eb0902db0156"
+        "eb0902db0156",
        "229e5d3943bb",
        "f53c9769e692"
      ],
      "recallDays": [
        "2026-05-15",
@ -897,7 +901,8 @@
        "2026-05-18",
        "2026-05-21",
        "2026-05-25",
-        "2026-05-28"
+        "2026-05-28",
        "2026-06-01"
      ],
      "conceptTags": [
        "am/from",
@ -917,13 +922,13 @@
      "endLine": 8,
      "source": "memory",
      "snippet": "[李应瑛 2026-04-22 提出要求] 所有需要包含对话的内容（如剧本、互动组件等）必须要有【后置对话】字段，无后置对话时填写“无”。 [李应瑛 2026-04-22 确认规则] 剧本内嵌表格组件填写位置规则：仅当表格第一列（A列）明确标注为对话类类型（对话朗读/对话挖空/对话选读/对话组句等）时，才在同一行的H列（【组件】列）填写对应的组件内容，其他类型行（TL/场景/角色/图片/非对话类等）无需填写。 [李应瑛 2026-04-22 确认格式规则] 对话类组件字段换行规则：每个结构单独占一行，格式为： 【任务标题】xxx 【情境引入】xxx 【互动内容】xxx 【后置对话】xxx 单元格内使用\\n作为换行符实现，后续所有组件均遵循此格式。",
-      "recallCount": 9,
+      "recallCount": 10,
      "dailyCount": 0,
      "groundedCount": 0,
-      "totalScore": 9,
+      "totalScore": 10,
      "maxScore": 1,
      "firstRecalledAt": "2026-05-15T07:13:08.147Z",
-      "lastRecalledAt": "2026-05-28T02:24:09.414Z",
+      "lastRecalledAt": "2026-06-01T10:44:19.887Z",
      "queryHashes": [
        "08364c8746ab",
        "4f08741ab4fd",
@ -933,7 +938,8 @@
        "612fa3b04b06",
        "d6a04b711fd9",
        "8742c0bf4e2b",
-        "eb0902db0156"
+        "eb0902db0156",
        "229e5d3943bb"
      ],
      "recallDays": [
        "2026-05-15",
@ -941,7 +947,8 @@
        "2026-05-18",
        "2026-05-21",
        "2026-05-25",
-        "2026-05-28"
+        "2026-05-28",
        "2026-06-01"
      ],
      "conceptTags": [
        "对话朗读/对话挖空/对话选读/对话组句等",
@ -1484,25 +1491,27 @@
      "endLine": 810,
      "source": "memory",
      "snippet": "- Script sheet: `wMQVyV`（186行×9列） - Knowledge points sheet: `DCcKsLbrmhfXgrtB7N2c9GA4ntf_NtIcXt` - 列结构：A=类型, D=剧情描述, E=角色名, F=编剧台词（English已填好）, **G=组件配置** - 知识点：point, talk, understand, a lot of + `talk to...` / `I can/can't understand...` ### 交互模式差异 - L5 文档 B列/C列 为空，无详细组件类型标签（仅 A=互动/核心互动-口语） - 编剧台词F列已全部填写（英文） - 互动行的 User 台词含红色标注知识点词 ### 完成事项 - 24个互动行全部生成G列配置（组件类型推断：听力挖空、朗读台词、口语表达） - 写入方式：同上 Sheets v2 API - 24/24 全部回读验证通过 ### 脚本 `scripts/write_L5_G_configs.py`",
-      "recallCount": 5,
+      "recallCount": 6,
      "dailyCount": 0,
      "groundedCount": 0,
-      "totalScore": 5,
+      "totalScore": 6,
      "maxScore": 1,
      "firstRecalledAt": "2026-05-20T06:31:39.981Z",
-      "lastRecalledAt": "2026-05-26T02:59:03.427Z",
+      "lastRecalledAt": "2026-06-01T10:44:19.887Z",
      "queryHashes": [
        "688d2dceca9d",
        "d05a0257d44b",
        "e637236fe74b",
        "14d903a64d04",
-        "240a6a5dca41"
+        "240a6a5dca41",
        "229e5d3943bb"
      ],
      "recallDays": [
        "2026-05-20",
        "2026-05-21",
        "2026-05-25",
-        "2026-05-26"
+        "2026-05-26",
        "2026-06-01"
      ],
      "conceptTags": [
        "can/can",
@ -2655,19 +2664,21 @@
      "endLine": 62,
      "source": "memory",
      "snippet": "- B级规则：听力句子 5-8 词（均 7 词），每组 3 张同类物品不同属性图片 - 能力标签：显性事实理解｜关键词识别 ×2 + 基础语境理解｜场景/物品/动作识别 ## 梁辰（user_id: ou_28f02dcada1193913cfbb6310f8daf07）— HTML 诊断页面教研规则 JS 文件 - 用户有一个 L1 关卡诊断工作台 HTML 页面（部署在腾讯云空间），当前只做数量级统计，缺少教研规则校验 - 需求：提供可嵌入 HTML 的前端 JS 校验规则文件，使诊断有据可依 - 输出：`output/l1_pedagogy_rules.js`（42KB），包含 8 个校验维度： 1. 词汇超纲检测（基于 L1/L2 词库） 2. 英式拼写检测（color→colour 等 115 组映射） 3. Markdown 标记检测（`**`/`#`/`>` 等） 4. 标点规范检测（全角混入、`～`、`!!!`） 5. 题型-阶段匹配校验（20 种题型对应的 L1/L2 阶段映射） 6. 台词质量分析（句子长度、负面评价） 7. 知识点曝光度 8. 句型合规检测 - 接入方式：`<script src=\"l1_pedagogy_rules.js\"></script>` + 调用 `PedagogyRules.validate(summary, level)` - 数据源：L1 词库 147 词（过滤 enabled=true）、L2 词库 52 词、L1 句型 8 个、L2 句型（从 437MB bitable 导出中提取唯一结",
-      "recallCount": 2,
+      "recallCount": 3,
      "dailyCount": 0,
      "groundedCount": 0,
-      "totalScore": 2,
+      "totalScore": 3,
      "maxScore": 1,
      "firstRecalledAt": "2026-05-29T00:23:17.391Z",
-      "lastRecalledAt": "2026-05-29T02:33:30.526Z",
+      "lastRecalledAt": "2026-06-01T03:18:26.408Z",
      "queryHashes": [
        "c1d4076205e1",
-        "4aa0ef719160"
+        "4aa0ef719160",
        "e30c130b9d1d"
      ],
      "recallDays": [
-        "2026-05-29"
+        "2026-05-29",
        "2026-06-01"
      ],
      "conceptTags": [
        "5-8",
@ -2842,18 +2853,20 @@
      "endLine": 66,
      "source": "memory",
      "snippet": "8. 句型合规检测 - 接入方式：`<script src=\"l1_pedagogy_rules.js\"></script>` + 调用 `PedagogyRules.validate(summary, level)` - 数据源：L1 词库 147 词（过滤 enabled=true）、L2 词库 52 词、L1 句型 8 个、L2 句型（从 437MB bitable 导出中提取唯一结构） - 技术注意：L2_pattern_list.json 体积 437MB（1,082,450 条记录），直接嵌入前端不可行，已提取唯一句型结构后嵌入 - 所有 6 个测试用例通过 - 用户后续想尝试其他对接方式（API 模式 / 飞书 Bot 联动 / CI 集成）",
-      "recallCount": 1,
+      "recallCount": 2,
      "dailyCount": 0,
      "groundedCount": 0,
-      "totalScore": 1,
+      "totalScore": 2,
      "maxScore": 1,
      "firstRecalledAt": "2026-05-29T02:33:30.526Z",
-      "lastRecalledAt": "2026-05-29T02:33:30.526Z",
+      "lastRecalledAt": "2026-06-01T03:18:26.408Z",
      "queryHashes": [
-        "4aa0ef719160"
+        "4aa0ef719160",
        "e30c130b9d1d"
      ],
      "recallDays": [
-        "2026-05-29"
+        "2026-05-29",
        "2026-06-01"
      ],
      "conceptTags": [
        "l1-pedagogy-rules.js",