feat: 新增按组件维度导出学习记录能力

- 新增 export_component_records.py: 按 c_type + c_id 导出组件学习记录 - 新增 export_component_records.sh: wrapper 脚本（含数据库凭证） - 更新 export-user-data SKILL.md: 增加模式二（按组件导出）文档 - 支持 27 种 mid_* 和 15 种 core_* 组件类型 - 输出字段: user_id, 音频URL, 发音评分, 朗读内容, 判定结果等 - 按 updated_at 倒序排列，支持可选时间范围过滤
2026-06-04 14:57:48 +08:00 · 2026-06-04 14:57:48 +08:00 · d32cccab38
commit d32cccab38
parent 122cd759fc
3 changed files with 559 additions and 10 deletions
--- a/scripts/export_component_records.py
+++ b/scripts/export_component_records.py
@ -0,0 +1,390 @@
 #!/usr/bin/env python3
 """
 按组件维度导出学习记录
 功能：根据指定的组件类型(c_type)和组件ID(c_id)，导出该组件的所有用户学习记录。
 数据来源：PG user_component_play_record_0~7 + MySQL 组件配置表
 用法：
  python3 export_component_records.py --c-type mid_sentence_dialogue --c-id 1112315
  python3 export_component_records.py --c-type mid_sentence_dialogue --c-id 1112315 --start-date 20260101
  python3 export_component_records.py --c-type mid_sentence_dialogue --c-id 1112315 --start-date 20260101 --end-date 20260604
  python3 export_component_records.py --c-type mid_sentence_dialogue --c-id 1112315 --output-dir /tmp/
 """
 import argparse
 import os
 import json
 import sys
 from datetime import datetime
 from typing import Any, Dict, List, Optional
 import psycopg2
 import pymysql
 import pandas as pd
 # ==================== Component type mapping ====================
 # Maps each component type code (c_type) to its Chinese display name.
 # Per the original author's note, the table is meant to cover every c_type that
 # actually occurs in the MySQL tables middle_interaction_component and
 # core_interaction_component.
 C_TYPE_NAME_MAPPING = {
    # middle_interaction_component — dialogue components
    # NOTE(review): mid_dialog_choose and mid_dialog_select both map to the same
    # display name below; confirm whether one of them should be named differently.
    "mid_dialog_choose": "对话选择",
    "mid_dialog_express": "对话表达",
    "mid_dialog_fillin": "对话填空",
    "mid_dialog_repeat": "对话跟读",
    "mid_dialog_select": "对话选择",
    "mid_dialog_sentence": "对话组句",
    # middle_interaction_component — image components
    "mid_image_choose": "图片选择",
    "mid_image_drag": "图片拖拽",
    "mid_image_multiple": "图片多选",
    "mid_image_sequence": "图片排序",
    # middle_interaction_component — message / text components
    "mid_message_combine": "消息组合",
    "mid_message_fillin": "消息填空",
    "mid_message_sentence": "消息组句",
    "mid_message_spell": "消息拼写",
    "mid_message_trace": "消息描红",
    "mid_message_word": "消息选词",
    # middle_interaction_component — grammar / pronunciation / vocabulary / sentence components
    "mid_grammar_cloze": "语法挖空",
    "mid_grammar_sentence": "语法组句",
    "mid_pron_pron": "发音互动",
    "mid_sentence_dialogue": "句子对话",
    "mid_sentence_makeSentence": "句子造句",
    "mid_sentence_material": "句子材料",
    "mid_sentence_voice": "句子语音",
    "mid_vocab_fillBlank": "词汇填空",
    "mid_vocab_image": "词汇图片",
    "mid_vocab_instruction": "词汇指令",
    "mid_vocab_item": "词汇物品",
    # core_interaction_component
    "core_listening_choose": "听力选择",
    "core_listening_drag": "听力拖拽",
    "core_listening_order": "听力排序",
    "core_reading_imageDrag": "阅读图片拖拽",
    "core_reading_order": "阅读排序",
    "core_speaking_explore": "口语探讨",
    "core_speaking_image": "口语图片",
    "core_speaking_inquiry": "口语妙问",
    "core_speaking_monologue": "口语独白",
    "core_speaking_reply": "口语快答",
    "core_writing_imgMakeSentence": "写作看图组句",
    "core_writing_imgMakeWord": "写作看图组词",
    "core_writing_imgWrite": "写作看图撰写",
    "core_writing_questionMakeSentence": "写作问题组句",
    "core_writing_questionWrite": "写作问题撰写",
 }
 # ==================== Output column definitions ====================
 # Names and order of the columns written to the exported sheet. main() adds any
 # column from this list that is missing (as an empty string) and then selects the
 # DataFrame by this list, so the order here is the order in the Excel file.
 OUTPUT_COLUMNS = [
    "user_id",
    "session_id",
    "component_unique_code",
    "c_type",
    "c_id",
    "组件名称",
    "组件标题",
    "mode",
    "参考文本",
    "play_result",
    "发音评分",
    "音频URL",
    "朗读内容",
    "user_behavior_info",
    "updated_at",
 ]
 def get_pg_conn():
    """Open a PostgreSQL connection configured from environment variables.

    Reads PG_DB_HOST, PG_DB_PORT, PG_DB_USER, PG_DB_PASSWORD and PG_DB_DATABASE.
    A variable that is not set is handed to psycopg2 as None.

    Returns:
        The connection object returned by psycopg2.connect().
    """
    settings = {
        "host": os.getenv("PG_DB_HOST"),
        "port": os.getenv("PG_DB_PORT"),
        "user": os.getenv("PG_DB_USER"),
        "password": os.getenv("PG_DB_PASSWORD"),
        "database": os.getenv("PG_DB_DATABASE"),
    }
    return psycopg2.connect(**settings)
 def get_mysql_conn():
    """Open a MySQL connection to the vala_test database.

    Host, user and password come from MYSQL_HOST, MYSQL_USERNAME and
    MYSQL_PASSWORD; the port comes from MYSQL_PORT and falls back to 3306.
    The database name is fixed to "vala_test" (where the component
    configuration tables are queried) and the charset to utf8mb4.

    Returns:
        The connection object returned by pymysql.connect().
    """
    settings = {
        "host": os.getenv("MYSQL_HOST"),
        "user": os.getenv("MYSQL_USERNAME"),
        "password": os.getenv("MYSQL_PASSWORD"),
        "database": "vala_test",
        "port": int(os.getenv("MYSQL_PORT", 3306)),
        "charset": "utf8mb4",
    }
    return pymysql.connect(**settings)
 def get_component_config(mysql_conn, c_type: str, c_id: str) -> Optional[Dict]:
    """Load the title and parsed configuration of one component from MySQL.

    The table is chosen from the c_type prefix: "mid..." reads
    middle_interaction_component and "core..." reads core_interaction_component.

    Args:
        mysql_conn: An open DB-API connection providing cursor().
        c_type: Component type code, e.g. "mid_sentence_dialogue".
        c_id: Component id.

    Returns:
        {"title": str, "config": dict} for the first matching row, or None when
        the prefix is unknown or no row matches. "config" is always a dict: a
        missing, unparsable or non-object component_config yields {}.
    """
    if c_type.startswith("mid"):
        table = "middle_interaction_component"
    elif c_type.startswith("core"):
        table = "core_interaction_component"
    else:
        print(f"  [WARN] 未知的c_type前缀: {c_type}，无法查询组件配置")
        return None

    cursor = mysql_conn.cursor()
    try:
        # Only the table name is interpolated, and it comes from the fixed
        # choices above; the user-supplied values are bound as parameters.
        query = f"SELECT title, component_config FROM {table} WHERE c_type = %s AND c_id = %s"
        cursor.execute(query, (c_type, c_id))
        row = cursor.fetchone()
    finally:
        cursor.close()

    if not row:
        return None

    title, raw_config = row
    config = raw_config
    if isinstance(raw_config, (str, bytes, bytearray)):
        if raw_config:
            try:
                config = json.loads(raw_config)
            except ValueError as e:
                # JSONDecodeError and UnicodeDecodeError are both ValueError.
                # Keep the export going, but say that the config was dropped.
                print(f"  [WARN] 组件配置JSON解析失败，将使用空配置: {e}")
                config = {}
        else:
            config = {}
    # Callers use config.get(...), so anything that is not a JSON object
    # (null, a list, a number, ...) is normalised to an empty dict.
    if not isinstance(config, dict):
        config = {}
    return {"title": title or "", "config": config}
 def extract_audio_from_behavior(user_behavior_info: Any) -> Dict[str, Any]:
    """Pull the audio-related fields out of a user_behavior_info value.

    The value may be a JSON string or an already-decoded object. A list is
    represented by its first element. Anything that does not end up as a
    dict (None, empty values, invalid JSON, a list of non-objects, ...)
    produces the all-empty result instead of raising.

    Args:
        user_behavior_info: Raw column value (str/bytes JSON, dict, list or None).

    Returns:
        A dict with the keys "发音评分", "音频URL" and "朗读内容", filled from
        pronunciationScore, userAudio and expressContent; "" where absent.
    """
    result = {"发音评分": "", "音频URL": "", "朗读内容": ""}

    payload = user_behavior_info
    if isinstance(payload, (str, bytes, bytearray)):
        try:
            payload = json.loads(payload)
        except ValueError:
            # Not JSON (this also covers the empty string): nothing to extract.
            return result

    # A decoded list is handled the same way as a JSON-encoded one:
    # only the first entry is used.
    if isinstance(payload, (list, tuple)):
        payload = payload[0] if payload else None

    if not isinstance(payload, dict):
        return result

    result["发音评分"] = payload.get("pronunciationScore", "")
    result["音频URL"] = payload.get("userAudio", "")
    result["朗读内容"] = payload.get("expressContent", "")
    return result
 def get_mode_from_config(c_type: str, config: Dict) -> str:
    """Return the mode recorded in a component configuration.

    Looks at config["question"] when it is a dict and returns the first
    non-empty value among "mode" (e.g. express/read on mid_sentence_dialogue)
    and "desc" (present on mid_dialog_express and similar types).

    Args:
        c_type: Component type code (not used for the lookup).
        config: Parsed component configuration.

    Returns:
        The value found, or "" when neither field is set.
    """
    question = config.get("question", {})
    if not isinstance(question, dict):
        return ""
    for field in ("mode", "desc"):
        value = question.get(field, "")
        if value:
            return value
    return ""
 def get_ref_text_from_config(c_type: str, config: Dict) -> str:
    """Return the reference text stored in a component configuration.

    Lookup order:
      1. question.content (used by mid_sentence_dialogue)
      2. question.desc (used by mid_dialog_express)
      3. example.content (fallback for the other types)

    The first two are only used when non-empty; the third is returned as
    stored whenever config["example"] is a dict.

    Args:
        c_type: Component type code (not used for the lookup).
        config: Parsed component configuration.

    Returns:
        The reference text, or "" when none of the locations provides one.
    """
    question = config.get("question", {})
    if isinstance(question, dict):
        for field in ("content", "desc"):
            text = question.get(field, "")
            if text:
                return text

    example = config.get("example", {})
    if not isinstance(example, dict):
        return ""
    return example.get("content", "")
 def get_component_display_name(c_type: str, config: Dict) -> str:
    """Return the display name for a component type.

    Args:
        c_type: Component type code.
        config: Parsed component configuration; accepted but not consulted.

    Returns:
        The name registered in C_TYPE_NAME_MAPPING, or c_type itself when the
        code is not in the mapping.
    """
    return C_TYPE_NAME_MAPPING.get(c_type, c_type)
 def query_pg_data(
    pg_conn,
    c_type: str,
    c_id: str,
    start_time: Optional[str],
    end_time: Optional[str],
 ) -> pd.DataFrame:
    """Collect the play records of one component from the eight sharded tables.

    Queries user_component_play_record_0 .. user_component_play_record_7 with
    the same filter, prints a per-table row count, and merges the non-empty
    results. A table whose query fails is reported and skipped.

    Args:
        pg_conn: Open database connection handed to pandas.read_sql_query.
        c_type: Component type code to match.
        c_id: Component id to match.
        start_time: Inclusive lower bound on updated_at, or None/"" for no bound.
        end_time: Inclusive upper bound on updated_at, or None/"" for no bound.

    Returns:
        All matching rows sorted by updated_at in descending order with a fresh
        index, or an empty DataFrame when no table returned rows.
    """
    # The filter does not depend on the shard, so it is assembled once.
    clauses = ["c_type = %s", "c_id = %s"]
    bind_values = [c_type, c_id]
    for bound, clause in ((start_time, "updated_at >= %s"), (end_time, "updated_at <= %s")):
        if bound:
            clauses.append(clause)
            bind_values.append(bound)
    where = " AND ".join(clauses)

    frames = []
    for shard in range(8):
        table = f"user_component_play_record_{shard}"
        query = f"""
            SELECT user_id, session_id, component_unique_code, c_type, c_id,
                   play_result, user_behavior_info, updated_at
            FROM {table}
            WHERE {where}
            ORDER BY updated_at DESC
        """
        try:
            shard_df = pd.read_sql_query(query, pg_conn, params=bind_values)
            if not shard_df.empty:
                frames.append(shard_df)
                print(f"  {table}: {len(shard_df)} 条")
        except Exception as e:
            print(f"  {table}: 查询异常 - {e}")

    if not frames:
        return pd.DataFrame()

    # Each shard is already ordered; re-sort so the order holds across shards.
    merged = pd.concat(frames, ignore_index=True)
    return merged.sort_values("updated_at", ascending=False).reset_index(drop=True)
 def _parse_day_bound(parser, raw: Optional[str], option: str, end_of_day: bool = False) -> Optional[str]:
    """Turn a YYYYMMDD command-line value into a timestamp bound string.

    Args:
        parser: ArgumentParser used to report a malformed value as a usage error.
        raw: The option value, or None when the option was not given.
        option: Option name shown in the error message.
        end_of_day: Return the last microsecond of the day instead of midnight.

    Returns:
        "YYYY-MM-DD 00:00:00", "YYYY-MM-DD 23:59:59.999999", or None when raw
        is empty.
    """
    if not raw:
        return None
    try:
        day = datetime.strptime(raw.strip(), "%Y%m%d")
    except ValueError:
        # parser.error() prints the usage text and exits with status 2.
        parser.error(f"{option} 日期格式错误: {raw!r}（应为 YYYYMMDD）")
    clock = "23:59:59.999999" if end_of_day else "00:00:00"
    return day.strftime("%Y-%m-%d ") + clock


 def _behavior_to_text(value: Any) -> Any:
    """Serialise a decoded JSON object to text so it can be written to a cell."""
    if isinstance(value, (dict, list)):
        return json.dumps(value, ensure_ascii=False)
    return value


 def main():
    """Command-line entry point: export all play records of one component.

    Steps: parse the arguments, read the component configuration from MySQL,
    read the play records from the sharded PG tables, derive the audio and
    configuration columns, write the Excel file and print a summary.

    Exits with status 2 on invalid arguments and status 1 when no record
    matches.
    """
    parser = argparse.ArgumentParser(
        description="按组件维度导出学习记录",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
 示例:
  %(prog)s --c-type mid_sentence_dialogue --c-id 1112315
  %(prog)s --c-type mid_sentence_dialogue --c-id 1112315 --start-date 20260101
  %(prog)s --c-type core_speaking_reply --c-id 42 --start-date 20260501 --end-date 20260604
        """,
    )
    parser.add_argument("--c-type", required=True, help="组件类型，如 mid_sentence_dialogue")
    parser.add_argument("--c-id", required=True, help="组件ID，如 1112315")
    parser.add_argument("--start-date", help="起始日期 YYYYMMDD（可选，不传则不限）")
    parser.add_argument("--end-date", help="截止日期 YYYYMMDD（可选，不传则不限）")
    parser.add_argument("--output-dir", default="output", help="输出目录（默认 output/）")
    args = parser.parse_args()
    c_type = args.c_type.strip()
    c_id = args.c_id.strip()

    # Time range. The end bound is the last microsecond of the day so that an
    # inclusive "<=" comparison still covers rows stamped 23:59:59.xxx.
    start_time = _parse_day_bound(parser, args.start_date, "--start-date")
    end_time = _parse_day_bound(parser, args.end_date, "--end-date", end_of_day=True)
    # Both bounds share one fixed-width format, so string order is time order.
    if start_time and end_time and start_time > end_time:
        parser.error("--start-date 不能晚于 --end-date")

    print(f"{'=' * 60}")
    print("按组件导出学习记录")
    print(f"组件类型: {c_type}")
    print(f"组件ID:   {c_id}")
    if start_time:
        print(f"起始时间: {start_time}")
    if end_time:
        print(f"截止时间: {end_time}")
    print(f"{'=' * 60}")

    # ===== 1. Component configuration =====
    print("\n[1/3] 获取组件配置...")
    mysql_conn = get_mysql_conn()
    try:
        comp_config = get_component_config(mysql_conn, c_type, c_id)
    finally:
        mysql_conn.close()
    if comp_config is None:
        comp_config = {"title": "", "config": {}}
        print("  未找到组件配置（将使用默认值）")
    else:
        print(f"  组件标题: {comp_config['title']}")
    display_name = get_component_display_name(c_type, comp_config["config"])
    mode_str = get_mode_from_config(c_type, comp_config["config"])
    ref_text = get_ref_text_from_config(c_type, comp_config["config"])
    print(f"  组件名称: {display_name}")
    if mode_str:
        print(f"  mode: {mode_str}")
    if ref_text:
        preview = ref_text[:80] + "..." if len(ref_text) > 80 else ref_text
        print(f"  参考文本: {preview}")

    # ===== 2. Play records from PG =====
    print("\n[2/3] 查询PG分表数据...")
    pg_conn = get_pg_conn()
    try:
        df = query_pg_data(pg_conn, c_type, c_id, start_time, end_time)
    finally:
        pg_conn.close()
    if df.empty:
        print("\n  未找到任何记录！请检查 c_type/c_id 是否正确，或扩大时间范围。")
        sys.exit(1)
    print(f"  总计: {len(df)} 条记录")

    # ===== 3. Post-process and export =====
    print("\n[3/3] 处理数据并导出...")
    # Derive the audio columns from the raw behaviour payload.
    audio_data = df["user_behavior_info"].apply(extract_audio_from_behavior)
    df["发音评分"] = audio_data.apply(lambda x: x["发音评分"])
    df["音频URL"] = audio_data.apply(lambda x: x["音频URL"])
    df["朗读内容"] = audio_data.apply(lambda x: x["朗读内容"])
    # NOTE(review): if the driver returns this column already decoded
    # (dict/list), the Excel writer cannot store it as a cell value; write it
    # back as JSON text. Plain strings pass through unchanged.
    df["user_behavior_info"] = df["user_behavior_info"].apply(_behavior_to_text)

    # Columns that are constant for the whole export.
    df["组件名称"] = display_name
    df["组件标题"] = comp_config["title"]
    df["mode"] = mode_str
    df["参考文本"] = ref_text

    # Drop the time zone for Excel compatibility. The .dt accessor only exists
    # on datetime columns, so leave anything else untouched.
    if "updated_at" in df.columns and pd.api.types.is_datetime64_any_dtype(df["updated_at"]):
        df["updated_at"] = df["updated_at"].dt.tz_localize(None)

    # Make sure every output column exists, then fix the column order.
    for col in OUTPUT_COLUMNS:
        if col not in df.columns:
            df[col] = ""
    df = df[OUTPUT_COLUMNS]

    # Write the Excel file. The name parts come from user input, so path
    # separators are replaced to keep the file inside the output directory.
    os.makedirs(args.output_dir, exist_ok=True)
    date_str = datetime.now().strftime("%Y%m%d")
    safe_name = display_name.replace("-", "_").replace("/", "_").replace("\\", "_")
    safe_id = c_id.replace("/", "_").replace("\\", "_")
    filename = f"组件_{safe_name}_{safe_id}_导出时间_{date_str}.xlsx"
    output_path = os.path.join(args.output_dir, filename)
    df.to_excel(output_path, index=False, engine="openpyxl")

    # Summary.
    result_stats = df["play_result"].value_counts().to_dict()
    time_min = df["updated_at"].min()
    time_max = df["updated_at"].max()
    print(f"\n{'=' * 60}")
    print("导出完成！")
    print(f"文件: {output_path}")
    print(f"记录数: {len(df)}")
    print(f"用户数: {df['user_id'].nunique()}")
    print(f"时间范围: {time_min} ~ {time_max}")
    print(f"判定分布: {result_stats}")
    print(f"{'=' * 60}")
 if __name__ == "__main__":
    main()
--- a/scripts/export_component_records.sh
+++ b/scripts/export_component_records.sh
@ -0,0 +1,44 @@
 #!/bin/bash
 # 按组件维度导出学习记录 wrapper 脚本
 # 用法:
 #   ./scripts/export_component_records.sh --c-type mid_sentence_dialogue --c-id 1112315
 #   ./scripts/export_component_records.sh --c-type mid_sentence_dialogue --c-id 1112315 --start-date 20260101
 #   ./scripts/export_component_records.sh --c-type mid_sentence_dialogue --c-id 1112315 --start-date 20260101 --end-date 20260604
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 WORKSPACE="$(dirname "$SCRIPT_DIR")"
 # Database connection settings. Every value can be overridden from the
 # environment; the values below are only fallbacks.
 # SECURITY NOTE(review): the fallback passwords are committed in plain text.
 # They should be rotated and supplied through the environment or a secret
 # store instead; they are kept here only so existing callers keep working.
 export MYSQL_HOST="${MYSQL_HOST:-bj-cdb-8frbdwju.sql.tencentcdb.com}"
 export MYSQL_USERNAME="${MYSQL_USERNAME:-read_only}"
 # The passwords contain "$" and "*", so the fallback is assigned in single
 # quotes rather than inside a double-quoted ${VAR:-default} expansion.
 if [[ -z "${MYSQL_PASSWORD:-}" ]]; then
    MYSQL_PASSWORD='fdsfiidier^$*hjfdijjd232'
 fi
 export MYSQL_PASSWORD
 export MYSQL_PORT="${MYSQL_PORT:-25413}"
 export PG_DB_HOST="${PG_DB_HOST:-bj-postgres-16pob4sg.sql.tencentcdb.com}"
 export PG_DB_PORT="${PG_DB_PORT:-28591}"
 export PG_DB_USER="${PG_DB_USER:-ai_member}"
 if [[ -z "${PG_DB_PASSWORD:-}" ]]; then
    PG_DB_PASSWORD='LdfjdjL83h3h3^$&**YGG*'
 fi
 export PG_DB_PASSWORD
 export PG_DB_DATABASE="${PG_DB_DATABASE:-vala}"
 # Default output directory
 DEFAULT_OUTPUT="${WORKSPACE}/output"
 mkdir -p "$DEFAULT_OUTPUT"
 # Add the default --output-dir unless the caller passed one, in either the
 # "--output-dir DIR" or the "--output-dir=DIR" form.
 HAS_OUTPUT=false
 for arg in "$@"; do
    case "$arg" in
        --output-dir|--output-dir=*)
            HAS_OUTPUT=true
            break
            ;;
    esac
 done
 PYTHON_SCRIPT="${SCRIPT_DIR}/export_component_records.py"
 if [ "$HAS_OUTPUT" = false ]; then
    exec python3 "$PYTHON_SCRIPT" --output-dir "$DEFAULT_OUTPUT" "$@"
 else
    exec python3 "$PYTHON_SCRIPT" "$@"
 fi
--- a/skills/export-user-data/SKILL.md
+++ b/skills/export-user-data/SKILL.md
@ -1,26 +1,31 @@
 ---
 name: export-user-data
 description: |
-  导出一个或多个角色的全量行课数据（音频、互动组件、课程巩固、单元挑战、单元总结、汇总统计）。
+  用户数据导出技能，支持两种模式：
-  支持角色ID或账户ID批量导出，输出多 sheet Excel 文件。
+  1. 按用户维度：导出一个或多个角色的全量行课数据（音频、互动组件、课程巩固、单元挑战、单元总结、汇总统计）
-  触发词：导出角色/行课数据、导出用户数据、export user data。
+  2. 按组件维度：导出指定组件的所有用户学习记录（用户ID、音频、判定结果等），按时间倒序
  触发词：导出角色/行课数据、导出用户数据、导出组件记录、export user data。
 ---
-# 导出角色行课数据
+# 导出用户数据
 ## 前置条件
- wrapper 脚本已配置数据库凭证（`scripts/export_user_data.sh`）
+- wrapper 脚本已配置数据库凭证
- 依赖：`python3`, `pymysql`, `psycopg2`, `pandas`, `requests`
+- 依赖：`python3`, `pymysql`, `psycopg2`, `pandas`, `openpyxl`
-## 执行方式
+---
 ## 模式一：按用户维度导出（全量行课数据）
 ### 执行方式
 ```bash
 cd /root/.openclaw/workspace-xiaoban
 ./scripts/export_user_data.sh <参数>
 ```
-## 参数
+### 参数
 | 参数 | 说明 | 示例 |
 |------|------|------|
@ -31,13 +36,13 @@ cd /root/.openclaw/workspace-xiaoban
 三种模式互斥，只能传一种 ID 参数。
-## 输出
+### 输出
 - 默认输出到 `output/` 目录
 - 单角色：`角色id_{ID}_导出时间_{YYYYMMDD}.xlsx`
 - 账户模式：`账户id_{AID}_角色id_{UID}_导出时间_{YYYYMMDD}.xlsx`
-## Excel 包含的 Sheet
+### Excel 包含的 Sheet
 1. 全部音频数据（ES `user-audio` 索引）
 2. 互动组件学习记录（PG 8张分表 + MySQL 组件配置）
@ -46,6 +51,116 @@ cd /root/.openclaw/workspace-xiaoban
 5. 单元总结记录（PG）
 6. 汇总统计（组件类型统计 / 知识点统计 / 单元时长统计）
 ---
 ## 模式二：按组件维度导出（组件学习记录）
 ### 执行方式
 ```bash
 cd /root/.openclaw/workspace-xiaoban
 ./scripts/export_component_records.sh --c-type <组件类型> --c-id <组件ID> [可选参数]
 ```
 ### 参数
 | 参数 | 说明 | 示例 |
 |------|------|------|
 | `--c-type <type>` | **必填** 组件类型 | `mid_sentence_dialogue` |
 | `--c-id <id>` | **必填** 组件ID | `1112315` |
 | `--start-date <YYYYMMDD>` | 可选，起始日期 | `--start-date 20260101` |
 | `--end-date <YYYYMMDD>` | 可选，截止日期 | `--end-date 20260604` |
 | `--output-dir <dir>` | 输出目录（默认 `output/`） | `--output-dir /tmp/` |
 ### 支持的组件类型（c_type）
 > 覆盖 MySQL `middle_interaction_component` 和 `core_interaction_component` 中所有实际 c_type。
 **中互动组件（mid_*）：**
 | c_type | 组件名称 |
 |--------|---------|
 | `mid_dialog_choose` | 对话选择 |
 | `mid_dialog_express` | 对话表达 |
 | `mid_dialog_fillin` | 对话填空 |
 | `mid_dialog_repeat` | 对话跟读 |
 | `mid_dialog_select` | 对话选择 |
 | `mid_dialog_sentence` | 对话组句 |
 | `mid_image_choose` | 图片选择 |
 | `mid_image_drag` | 图片拖拽 |
 | `mid_image_multiple` | 图片多选 |
 | `mid_image_sequence` | 图片排序 |
 | `mid_message_combine` | 消息组合 |
 | `mid_message_fillin` | 消息填空 |
 | `mid_message_sentence` | 消息组句 |
 | `mid_message_spell` | 消息拼写 |
 | `mid_message_trace` | 消息描红 |
 | `mid_message_word` | 消息选词 |
 | `mid_grammar_cloze` | 语法挖空 |
 | `mid_grammar_sentence` | 语法组句 |
 | `mid_pron_pron` | 发音互动 |
 | `mid_sentence_dialogue` | 句子对话 |
 | `mid_sentence_makeSentence` | 句子造句 |
 | `mid_sentence_material` | 句子材料 |
 | `mid_sentence_voice` | 句子语音 |
 | `mid_vocab_fillBlank` | 词汇填空 |
 | `mid_vocab_image` | 词汇图片 |
 | `mid_vocab_instruction` | 词汇指令 |
 | `mid_vocab_item` | 词汇物品 |
 **核心互动组件（core_*）：**
 | c_type | 组件名称 |
 |--------|---------|
 | `core_listening_choose` | 听力选择 |
 | `core_listening_drag` | 听力拖拽 |
 | `core_listening_order` | 听力排序 |
 | `core_reading_imageDrag` | 阅读图片拖拽 |
 | `core_reading_order` | 阅读排序 |
 | `core_speaking_explore` | 口语探讨 |
 | `core_speaking_image` | 口语图片 |
 | `core_speaking_inquiry` | 口语妙问 |
 | `core_speaking_monologue` | 口语独白 |
 | `core_speaking_reply` | 口语快答 |
 | `core_writing_imgMakeSentence` | 写作看图组句 |
 | `core_writing_imgMakeWord` | 写作看图组词 |
 | `core_writing_imgWrite` | 写作看图撰写 |
 | `core_writing_questionMakeSentence` | 写作问题组句 |
 | `core_writing_questionWrite` | 写作问题撰写 |
 ### 输出
 - 默认输出到 `output/` 目录
 - 文件名：`组件_{组件名称}_{c_id}_导出时间_{YYYYMMDD}.xlsx`
 - 示例：`组件_句子对话_1112315_导出时间_20260604.xlsx`
 ### Excel 包含的字段
 | 字段 | 说明 | 来源 |
 |------|------|------|
 | `user_id` | 用户角色ID | PG |
 | `session_id` | 会话ID | PG |
 | `component_unique_code` | 组件唯一标识 | PG |
 | `c_type` | 组件类型编码 | PG |
 | `c_id` | 组件ID | PG |
 | `组件名称` | 中文组件名称（由 c_type 按脚本内置映射表得到；未收录的 c_type 原样输出） | 脚本内置映射 |
 | `组件标题` | 组件配置标题 | MySQL |
 | `mode` | 模式（取配置中的 `question.mode`，为空时回退到 `question.desc`；二者皆无则为空） | MySQL component_config |
 | `参考文本` | 组件配置中的参考文本 | MySQL component_config |
 | `play_result` | 判定结果（Perfect/Good/Pass/Oops/Failed） | PG |
 | `发音评分` | 发音评分 | PG user_behavior_info |
 | `音频URL` | 用户录音文件地址 | PG user_behavior_info |
 | `朗读内容` | 用户实际朗读内容 | PG user_behavior_info |
 | `user_behavior_info` | 原始用户行为数据 JSON | PG |
 | `updated_at` | 更新时间（倒序排列） | PG |
 ### 数据来源
 - **PG** `user_component_play_record_0~7`（8张分表）：主数据源，按 c_type + c_id 过滤
 - **MySQL** `middle_interaction_component` / `core_interaction_component`：组件配置（标题、mode、参考文本）
 ---
 ## 完成后
 如果是从飞书对话触发的导出，导出完成后通过 `lark-send-message-as-bot` 技能将文件发送给请求用户。