diff --git a/business_knowledge/user_export_skill.md b/business_knowledge/user_export_skill.md new file mode 100644 index 0000000..12506fa --- /dev/null +++ b/business_knowledge/user_export_skill.md @@ -0,0 +1,70 @@ +# 用户学习行为数据导出技能 + +## 功能说明 +可以导出指定账户ID或角色ID的完整学习行为数据,输出为Excel文件,包含多个sheet。 + +## 导出内容说明 +Excel包含以下sheet: +1. **全部音频数据**:用户的所有语音交互数据,包含音频地址、ASR结果等 +2. **互动组件学习记录**:所有组件互动记录,包含组件类型、名称、知识点、互动结果等 +3. **课程巩固记录**:课程课后巩固的做题记录 +4. **单元挑战记录**:单元挑战的答题记录 +5. **单元总结记录**:单元总结的学习记录 +6. **汇总统计**:自动统计的组件通过率、知识点掌握情况、单元学习时长等 + +## 使用方法 +### 1. 导出单个角色ID +修改脚本变量: +```python +USER_ID = "角色ID" +USER_ID_LIST = None +ACCOUNT_ID_LIST = None +``` + +### 2. 导出单个/多个账户ID +修改脚本变量: +```python +USER_ID = None +USER_ID_LIST = None +ACCOUNT_ID_LIST = [账户ID1, 账户ID2, ...] +``` +脚本会自动查询账户对应的所有角色ID并分别导出。 + +## 依赖环境 +需要配置以下环境变量(密码等敏感信息请写入本地 .env 文件或密钥管理系统,切勿提交到代码仓库): +``` +# ES 配置 +ES_HOST=es-7vd7jcu9.public.tencentelasticsearch.com +ES_PORT=9200 +ES_SCHEME=https +ES_USER=elastic +ES_PASSWORD=<your_es_password> + +# PG 配置 +PG_DB_HOST=bj-postgres-16pob4sg.sql.tencentcdb.com +PG_DB_PORT=28591 +PG_DB_USER=ai_member +PG_DB_PASSWORD=<your_pg_password> +PG_DB_DATABASE=vala + +# MySQL 配置 +MYSQL_HOST=bj-cdb-8frbdwju.sql.tencentcdb.com +MYSQL_USERNAME=read_only +MYSQL_PASSWORD=<your_mysql_password> +MYSQL_PORT=25413 + +# MySQL Online 配置 +MYSQL_HOST_online=bj-cdb-dh2fkqa0.sql.tencentcdb.com +MYSQL_USERNAME_online=read_only +MYSQL_PASSWORD_online=<your_mysql_online_password> +MYSQL_PORT_online=27751 +``` + +## 常见问题排查 +1. **事务异常错误**:一般是前面某个查询失败导致,检查是否有权限、表是否存在 +2. **权限不足**:检查数据库账号的表权限,需要有各分表的SELECT权限 +3. **0条记录**:对应角色没有学习数据,属于正常情况 + +## 导出示例 +- 账户ID 9343(角色12699):导出199条学习记录 +- 角色ID 14607:导出855条完整学习记录,所有sheet都有数据 diff --git a/export_14607.py b/export_14607.py new file mode 100644 index 0000000..aa86b51 --- /dev/null +++ b/export_14607.py @@ -0,0 +1,1846 @@ +""" +初版需求v1.0: 2025.11.18 + +导出 一个userId的多表数据, 最终按照不同sheet,输出到一个 excel文件中。 + +1. 
第一个sheet:"全部音频数据" +es相关配置通过以下环境变量 +ES_HOST=xxx +ES_PORT=9200 +ES_SCHEME=https +ES_USER=elastic +ES_PASSWORD=xxx + +index: user-audio + +脚本思路: +过滤字段: +userId == xxxx + +输出该userId的全部记录 按时间倒序排序 +包含以下字段内容: + +userId +userMsg +userName +soeData +audioUrl +asrStatus +componentId +componentType +dataVersion + +2. 第二个sheet:"互动组件学习记录" +在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 +数据库相关配置 从.env中读取: +PG_DB_HOST = xxx +PG_DB_PORT = xxx +PG_DB_USER = xxx +PG_DB_PASSWORD = xxx +PG_DB_DATABASE = xxx + +读取以下数据表: +user_component_play_record_0 ~ user_component_play_record_7 + +输出以下字段: +user_id, +component_unique_code, +session_id, +c_type, +c_id, +play_result, +user_behavior_info, +updated_at + +3.第三个sheet:"课程巩固记录" +在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 + +数据表:user_unit_review_question_result + +输出以下字段: +user_id +story_id +chapter_id +question_list +updated_at + +4.第四个sheet:"单元挑战记录" +在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 + +数据表:user_unit_challenge_question_result + +输出以下字段: +user_id +story_id +category +score_text, +question_list +updated_at +------------ + +需求补充v1.1: +"全部音频数据"这个sheet +输出字段 添加timeStr 并按时间倒序排列 最新的记录 在最上面 + +------------ +需求补充v1.2: +"全部音频数据"这个sheet +如果userMsg字段内容 包含 ”makee_id“ 要进行以下处理: + +从userMsg字段中提取出具体的makee_id: +此时的字段样例: +``` +asr msg信息为:{ + "time_ms": 358, + "time_ms_api": 357, + "hot_words_str": "{\n \"context_type\": \"dialog_ctx\",\n \"context_data\": [\n {\n \"text\": \"planet Walla\"\n },\n {\n \"text\": \"Walla\"\n }\n ]\n}", + "makee_id": "d208c617-902f-4f81-8255-b5fb73599546", + "volcano_fast_x_tt_logid": "202511151541355DF72BE5EBFE73795BFD", + "api_name": "volcano-fast" +} +``` +然后基于makee_id 去另一个表里查记录: index:llm_asr_log +将查询到的记录的 result_text 字段内容 回填到 userMsg。 +将source字段内容 输出 到 source。 + +如果userMsg字段内容 不包含 ”makee_id“ 保持之前的逻辑。 + +-------------- +需求补充 v1.3 +当前输入 只支持配置单个 userId (业务侧名称为角色id) + + +期望扩展为以下逻辑: +1. 改为配置 角色id list , 分别 导出 多份excel文件。命名格式为 角色id_{}_导出时间_{}.xlsx +2. 
改为配置 账户id list , 分别 导出 多份excel文件。命名格式为 账户id_{}_角色id_{}_导出时间_{}.xlsx + +关于 账户 id 到角色id 的映射逻辑, +首先 读取 mysql 表 vala_app_character +筛选 account_id字段值 == 账户id 的 记录, 其中 该记录 的 id值,则为角色id 一个 账户id 可以对应多个角色id + +本次需求只针对输入侧调整, 数据抽取聚合逻辑部分和之前保持一致 + +--------------- +需求补充 v1.4 + +增加一个sheet "单元总结记录", +导出对应角色id的单元总结记录。 参考 export_unit_summary.py 中的原始数据提取方案即可(不必关注其中的数据统计部分)。 + +其他已有逻辑保持不动哦。 + +---------------- +需求补充 v1.5 + +1."互动组件学习记录"sheet 增加以下字段 +"互动组件名称"、"组件标题"、"组件配置摘要"、"知识点": +字段取值规则: +根据 c_type 及组件配置(从mysql表获取) 进行映射和处理: +``` +1).如果 c_type 开头为"mid" + +则读取下表:表名:middle_interaction_component + +获取以下字段值: +title (作为组件标题) +component_config (完整的组件配置) 获取其中 的 question 字段值 作为 组件配置摘要; +kp_relation_info 字段值 作为 知识点 + +"互动组件名称"规则: + +"物品互动": "mid_vocab_item", +"图片互动": "mid_vocab_image", +"填词互动": "mid_vocab_fillBlank", +"指令互动": "mid_vocab_instruction" +"对话互动-表达": "mid_sentence_dialogue", 且 component_config->question->mode == "express" +"对话互动-朗读": "mid_sentence_dialogue", 且 component_config->question->mode == "read" +"语音互动": "mid_sentence_voice", +"材料互动": "mid_sentence_material", +"造句互动": "mid_sentence_makeSentence" +"挖空互动": "mid_grammar_cloze", +"组句互动": "mid_grammar_sentence" +"发音互动": "mid_pron_pron" + + +2). 如果 c_type 开头为"core" +则读取下表:表名:core_interaction_component + +获取以下字段值: +title (作为组件标题) +component_config (完整的组件配置) 获取其中 的 taskInfo 字段值 作为 组件配置摘要 +kp_relation_info 字段值 作为 知识点 + +"互动组件名称"规则: +"口语快答": "core_speaking_reply", +"口语妙问": "core_speaking_inquiry", +"口语探讨": "core_speaking_explore", +"口语独白": "core_speaking_monologue" +"合作阅读": "core_reading_order", +"合作听力": "core_listening_order", +"看图组句": "core_writing_imgMakeSentence", +"看图撰写": "core_writing_imgWrite", +"问题组句": "core_writing_questionMakeSentence", +"问题撰写": "core_writing_questionWrite", +``` + +2."课程巩固记录" sheet 增加以下字段 +"正确率": 参考 export_lesson_review.py 中的计算逻辑 + +3. 新增一个"汇总统计"sheet +统计并展示以下内容 请以 可读性 比较好的方式排列、展示 + +a. 
"所有互动-按互动组件类型-通过情况统计" +以每种"互动组件名称"进行聚合 +统计play_result的取值分布情况,算以下指标: +总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 + +b. "中互动组件-按知识点-通过情况统计" +以每个知识点进行聚合 + +其中 知识点配置格式如下: +``` +[{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_meaning","kpSkillName":"语义"}] +``` +一个组件可以绑定多个知识点,以每个知识点的 kpId + kpType + kpTitle 进行 展示及聚合 + +对所有绑定了某个知识点的中互动组件(c_type以mid开头) +统计play_result的取值分布情况,算以下指标: +总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 + +c. 
"单元总结-按单元统计时长" + +将"单元总结记录"中的"play_time_seconds"字段值 以每个单元id 进行聚合 进行 累加 统计,并增加一列 转换为分钟为单位 取整数 + + +""" +# ==== 可直接修改的脚本变量(不使用命令行传参) ==== +# 三种模式互斥,只能配置一个: +# 模式1:单个角色id +USER_ID = "14607" # 单个角色ID,示例:2911 + +# 模式2:角色id列表(多个角色id批量导出) +USER_ID_LIST = None # 角色ID列表,示例:[2911, 2912, 2913] + +# 模式3:账户id列表(通过账户id查询对应的角色id后批量导出) +ACCOUNT_ID_LIST = None + +OUTPUT_DIR = "output/" # 输出目录,默认为output文件夹 +# ==== 变量结束 ==== +import os +import json +import re +from typing import Any, Dict, List, Optional + +import datetime + +try: + import requests +except Exception: + requests = None + +try: + import psycopg2 + from psycopg2.extras import RealDictCursor +except Exception: + psycopg2 = None + RealDictCursor = None + +try: + import pymysql + import pymysql.cursors +except Exception: + pymysql = None + +try: + import pandas as pd +except Exception: + pd = None + +try: + import urllib3 +except Exception: + urllib3 = None + + +SHEET1_COLUMNS = [ + "userId", + "userMsg", + "source", + "userName", + "soeData", + "audioUrl", + "asrStatus", + "componentId", + "componentType", + "dataVersion", + "timeStr", +] + +SHEET2_COLUMNS = [ + "user_id", + "component_unique_code", + "session_id", + "c_type", + "c_id", + "互动组件名称", + "组件标题", + "组件配置摘要", + "知识点", + "play_result", + "user_behavior_info", + "updated_at", +] + +SHEET3_COLUMNS = [ + "user_id", + "unit_id", + "lesson_id", + "question_list", + "正确率", + "updated_at", +] + +SHEET4_COLUMNS = [ + "user_id", + "unit_id", + "category", + "score_text", + "question_list", + "updated_at", +] + +SHEET5_COLUMNS = [ + "id", + "user_id", + "unit_id", + "updated_at", + "km_id", + "km_type", + "play_time_seconds", +] + + +def _load_env_file(path: str) -> None: + if not os.path.exists(path): + return + try: + with open(path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + if "=" not in line: + continue + k, v = line.split("=", 1) + k = k.strip() + v = v.strip().strip('"').strip("'") + if k 
def parse_time(value: Any) -> Optional[datetime.datetime]:
    """Convert a timestamp-like value into a naive ``datetime`` (best effort).

    Supported inputs:
      * ``datetime.datetime`` - returned with any tzinfo dropped;
      * ``int`` / ``float`` - epoch seconds; values above 1e11 are treated as
        epoch milliseconds. Converted with ``fromtimestamp`` (local time);
      * ``str`` - matched against a fixed list of formats, then
        ``datetime.fromisoformat`` as a last resort.

    Every result is timezone-naive. Offsets are dropped (wall-clock value
    kept), the same way this file strips tzinfo from PostgreSQL timestamps,
    so results can always be compared with each other and with
    ``datetime.datetime.min`` without raising ``TypeError``.

    Args:
        value: The raw value read from an ES document or a database row.

    Returns:
        The parsed naive datetime, or ``None`` when the value is ``None``,
        of an unsupported type, or not parseable.
    """

    def _naive(dt: datetime.datetime) -> datetime.datetime:
        # Keep the wall-clock reading, drop the offset.
        return dt.replace(tzinfo=None) if dt.tzinfo is not None else dt

    if value is None:
        return None

    # Database drivers hand back datetime objects; previously these fell
    # through to ``None`` which turned every sort on them into a no-op.
    if isinstance(value, datetime.datetime):
        return _naive(value)

    if isinstance(value, (int, float)):
        try:
            seconds = float(value)
            # Millisecond-precision epoch values are far above 1e11.
            if seconds > 1e11:
                seconds = seconds / 1000.0
            return datetime.datetime.fromtimestamp(seconds)
        except (OverflowError, OSError, ValueError):
            return None

    if isinstance(value, str):
        known_formats = (
            "%Y-%m-%dT%H:%M:%S.%fZ",
            "%Y-%m-%dT%H:%M:%S.%f%z",
            "%Y-%m-%dT%H:%M:%S%z",
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%d",
        )
        for fmt in known_formats:
            try:
                return _naive(datetime.datetime.strptime(value, fmt))
            except ValueError:
                continue
        try:
            return _naive(datetime.datetime.fromisoformat(value))
        except ValueError:
            return None

    return None
+ pass + # 2) 尝试截取大括号中的JSON + try: + start = user_msg.find("{") + end = user_msg.rfind("}") + if start != -1 and end != -1 and end > start: + candidate = user_msg[start : end + 1] + obj = json.loads(candidate) + mk = obj.get("makee_id") + if isinstance(mk, str) and mk: + return mk + except Exception: + pass + # 3) 正则匹配 makee_id + m = re.search(r"\bmakee_id\b\s*:\s*\"([^\"]+)\"", user_msg) + if m: + return m.group(1) + return None + + +def fetch_es_asr_log(makee_id: str, es_cfg: Dict[str, Any]) -> Optional[Dict[str, Any]]: + if requests is None: + raise RuntimeError("缺少requests依赖,请安装后再运行。") + host = es_cfg.get("host") + port = es_cfg.get("port") + scheme = es_cfg.get("scheme", "http") + user = es_cfg.get("user") + password = es_cfg.get("password") + index = "llm_asr_log" + if not host: + return None + base = f"{scheme}://{host}:{port}" + url = f"{base}/{index}/_search" + headers = {"Content-Type": "application/json"} + body = { + "query": { + "bool": { + "should": [ + {"term": {"makee_id": {"value": str(makee_id)}}}, + {"term": {"makee_id.keyword": {"value": str(makee_id)}}}, + ], + "minimum_should_match": 1, + } + }, + "size": 10, + "_source": [ + "makee_id", + "result_text", + "source", + "updated_at", + "created_at", + "@timestamp", + "timestamp", + "updatedAt", + "createdAt", + "time", + "ts", + "timeStr", + "update_time", + "create_time", + ], + } + auth = (user, password) if user and password else None + try: + if scheme == "https" and urllib3 is not None: + try: + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + except Exception: + pass + resp = requests.post(url, headers=headers, json=body, auth=auth, timeout=20, verify=False if scheme == "https" else True) + resp.raise_for_status() + data = resp.json() + except Exception: + return None + hits = data.get("hits", {}).get("hits", []) + if not hits: + return None + # 选最新的 + chosen = None + best_t = None + for h in hits: + src = h.get("_source", {}) or {} + t = pick_time(src) + if t is None: 
def fetch_es_user_audio(user_id: str, es_cfg: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Fetch every ``user-audio`` document of one role id, newest first.

    For documents whose ``userMsg`` carries a ``makee_id`` the matching
    ``llm_asr_log`` record is looked up (once per distinct id) and its
    ``result_text`` / ``source`` are written into the row.

    Args:
        user_id: Role id to filter ``userId`` on.
        es_cfg: Connection settings with the keys ``host``, ``port``,
            ``scheme``, ``user``, ``password`` and optionally ``index``.

    Returns:
        One dict per document holding the sheet-1 columns plus the helper
        key ``_time`` (ISO string or ``None``). Empty when no host is
        configured or nothing matches.

    Raises:
        RuntimeError: ``requests`` is not installed, or the ES request failed.
    """
    if requests is None:
        raise RuntimeError("缺少requests依赖,请安装后再运行。")

    print(f" [ES] 开始查询user-audio索引...")
    start_time = datetime.datetime.now()

    host = es_cfg.get("host")
    port = es_cfg.get("port")
    scheme = es_cfg.get("scheme", "http")
    user = es_cfg.get("user")
    password = es_cfg.get("password")
    index = es_cfg.get("index", "user-audio")

    if not host:
        return []

    # A single search request returns at most this many hits.
    max_hits = 10000

    url = f"{scheme}://{host}:{port}/{index}/_search"
    headers = {"Content-Type": "application/json"}
    body = {
        "query": {
            "bool": {
                # userId may be mapped as keyword or as text+keyword sub-field.
                "should": [
                    {"term": {"userId": {"value": str(user_id)}}},
                    {"term": {"userId.keyword": {"value": str(user_id)}}},
                ],
                "minimum_should_match": 1,
            }
        },
        "size": max_hits,
        "_source": [
            "userId",
            "userMsg",
            "userName",
            "soeData",
            "audioUrl",
            "asrStatus",
            "componentId",
            "componentType",
            "dataVersion",
            "updated_at",
            "created_at",
            "@timestamp",
            "timestamp",
            "updatedAt",
            "createdAt",
            "time",
            "ts",
            "timeStr",
            "update_time",
            "create_time",
        ],
    }

    auth = (user, password) if user and password else None

    try:
        # Silence the warning caused by skipping certificate verification.
        if scheme == "https" and urllib3 is not None:
            try:
                urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
            except Exception:
                pass
        # NOTE(review): TLS certificate verification is disabled for https
        # (self-signed cluster certificate). Prefer passing the CA bundle via
        # ``verify=<path>`` once one is available.
        resp = requests.post(
            url,
            headers=headers,
            json=body,
            auth=auth,
            timeout=30,
            verify=False if scheme == "https" else True,
        )
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        raise RuntimeError(f"ES查询失败: {e}")

    hits = data.get("hits", {}).get("hits", [])
    print(f" [ES] 查询完成,获得{len(hits)}条记录,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")

    if not hits:
        return []

    if len(hits) >= max_hits:
        # Make silent truncation visible instead of exporting partial data.
        print(f" [ES] [警告] 返回记录数已达到size上限({max_hits}),结果可能被截断")

    print(f" [ES] 开始处理音频数据...")
    process_start = datetime.datetime.now()

    rows: List[Dict[str, Any]] = []
    # makee_id -> ASR document, or None when the lookup found nothing/failed.
    # Misses are cached too so the same id is never queried twice.
    asr_cache: Dict[str, Optional[Dict[str, Any]]] = {}
    makee_id_count = 0

    for idx, h in enumerate(hits, 1):
        # Progress line every 100 documents and on the last one.
        if idx % 100 == 0 or idx == len(hits):
            print(f" [ES] 处理进度: {idx}/{len(hits)} ({idx*100//len(hits)}%)")

        src = h.get("_source", {}) or {}
        row = {
            "userId": src.get("userId"),
            "userMsg": src.get("userMsg"),
            "source": None,
            "userName": src.get("userName"),
            "soeData": to_json_str(src.get("soeData")),
            "audioUrl": src.get("audioUrl"),
            "asrStatus": src.get("asrStatus"),
            "componentId": src.get("componentId"),
            "componentType": src.get("componentType"),
            "dataVersion": src.get("dataVersion"),
        }
        t = pick_time(src)
        row["_time"] = t.isoformat() if t else None
        row["timeStr"] = t.strftime("%Y-%m-%d %H:%M:%S") if t else None

        # v1.2: back-fill userMsg/source from llm_asr_log when a makee_id is present.
        mk = extract_makee_id_from_user_msg(row.get("userMsg"))
        if mk:
            makee_id_count += 1
            if mk not in asr_cache:
                asr_cache[mk] = fetch_es_asr_log(mk, es_cfg)
            asr_doc = asr_cache[mk]
            if asr_doc is not None:
                rt = asr_doc.get("result_text")
                if rt:
                    row["userMsg"] = rt
                row["source"] = to_json_str(asr_doc.get("source"))
        rows.append(row)

    print(f" [ES] 数据处理完成,发现{makee_id_count}条包含makee_id的记录,耗时{(datetime.datetime.now() - process_start).total_seconds():.2f}秒")

    def _sort_key(item: Dict[str, Any]) -> datetime.datetime:
        # Timestamps with an offset parse to aware datetimes; comparing those
        # with naive ones (or datetime.min) raises TypeError, so normalise.
        parsed = parse_time(item.get("_time"))
        if parsed is None:
            return datetime.datetime.min
        if parsed.tzinfo is not None:
            return parsed.replace(tzinfo=None)
        return parsed

    print(f" [ES] 开始排序...")
    rows.sort(key=_sort_key, reverse=True)
    print(f" [ES] 音频数据处理完成,总耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")

    return rows
def get_mysql_conn(database: str) -> Any:
    """Open a MySQL connection to ``database``.

    The ``vala_user`` database is reached through the "online" settings
    (environment variables suffixed with ``_online``); every other database,
    e.g. ``vala_test``, uses the unsuffixed variables.

    Args:
        database: Name of the database to connect to.

    Returns:
        A pymysql connection that yields rows as dicts.

    Raises:
        RuntimeError: pymysql is not installed or the host variable is unset.
    """
    if pymysql is None:
        raise RuntimeError("缺少pymysql依赖,请安装后再运行。")

    # Only the variable-name suffix differs between the two configurations.
    env_suffix = "_online" if database == "vala_user" else ""

    host = os.getenv(f"MYSQL_HOST{env_suffix}")
    port = int(os.getenv(f"MYSQL_PORT{env_suffix}", "3306"))
    user = os.getenv(f"MYSQL_USERNAME{env_suffix}")
    password = os.getenv(f"MYSQL_PASSWORD{env_suffix}")
    if not host:
        raise RuntimeError(f"MySQL数据库环境变量未配置完整(缺少MYSQL_HOST{env_suffix})")

    return pymysql.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        database=database,
        charset="utf8mb4",
        cursorclass=pymysql.cursors.DictCursor,
    )
def get_chapter_id_to_lesson_id(conn: Any) -> Dict[int, int]:
    """Build the chapter_id -> lesson_id lookup from ``vala_game_chapter``.

    A chapter's ``index`` column is used as its lesson id; soft-deleted
    chapters are excluded.

    Args:
        conn: MySQL connection whose cursor returns rows as dicts.

    Returns:
        ``{chapter_id: lesson_id}``; an empty dict when the query fails
        (the error is printed, not raised).
    """
    query = """
        SELECT id, `index`
        FROM `vala_game_chapter`
        WHERE deleted_at IS NULL
    """
    try:
        with conn.cursor() as cursor:
            cursor.execute(query)
            chapters = cursor.fetchall() or []
            return {chapter["id"]: chapter["index"] for chapter in chapters}
    except Exception as e:
        print(f"[ERROR] 获取chapter_id到lesson_id映射失败: {e}")
        return {}
def _load_component_table(
    mysql_conn: Any,
    table: str,
    label: str,
    summary_key: str,
    c_ids: Any,
    config_map: Dict[str, Dict[str, Any]],
) -> None:
    """Load the configs of ``c_ids`` from one component table into ``config_map``.

    Args:
        mysql_conn: MySQL connection whose cursor returns rows as dicts.
        table: Table to read (a hard-coded name supplied by the caller).
        label: Human-readable table label used in the log lines.
        summary_key: Key of ``component_config`` exported as the summary.
        c_ids: Non-empty collection of component ids to look up.
        config_map: Mapping updated in place, keyed by ``"{c_type}_{c_id}"``.

    Failures are printed with a traceback and never raised.
    """
    try:
        with mysql_conn.cursor() as cur:
            placeholders = ",".join(["%s"] * len(c_ids))
            # Only the table name is interpolated (internal constant); the
            # ids are passed as bound parameters.
            sql = f"""
                SELECT c_id, c_type, title, component_config, kp_relation_info
                FROM {table}
                WHERE c_id IN ({placeholders}) AND deleted_at IS NULL
            """
            print(f" [MySQL] 执行{label}查询,查询条件: c_id IN ({len(c_ids)}个ID)")
            cur.execute(sql, tuple(c_ids))
            rows = cur.fetchall() or []
            print(f" [MySQL] 查询到{len(rows)}条{label}配置")

            if not rows:
                print(f" [MySQL] [警告] 查询结果为空!可能的原因:")
                print(f" [MySQL] - 数据库中没有匹配的c_id记录")
                print(f" [MySQL] - deleted_at字段不为NULL")
                print(f" [MySQL] - c_id不存在")

            for idx, row in enumerate(rows):
                c_type = row.get("c_type", "")
                c_id = row.get("c_id")
                key = f"{c_type}_{c_id}"
                is_sample = idx < 3  # log details for the first three rows only

                if is_sample:
                    print(f" [MySQL] [样例{idx+1}] id={c_id}, c_type={c_type}, key={key}")
                    # title may be NULL in the database; slicing None used to
                    # abort the whole table load.
                    print(f" [MySQL] [样例{idx+1}] title={(row.get('title') or '')[:50]}")

                component_config = row.get("component_config")
                if isinstance(component_config, str):
                    try:
                        component_config = json.loads(component_config)
                    except Exception as e:
                        print(f" [MySQL] [警告] 解析component_config失败 (id={c_id}): {e}")
                        component_config = {}

                summary = ""
                if isinstance(component_config, dict):
                    summary_value = component_config.get(summary_key)
                    summary = to_json_str(summary_value) if summary_value else ""
                    if is_sample and summary_value:
                        # str() guards against scalar (non-str) summaries.
                        print(f" [MySQL] [样例{idx+1}] 提取到{summary_key}字段,长度: {len(str(summary))}")

                kp_relation_info = row.get("kp_relation_info")
                if isinstance(kp_relation_info, str):
                    try:
                        kp_relation_info = json.loads(kp_relation_info)
                    except Exception:
                        kp_relation_info = []

                config_map[key] = {
                    "title": row.get("title", ""),
                    "component_config": component_config,
                    "summary": summary,
                    "kp_relation_info": to_json_str(kp_relation_info),
                }

            print(f" [MySQL] {label}配置已加入config_map,当前map大小: {len(config_map)}")
    except Exception as e:
        print(f" [MySQL] [错误] 查询{label}配置失败: {e}")
        import traceback
        traceback.print_exc()


def batch_fetch_component_configs(play_records: List[Dict[str, Any]], mysql_conn: Any) -> Dict[str, Dict[str, Any]]:
    """Look up the component configuration of every play record in two queries.

    Records whose ``c_type`` starts with ``mid`` are resolved against
    ``middle_interaction_component`` (summary = ``question``), those starting
    with ``core`` against ``core_interaction_component`` (summary =
    ``taskInfo``). Records without ``c_type`` or ``c_id`` are ignored.

    Args:
        play_records: Play records carrying ``c_type`` and ``c_id``.
        mysql_conn: MySQL connection whose cursor returns rows as dicts.

    Returns:
        ``{"{c_type}_{c_id}": {"title", "component_config", "summary",
        "kp_relation_info"}}``. A table whose query fails contributes no
        entries; the error is printed, not raised.
    """
    print(f" [MySQL] 开始批量查询组件配置...")
    start_time = datetime.datetime.now()

    # Distinct component ids per table.
    mid_c_ids = set()
    core_c_ids = set()
    for record in play_records:
        c_type = record.get("c_type", "")
        c_id = record.get("c_id")
        if not (c_type and c_id):
            continue
        if c_type.startswith("mid"):
            mid_c_ids.add(c_id)
        elif c_type.startswith("core"):
            core_c_ids.add(c_id)

    print(f" [MySQL] 需要查询中互动组件: {len(mid_c_ids)}个, 核心互动组件: {len(core_c_ids)}个")
    if mid_c_ids:
        print(f" [MySQL] 中互动组件ID列表(前10个): {sorted(mid_c_ids)[:10]}")
    if core_c_ids:
        print(f" [MySQL] 核心互动组件ID列表(前10个): {sorted(core_c_ids)[:10]}")

    config_map: Dict[str, Dict[str, Any]] = {}

    if mid_c_ids:
        _load_component_table(
            mysql_conn, "middle_interaction_component", "中互动组件", "question", mid_c_ids, config_map
        )
    if core_c_ids:
        _load_component_table(
            mysql_conn, "core_interaction_component", "核心互动组件", "taskInfo", core_c_ids, config_map
        )

    print(f" [MySQL] 组件配置查询完成,共{len(config_map)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")
    return config_map
def fetch_pg_play_records(user_id: str, conn: Any, mysql_conn: Any) -> List[Dict[str, Any]]:
    """Collect a role's component play records and enrich them with config data.

    Reads the eight shard tables ``user_component_play_record_0`` ..
    ``user_component_play_record_7``, merges the rows, sorts them by
    ``updated_at`` (newest first) and, when ``mysql_conn`` is given, adds the
    columns ``互动组件名称``, ``组件标题``, ``组件配置摘要`` and ``知识点``.

    Args:
        user_id: Role id to filter on.
        conn: PostgreSQL connection.
        mysql_conn: MySQL connection used for the component configuration
            lookup; a falsy value skips the enrichment.

    Returns:
        The merged, sorted and (optionally) enriched records. A shard whose
        query fails is skipped; the error is printed, not raised.
    """
    print(f" [PG] 开始查询互动组件学习记录(8张分表)...")
    start_time = datetime.datetime.now()

    tables = [f"user_component_play_record_{i}" for i in range(8)]
    rows: List[Dict[str, Any]] = []
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        for t in tables:
            try:
                cur.execute(
                    f"""
                    SELECT user_id, component_unique_code, session_id, c_type, c_id,
                           play_result, user_behavior_info, updated_at
                    FROM {t}
                    WHERE user_id = %s
                    ORDER BY updated_at DESC
                    """,
                    (user_id,),
                )
                part = cur.fetchall() or []
                if part:
                    print(f" [PG] 表{t}查到{len(part)}条记录")
                for r in part:
                    r = dict(r)
                    r["play_result"] = to_json_str(r.get("play_result"))
                    r["user_behavior_info"] = to_json_str(r.get("user_behavior_info"))
                    # Excel cannot store tz-aware datetimes; drop the offset.
                    upd = r.get("updated_at")
                    if isinstance(upd, datetime.datetime):
                        try:
                            if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None:
                                r["updated_at"] = upd.replace(tzinfo=None)
                        except Exception:
                            # Fall back to the string form.
                            r["updated_at"] = str(upd)
                    rows.append(r)
            except Exception as e:
                print(f" [PG] 表{t}查询失败: {e}")
                # A failed statement leaves the PostgreSQL transaction aborted;
                # without a rollback every following shard query would fail.
                try:
                    conn.rollback()
                except Exception as rollback_err:
                    print(f" [PG] 表{t}回滚失败: {rollback_err}")
                continue

    def _updated_at_key(rec: Dict[str, Any]) -> datetime.datetime:
        # updated_at is normally a datetime object here; parse_time is only
        # needed for the string fallback. The result is always naive so the
        # keys stay mutually comparable.
        raw = rec.get("updated_at")
        parsed = raw if isinstance(raw, datetime.datetime) else parse_time(raw)
        if parsed is None:
            return datetime.datetime.min
        if parsed.tzinfo is not None:
            return parsed.replace(tzinfo=None)
        return parsed

    # Each shard is already ordered; this orders the merged result globally.
    rows.sort(key=_updated_at_key, reverse=True)
    print(f" [PG] 互动组件学习记录查询完成,共{len(rows)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")

    # Enrich with component configuration.
    if rows and mysql_conn:
        config_map = batch_fetch_component_configs(rows, mysql_conn)

        print(f" [PG] 开始补充组件配置信息...")
        filled_count = 0
        empty_count = 0
        sample_keys = []
        sample_mode_check = []  # dialogue components: which mode was detected

        for r in rows:
            c_type = r.get("c_type", "")
            c_id = r.get("c_id")
            key = f"{c_type}_{c_id}" if c_type and c_id else ""

            config = config_map.get(key, {})
            component_config = config.get("component_config", {})

            component_name = get_component_name(c_type, component_config)
            r["互动组件名称"] = component_name
            r["组件标题"] = config.get("title", "")
            r["组件配置摘要"] = config.get("summary", "")
            r["知识点"] = config.get("kp_relation_info", "")

            if config:
                filled_count += 1
                if len(sample_keys) < 3:
                    sample_keys.append((key, component_name, r["组件标题"][:30] if r["组件标题"] else ""))

                if c_type == "mid_sentence_dialogue" and len(sample_mode_check) < 3:
                    mode = ""
                    if isinstance(component_config, dict):
                        question = component_config.get("question", {})
                        if isinstance(question, dict):
                            mode = question.get("mode", "")
                    sample_mode_check.append({
                        "key": key,
                        "mode": mode,
                        "component_name": component_name
                    })
            else:
                empty_count += 1
                if empty_count <= 5:  # report only the first five misses
                    print(f" [PG] [警告] 未找到组件配置: key={key}")

        print(f" [PG] 组件配置信息补充完成")
        print(f" [PG] 匹配到配置: {filled_count}条, 未匹配: {empty_count}条")
        if sample_keys:
            print(f" [PG] 样例数据(前3条):")
            for key, name, title in sample_keys:
                print(f" [PG] - key={key}, 名称={name}, 标题={title}")

        if sample_mode_check:
            print(f" [PG] 对话互动mode检查(前3条):")
            for s in sample_mode_check:
                print(f" [PG] - key={s['key']}, mode={s['mode']}, 最终名称={s['component_name']}")

    return rows
def fetch_pg_unit_challenge(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]:
    """Fetch a role's unit-challenge answers, newest first.

    Args:
        user_id: Role id to filter on.
        conn: PostgreSQL connection.
        id_2_unit_index: Mapping from ``story_id`` to unit id.

    Returns:
        One dict per row of ``user_unit_challenge_question_result`` with an
        added ``unit_id`` (``None`` when the story is unknown),
        ``question_list`` serialised to JSON text and a timezone-naive
        ``updated_at``. Empty when the query fails (the error is printed,
        not raised).
    """
    print(f" [PG] 开始查询单元挑战记录...")
    start_time = datetime.datetime.now()

    sql = (
        "SELECT user_id, story_id, category, score_text, question_list, updated_at "
        "FROM user_unit_challenge_question_result WHERE user_id = %s ORDER BY updated_at DESC"
    )
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        try:
            cur.execute(sql, (user_id,))
            rows = cur.fetchall() or []
        except Exception as e:
            print(f" [PG] 单元挑战记录查询失败: {e}")
            rows = []
            # A failed statement leaves the PostgreSQL transaction aborted;
            # roll back so later queries on this connection still work.
            try:
                conn.rollback()
            except Exception as rollback_err:
                print(f" [PG] 单元挑战记录回滚失败: {rollback_err}")

    out: List[Dict[str, Any]] = []
    for r in rows:
        d = dict(r)

        # Map story_id to unit_id.
        story_id = d.get("story_id")
        d["unit_id"] = id_2_unit_index.get(story_id) if story_id else None

        d["question_list"] = to_json_str(d.get("question_list"))

        # Excel cannot store tz-aware datetimes; drop the offset.
        upd = d.get("updated_at")
        if isinstance(upd, datetime.datetime):
            try:
                if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None:
                    d["updated_at"] = upd.replace(tzinfo=None)
            except Exception:
                d["updated_at"] = str(upd)
        out.append(d)

    print(f" [PG] 单元挑战记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")
    return out
# Result levels that are counted; anything else only contributes to "total".
_PLAY_RESULT_LEVELS = ("Perfect", "Good", "Failed", "Pass", "Oops")


def _new_level_counter() -> Dict[str, int]:
    """Return a zeroed counter with one slot per result level plus "total"."""
    counter = {level: 0 for level in _PLAY_RESULT_LEVELS}
    counter["total"] = 0
    return counter


def _parse_play_result(raw: Any) -> Any:
    """Extract the result level from a bare level string, JSON object text or dict."""
    if isinstance(raw, dict):
        return raw.get("result", "")
    if not isinstance(raw, str):
        return ""
    stripped = raw.strip()
    if stripped in _PLAY_RESULT_LEVELS:
        return stripped
    try:
        parsed = json.loads(raw)
    except (ValueError, RecursionError):
        return ""
    return parsed.get("result", "") if isinstance(parsed, dict) else ""


def _pass_rate_columns(stats: Dict[str, int]) -> Dict[str, Any]:
    """Build the shared count and percentage columns for one aggregated counter."""
    total = stats["total"]
    columns: Dict[str, Any] = {"总数量": total}
    for level in _PLAY_RESULT_LEVELS:
        columns[f"{level}数量"] = stats[level]
    for level in _PLAY_RESULT_LEVELS:
        columns[f"{level}比例(%)"] = round(stats[level] / total * 100, 2) if total > 0 else 0
    return columns


def generate_statistics(sheet2_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]]) -> tuple:
    """
    Build the three summary-statistics tables.

    Args:
        sheet2_rows: Interaction-component play records. The keys read are
            "互动组件名称", "c_type", "知识点" and "play_result".
        sheet5_rows: Unit-summary records. The keys read are "unit_id" and
            "play_time_seconds".

    Returns:
        A tuple of three DataFrames: pass statistics per component name, pass
        statistics per knowledge point (only for components whose c_type
        starts with "mid"), and total play time per unit.

    Raises:
        RuntimeError: If pandas is not installed.
    """
    if pd is None:
        raise RuntimeError("缺少pandas依赖,请安装后再运行。")

    print(f" [统计] 开始生成汇总统计数据...")
    start_time = datetime.datetime.now()

    from collections import defaultdict

    # ============ a. All interactions, grouped by component name ============
    component_stats = defaultdict(_new_level_counter)
    sample_results = []  # debugging aid: what the first records parsed to

    for idx, record in enumerate(sheet2_rows):
        component_name = record.get("互动组件名称", "")
        if not component_name:
            continue

        raw_result = record.get("play_result", "")
        result = _parse_play_result(raw_result)

        if idx < 3:
            sample_results.append({
                "component": component_name,
                "raw": str(raw_result)[:100],
                "result": result,
            })

        component_stats[component_name]["total"] += 1
        if result in _PLAY_RESULT_LEVELS:
            component_stats[component_name][result] += 1

    print(f" [统计] play_result解析样例(前3条):")
    for s in sample_results:
        print(f" [统计] - 组件: {s['component']}, 结果: {s['result']}, 原始: {s['raw']}")

    component_stats_data = []
    for component_name in sorted(component_stats.keys()):
        row = {"互动组件名称": component_name}
        row.update(_pass_rate_columns(component_stats[component_name]))
        component_stats_data.append(row)

    # ============ b. Mid interactions, grouped by knowledge point ============
    # Keyed by an (id, type, title) tuple. A "|"-joined string key would have
    # to be split again later, which drops every title that contains "|".
    kp_stats = defaultdict(_new_level_counter)
    mid_count = 0
    has_kp_count = 0
    sample_kp_records = []
    kp_warning_count = 0

    for record in sheet2_rows:
        c_type = record.get("c_type", "")
        if not isinstance(c_type, str) or not c_type.startswith("mid"):
            continue
        mid_count += 1

        kp_raw = record.get("知识点", "")
        if not kp_raw:
            continue
        has_kp_count += 1

        try:
            kp_list = json.loads(kp_raw) if isinstance(kp_raw, str) else kp_raw
        except (ValueError, RecursionError) as e:
            # Cap the warnings so one badly formatted export cannot flood the log.
            kp_warning_count += 1
            if kp_warning_count <= 3:
                print(f" [统计] [警告] 解析知识点失败: {e}, 原始值: {str(kp_raw)[:100]}")
            continue

        if not isinstance(kp_list, list):
            continue

        if len(sample_kp_records) < 3:
            sample_kp_records.append({
                "c_type": c_type,
                "kp_count": len(kp_list),
                "kp_info": str(kp_list)[:200],
            })

        result = _parse_play_result(record.get("play_result", ""))

        # One record counts once for every knowledge-point entry bound to it.
        for kp in kp_list:
            if not isinstance(kp, dict):
                continue
            kp_id = kp.get("kpId", "")
            if not kp_id:
                continue
            kp_key = (str(kp_id), str(kp.get("kpType", "")), str(kp.get("kpTitle", "")))
            kp_stats[kp_key]["total"] += 1
            if result in _PLAY_RESULT_LEVELS:
                kp_stats[kp_key][result] += 1

    print(f" [统计] 中互动组件统计: 总数={mid_count}, 有知识点={has_kp_count}, 知识点条目数={len(kp_stats)}")
    if sample_kp_records:
        print(f" [统计] 知识点样例(前3条):")
        for s in sample_kp_records:
            print(f" [统计] - c_type={s['c_type']}, 知识点数量={s['kp_count']}, 内容={s['kp_info']}")

    kp_stats_data = []
    # Sorting on the joined text keeps the row order of the old string keys.
    for kp_key in sorted(kp_stats, key="|".join):
        kp_id, kp_type, kp_title = kp_key
        row = {"知识点ID": kp_id, "知识点类型": kp_type, "知识点标题": kp_title}
        row.update(_pass_rate_columns(kp_stats[kp_key]))
        kp_stats_data.append(row)

    # ============ c. Unit summary, total play time per unit ============
    unit_time_stats = defaultdict(int)
    for record in sheet5_rows:
        unit_id = record.get("unit_id")
        if unit_id is not None:
            # "or 0" keeps a None play time from crashing the sum.
            unit_time_stats[unit_id] += record.get("play_time_seconds", 0) or 0

    unit_time_stats_data = []
    for unit_id in sorted(unit_time_stats.keys()):
        total_seconds = unit_time_stats[unit_id]
        unit_time_stats_data.append({
            "单元ID": f"unit_{unit_id}",
            "总时长(秒)": total_seconds,
            "总时长(分钟)": int(total_seconds / 60),
        })

    print(f" [统计] 汇总统计数据生成完成,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")
    print(f" [统计] 生成了{len(component_stats_data)}条组件统计, {len(kp_stats_data)}条知识点统计, {len(unit_time_stats_data)}条单元时长统计")

    return (
        pd.DataFrame(component_stats_data),
        pd.DataFrame(kp_stats_data),
        pd.DataFrame(unit_time_stats_data),
    )
def get_date_str() -> str:
    """Return the current local date formatted as ``YYYYMMDD``."""
    today = datetime.datetime.now()
    return f"{today:%Y%m%d}"
def _close_quietly(conn: Any) -> None:
    """Close a database connection, ignoring errors; a None connection is skipped."""
    if conn:
        try:
            conn.close()
        except Exception:
            pass


def main():
    """
    Script entry point: resolve the character ids to export, then export each one.

    Exactly one of USER_ID, USER_ID_LIST and ACCOUNT_ID_LIST must be set.
    Account ids are expanded to character ids through MySQL (vala_user). The
    story/chapter mapping tables are loaded once from MySQL (vala_test) and
    shared by every export. One Excel file is written per character into
    OUTPUT_DIR.

    Raises:
        RuntimeError: If none, or more than one, of the three modes is set.
    """
    load_env()

    # Entries are (user_id, account_id or None).
    user_id_list: List[tuple] = []
    date_str = get_date_str()

    has_user_id = USER_ID is not None
    has_user_id_list = USER_ID_LIST is not None and len(USER_ID_LIST) > 0
    has_account_id_list = ACCOUNT_ID_LIST is not None and len(ACCOUNT_ID_LIST) > 0

    # The three modes are mutually exclusive.
    mode_count = sum([has_user_id, has_user_id_list, has_account_id_list])
    if mode_count == 0:
        raise RuntimeError("请配置 USER_ID、USER_ID_LIST 或 ACCOUNT_ID_LIST 中的一个")
    if mode_count > 1:
        raise RuntimeError("USER_ID、USER_ID_LIST、ACCOUNT_ID_LIST 只能配置一个,请检查配置")

    if has_user_id:
        # Mode 1: a single character id.
        user_id_list = [(str(USER_ID), None)]
        print(f"[INFO] 运行模式:单个角色id")
    elif has_user_id_list:
        # Mode 2: a list of character ids.
        user_id_list = [(str(uid), None) for uid in USER_ID_LIST]
        print(f"[INFO] 运行模式:角色id列表,共{len(user_id_list)}个角色")
    elif has_account_id_list:
        # Mode 3: a list of account ids, each expanded to its character ids.
        print(f"[INFO] 运行模式:账户id列表,共{len(ACCOUNT_ID_LIST)}个账户")
        mysql_conn = None
        try:
            mysql_conn = get_mysql_conn("vala_user")  # user tables live in vala_user
            for account_id in ACCOUNT_ID_LIST:
                account_id_str = str(account_id)
                print(f"[INFO] 查询账户id={account_id_str}对应的角色id...")
                character_ids = fetch_character_ids_by_account(account_id_str, mysql_conn)
                if not character_ids:
                    print(f"[WARN] 账户id={account_id_str} 未找到关联的角色id,跳过")
                    continue
                print(f"[INFO] 账户id={account_id_str} 找到{len(character_ids)}个角色id: {character_ids}")
                for cid in character_ids:
                    # Normalise to str so every mode hands the same type downstream.
                    user_id_list.append((str(cid), account_id_str))
        finally:
            _close_quietly(mysql_conn)

    if not user_id_list:
        print("[WARN] 没有需要导出的角色id,程序退出")
        return

    es_cfg = get_es_config()
    pg_conn = get_pg_conn()

    # The mapping tables are queried once and shared by all characters.
    print(f"\n[INFO] ===== 准备工作:获取映射表 =====")
    mysql_conn = None
    id_2_unit_index = {}
    chapter_id_to_lesson_id = {}
    try:
        print(f"[INFO] 正在连接MySQL数据库(vala_test)...")
        mysql_conn = get_mysql_conn("vala_test")  # game config tables live in vala_test
        print(f"[INFO] 正在获取 story_id 到 unit_id 的映射...")
        id_2_unit_index = get_id_2_unit_index(mysql_conn)
        print(f"[INFO] 成功获取 {len(id_2_unit_index)} 个 story_id 映射")
        print(f"[INFO] 正在获取 chapter_id 到 lesson_id 的映射...")
        chapter_id_to_lesson_id = get_chapter_id_to_lesson_id(mysql_conn)
        print(f"[INFO] 成功获取 {len(chapter_id_to_lesson_id)} 个 chapter_id 映射")
    except Exception as e:
        print(f"[ERROR] 获取映射表失败: {e}")
        import traceback
        traceback.print_exc()
        _close_quietly(pg_conn)
        _close_quietly(mysql_conn)
        return

    try:
        success_count = 0
        skip_count = 0

        print(f"\n[INFO] ===== 开始批量导出 =====")
        print(f"[INFO] 共需导出{len(user_id_list)}个角色\n")
        batch_start_time = datetime.datetime.now()

        for idx, (user_id, account_id) in enumerate(user_id_list, 1):
            print(f"\n{'='*60}")
            print(f"[INFO] 进度: {idx}/{len(user_id_list)} ({idx*100//len(user_id_list)}%)")
            print(f"{'='*60}")

            if account_id is None:
                # Modes 1 and 2.
                filename = f"角色id_{user_id}_导出时间_{date_str}.xlsx"
            else:
                # Mode 3 also carries the account id in the file name.
                filename = f"账户id_{account_id}_角色id_{user_id}_导出时间_{date_str}.xlsx"

            output_path = os.path.join(OUTPUT_DIR, filename)

            result = export_single_user(user_id, es_cfg, pg_conn, mysql_conn, output_path, id_2_unit_index, chapter_id_to_lesson_id)
            if result:
                success_count += 1
            else:
                skip_count += 1

            # The PG connection is shared by every character. Ending the
            # read-only transaction here stops a failed query for one
            # character from leaving it aborted for all the following ones.
            try:
                pg_conn.rollback()
            except Exception as rollback_err:
                print(f"[WARN] PG事务回滚失败: {rollback_err}")

        batch_total_time = (datetime.datetime.now() - batch_start_time).total_seconds()
        print(f"\n{'='*60}")
        print(f"[INFO] ===== 全部导出完成 =====")
        print(f"[INFO] 总计: {len(user_id_list)}个角色")
        print(f"[INFO] 成功: {success_count}个")
        print(f"[INFO] 跳过: {skip_count}个")
        print(f"[INFO] 总耗时: {batch_total_time:.2f}秒 ({batch_total_time/60:.2f}分钟)")
        if success_count > 0:
            print(f"[INFO] 平均每个角色: {batch_total_time/success_count:.2f}秒")
        print(f"{'='*60}\n")

    finally:
        _close_quietly(pg_conn)
        _close_quietly(mysql_conn)
conn.rollback() + print("✅ 事务回滚成功") + except Exception as e2: + print(f"❌ 回滚失败: {e2}") + + # 测试查询课程巩固记录表 + print(f"\n测试查询表 user_unit_review_question_result,user_id={user_id}") + try: + with conn.cursor(cursor_factory=RealDictCursor) as cur: + sql = f""" + SELECT user_id, story_id, chapter_id, question_list, updated_at + FROM user_unit_review_question_result + WHERE user_id = %s + ORDER BY updated_at DESC + """ + cur.execute(sql, (user_id,)) + rows = cur.fetchall() + print(f"✅ 查询成功,返回{len(rows)}条记录") + except Exception as e: + print(f"❌ 查询失败: {e}") + print(f"错误类型: {type(e).__name__}") + + # 回滚事务 + print("\n尝试回滚事务...") + try: + conn.rollback() + print("✅ 事务回滚成功") + except Exception as e2: + print(f"❌ 回滚失败: {e2}") + + # 测试查询单元挑战记录表 + print(f"\n测试查询表 user_unit_challenge_question_result,user_id={user_id}") + try: + with conn.cursor(cursor_factory=RealDictCursor) as cur: + sql = f""" + SELECT user_id, story_id, category, score_text, question_list, updated_at + FROM user_unit_challenge_question_result + WHERE user_id = %s + ORDER BY updated_at DESC + """ + cur.execute(sql, (user_id,)) + rows = cur.fetchall() + print(f"✅ 查询成功,返回{len(rows)}条记录") + except Exception as e: + print(f"❌ 查询失败: {e}") + print(f"错误类型: {type(e).__name__}") + + # 测试查询单元总结记录表 + print(f"\n测试查询表 user_unit_summary_record,user_id={user_id}") + try: + with conn.cursor(cursor_factory=RealDictCursor) as cur: + sql = f""" + SELECT id, user_id, unit_id, updated_at, km_id, km_type, play_time_seconds + FROM user_unit_summary_record + WHERE user_id = %s + ORDER BY updated_at DESC + """ + cur.execute(sql, (user_id,)) + rows = cur.fetchall() + print(f"✅ 查询成功,返回{len(rows)}条记录") + except Exception as e: + print(f"❌ 查询失败: {e}") + print(f"错误类型: {type(e).__name__}") + import traceback + traceback.print_exc() + + conn.close() + +if __name__ == "__main__": + test_role_12698() diff --git a/export_user_id_data.py b/export_user_id_data.py new file mode 100644 index 0000000..478b2e0 --- /dev/null +++ b/export_user_id_data.py 
@@ -0,0 +1,1846 @@ +""" +初版需求v1.0: 2025.11.18 + +导出 一个userId的多表数据, 最终按照不同sheet,输出到一个 excel文件中。 + +1. 第一个sheet:"全部音频数据" +es相关配置通过以下环境变量 +ES_HOST=xxx +ES_PORT=9200 +ES_SCHEME=https +ES_USER=elastic +ES_PASSWORD=xxx + +index: user-audio + +脚本思路: +过滤字段: +userId == xxxx + +输出该userId的全部记录 按时间倒序排序 +包含以下字段内容: + +userId +userMsg +userName +soeData +audioUrl +asrStatus +componentId +componentType +dataVersion + +2. 第二个sheet:"互动组件学习记录" +在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 +数据库相关配置 从.env中读取: +PG_DB_HOST = xxx +PG_DB_PORT = xxx +PG_DB_USER = xxx +PG_DB_PASSWORD = xxx +PG_DB_DATABASE = xxx + +读取以下数据表: +user_component_play_record_0 ~ user_component_play_record_7 + +输出以下字段: +user_id, +component_unique_code, +session_id, +c_type, +c_id, +play_result, +user_behavior_info, +updated_at + +3.第三个sheet:"课程巩固记录" +在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 + +数据表:user_unit_review_question_result + +输出以下字段: +user_id +story_id +chapter_id +question_list +updated_at + +4.第四个sheet:"单元挑战记录" +在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 + +数据表:user_unit_challenge_question_result + +输出以下字段: +user_id +story_id +category +score_text, +question_list +updated_at +------------ + +需求补充v1.1: +"全部音频数据"这个sheet +输出字段 添加timeStr 并按时间倒序排列 最新的记录 在最上面 + +------------ +需求补充v1.2: +"全部音频数据"这个sheet +如果userMsg字段内容 包含 ”makee_id“ 要进行以下处理: + +从userMsg字段中提取出具体的makee_id: +此时的字段样例: +``` +asr msg信息为:{ + "time_ms": 358, + "time_ms_api": 357, + "hot_words_str": "{\n \"context_type\": \"dialog_ctx\",\n \"context_data\": [\n {\n \"text\": \"planet Walla\"\n },\n {\n \"text\": \"Walla\"\n }\n ]\n}", + "makee_id": "d208c617-902f-4f81-8255-b5fb73599546", + "volcano_fast_x_tt_logid": "202511151541355DF72BE5EBFE73795BFD", + "api_name": "volcano-fast" +} +``` +然后基于makee_id 去另一个表里查记录: index:llm_asr_log +将查询到的记录的 result_text 字段内容 回填到 userMsg。 +将source字段内容 输出 到 source。 + +如果userMsg字段内容 不包含 ”makee_id“ 保持之前的逻辑。 + +-------------- +需求补充 v1.3 +当前输入 只支持配置单个 userId (业务侧名称为角色id) + + +期望扩展为以下逻辑: +1. 
改为配置 角色id list , 分别 导出 多份excel文件。命名格式为 角色id_{}_导出时间_{}.xlsx +2. 改为配置 账户id list , 分别 导出 多份excel文件。命名格式为 账户id_{}_角色id_{}_导出时间_{}.xlsx + +关于 账户 id 到角色id 的映射逻辑, +首先 读取 mysql 表 vala_app_character +筛选 account_id字段值 == 账户id 的 记录, 其中 该记录 的 id值,则为角色id 一个 账户id 可以对应多个角色id + +本次需求只针对输入侧调整, 数据抽取聚合逻辑部分和之前保持一致 + +--------------- +需求补充 v1.4 + +增加一个sheet "单元总结记录", +导出对应角色id的单元总结记录。 参考 export_unit_summary.py 中的原始数据提取方案即可(不必关注其中的数据统计部分)。 + +其他已有逻辑保持不动哦。 + +---------------- +需求补充 v1.5 + +1."互动组件学习记录"sheet 增加以下字段 +"互动组件名称"、"组件标题"、"组件配置摘要"、"知识点": +字段取值规则: +根据 c_type 及组件配置(从mysql表获取) 进行映射和处理: +``` +1).如果 c_type 开头为"mid" + +则读取下表:表名:middle_interaction_component + +获取以下字段值: +title (作为组件标题) +component_config (完整的组件配置) 获取其中 的 question 字段值 作为 组件配置摘要; +kp_relation_info 字段值 作为 知识点 + +"互动组件名称"规则: + +"物品互动": "mid_vocab_item", +"图片互动": "mid_vocab_image", +"填词互动": "mid_vocab_fillBlank", +"指令互动": "mid_vocab_instruction" +"对话互动-表达": "mid_sentence_dialogue", 且 component_config->question->mode == "express" +"对话互动-朗读": "mid_sentence_dialogue", 且 component_config->question->mode == "read" +"语音互动": "mid_sentence_voice", +"材料互动": "mid_sentence_material", +"造句互动": "mid_sentence_makeSentence" +"挖空互动": "mid_grammar_cloze", +"组句互动": "mid_grammar_sentence" +"发音互动": "mid_pron_pron" + + +2). 如果 c_type 开头为"core" +则读取下表:表名:core_interaction_component + +获取以下字段值: +title (作为组件标题) +component_config (完整的组件配置) 获取其中 的 taskInfo 字段值 作为 组件配置摘要 +kp_relation_info 字段值 作为 知识点 + +"互动组件名称"规则: +"口语快答": "core_speaking_reply", +"口语妙问": "core_speaking_inquiry", +"口语探讨": "core_speaking_explore", +"口语独白": "core_speaking_monologue" +"合作阅读": "core_reading_order", +"合作听力": "core_listening_order", +"看图组句": "core_writing_imgMakeSentence", +"看图撰写": "core_writing_imgWrite", +"问题组句": "core_writing_questionMakeSentence", +"问题撰写": "core_writing_questionWrite", +``` + +2."课程巩固记录" sheet 增加以下字段 +"正确率": 参考 export_lesson_review.py 中的计算逻辑 + +3. 新增一个"汇总统计"sheet +统计并展示以下内容 请以 可读性 比较好的方式排列、展示 + +a. 
"所有互动-按互动组件类型-通过情况统计" +以每种"互动组件名称"进行聚合 +统计play_result的取值分布情况,算以下指标: +总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 + +b. "中互动组件-按知识点-通过情况统计" +以每个知识点进行聚合 + +其中 知识点配置格式如下: +``` +[{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_meaning","kpSkillName":"语义"}] +``` +一个组件可以绑定多个知识点,以每个知识点的 kpId + kpType + kpTitle 进行 展示及聚合 + +对所有绑定了某个知识点的中互动组件(c_type以mid开头) +统计play_result的取值分布情况,算以下指标: +总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 + +c. 
"单元总结-按单元统计时长" + +将"单元总结记录"中的"play_time_seconds"字段值 以每个单元id 进行聚合 进行 累加 统计,并增加一列 转换为分钟为单位 取整数 + + +""" +# ==== 可直接修改的脚本变量(不使用命令行传参) ==== +# 三种模式互斥,只能配置一个: +# 模式1:单个角色id +USER_ID = None # 单个角色ID,示例:2911 + +# 模式2:角色id列表(多个角色id批量导出) +USER_ID_LIST = None # 角色ID列表,示例:[2911, 2912, 2913] + +# 模式3:账户id列表(通过账户id查询对应的角色id后批量导出) +ACCOUNT_ID_LIST = [9343] # 账户ID列表,示例:[100, 101, 102] + +OUTPUT_DIR = "output/" # 输出目录,默认为output文件夹 +# ==== 变量结束 ==== +import os +import json +import re +from typing import Any, Dict, List, Optional + +import datetime + +try: + import requests +except Exception: + requests = None + +try: + import psycopg2 + from psycopg2.extras import RealDictCursor +except Exception: + psycopg2 = None + RealDictCursor = None + +try: + import pymysql + import pymysql.cursors +except Exception: + pymysql = None + +try: + import pandas as pd +except Exception: + pd = None + +try: + import urllib3 +except Exception: + urllib3 = None + + +SHEET1_COLUMNS = [ + "userId", + "userMsg", + "source", + "userName", + "soeData", + "audioUrl", + "asrStatus", + "componentId", + "componentType", + "dataVersion", + "timeStr", +] + +SHEET2_COLUMNS = [ + "user_id", + "component_unique_code", + "session_id", + "c_type", + "c_id", + "互动组件名称", + "组件标题", + "组件配置摘要", + "知识点", + "play_result", + "user_behavior_info", + "updated_at", +] + +SHEET3_COLUMNS = [ + "user_id", + "unit_id", + "lesson_id", + "question_list", + "正确率", + "updated_at", +] + +SHEET4_COLUMNS = [ + "user_id", + "unit_id", + "category", + "score_text", + "question_list", + "updated_at", +] + +SHEET5_COLUMNS = [ + "id", + "user_id", + "unit_id", + "updated_at", + "km_id", + "km_type", + "play_time_seconds", +] + + +def _load_env_file(path: str) -> None: + if not os.path.exists(path): + return + try: + with open(path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + if "=" not in line: + continue + k, v = line.split("=", 1) + k = k.strip() + v = 
def extract_makee_id_from_user_msg(user_msg: Any) -> Optional[str]:
    """
    Extract the ``makee_id`` value from a userMsg payload.

    Args:
        user_msg: Either a dict, or a string that is JSON, contains a JSON
            object between its first "{" and last "}", or merely contains a
            ``makee_id: "..."`` pair (quoted or unquoted key).

    Returns:
        The non-empty makee_id string, or None when none can be found.
    """

    def _from_obj(obj: Any) -> Optional[str]:
        # Only a non-empty string value of a dict counts as a makee_id.
        if isinstance(obj, dict):
            mk = obj.get("makee_id")
            if isinstance(mk, str) and mk:
                return mk
        return None

    if isinstance(user_msg, dict):
        return _from_obj(user_msg)
    if not isinstance(user_msg, str) or not user_msg:
        return None

    # 1) The whole string as JSON; 2) the outermost {...} span as JSON.
    candidates = [user_msg]
    start = user_msg.find("{")
    end = user_msg.rfind("}")
    if start != -1 and end > start:
        candidates.append(user_msg[start : end + 1])
    for text in candidates:
        try:
            obj = json.loads(text)
        except (ValueError, RecursionError):
            continue
        mk = _from_obj(obj)
        if mk:
            return mk

    # 3) Regex fallback for truncated or otherwise unparsable text. The key
    # may be wrapped in quotes ("makee_id": "...") as in JSON, so the quotes
    # around it are optional.
    m = re.search(r"\"?\bmakee_id\b\"?\s*:\s*\"([^\"]+)\"", user_msg)
    if m:
        return m.group(1)
    return None
{} + t = pick_time(src) + if t is None: + continue + if best_t is None or t > best_t: + best_t = t + chosen = src + if chosen is None: + # 如果都没有时间,选第一条 + chosen = (hits[0].get("_source", {}) or {}) + return chosen + + +def get_es_config() -> Dict[str, Any]: + return { + "host": os.getenv("ES_HOST"), + "port": os.getenv("ES_PORT", "9200"), + "scheme": os.getenv("ES_SCHEME", "http"), + "user": os.getenv("ES_USER"), + "password": os.getenv("ES_PASSWORD"), + "index": "user-audio", + } + + +def fetch_es_user_audio(user_id: str, es_cfg: Dict[str, Any]) -> List[Dict[str, Any]]: + if requests is None: + raise RuntimeError("缺少requests依赖,请安装后再运行。") + + print(f" [ES] 开始查询user-audio索引...") + start_time = datetime.datetime.now() + + host = es_cfg.get("host") + port = es_cfg.get("port") + scheme = es_cfg.get("scheme", "http") + user = es_cfg.get("user") + password = es_cfg.get("password") + index = es_cfg.get("index", "user-audio") + + if not host: + return [] + + base = f"{scheme}://{host}:{port}" + url = f"{base}/{index}/_search" + headers = {"Content-Type": "application/json"} + + body = { + "query": { + "bool": { + "should": [ + {"term": {"userId": {"value": str(user_id)}}}, + {"term": {"userId.keyword": {"value": str(user_id)}}}, + ], + "minimum_should_match": 1, + } + }, + "size": 10000, + "_source": [ + "userId", + "userMsg", + "userName", + "soeData", + "audioUrl", + "asrStatus", + "componentId", + "componentType", + "dataVersion", + "updated_at", + "created_at", + "@timestamp", + "timestamp", + "updatedAt", + "createdAt", + "time", + "ts", + "timeStr", + "update_time", + "create_time", + ], + } + + auth = (user, password) if user and password else None + + try: + # 抑制自签证书下的HTTPS不安全警告 + if scheme == "https" and urllib3 is not None: + try: + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + except Exception: + pass + resp = requests.post(url, headers=headers, json=body, auth=auth, timeout=30, verify=False if scheme == "https" else True) + 
resp.raise_for_status() + data = resp.json() + except Exception as e: + raise RuntimeError(f"ES查询失败: {e}") + + hits = data.get("hits", {}).get("hits", []) + print(f" [ES] 查询完成,获得{len(hits)}条记录,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") + + if not hits: + return [] + + print(f" [ES] 开始处理音频数据...") + process_start = datetime.datetime.now() + + rows: List[Dict[str, Any]] = [] + asr_cache: Dict[str, Dict[str, Any]] = {} + makee_id_count = 0 + + for idx, h in enumerate(hits, 1): + # 每处理100条显示一次进度 + if idx % 100 == 0 or idx == len(hits): + print(f" [ES] 处理进度: {idx}/{len(hits)} ({idx*100//len(hits)}%)") + + src = h.get("_source", {}) or {} + row = { + "userId": src.get("userId"), + "userMsg": src.get("userMsg"), + "source": None, + "userName": src.get("userName"), + "soeData": to_json_str(src.get("soeData")), + "audioUrl": src.get("audioUrl"), + "asrStatus": src.get("asrStatus"), + "componentId": src.get("componentId"), + "componentType": src.get("componentType"), + "dataVersion": src.get("dataVersion"), + } + t = pick_time(src) + row["_time"] = t.isoformat() if t else None + row["timeStr"] = t.strftime("%Y-%m-%d %H:%M:%S") if t else None + # v1.2: 当userMsg包含makee_id时,补充查询llm_asr_log并回填 + mk = extract_makee_id_from_user_msg(row.get("userMsg")) + if mk: + makee_id_count += 1 + asr_doc = asr_cache.get(mk) + if asr_doc is None: + asr_doc = fetch_es_asr_log(mk, es_cfg) + if asr_doc is not None: + asr_cache[mk] = asr_doc + if asr_doc is not None: + rt = asr_doc.get("result_text") + if rt: + row["userMsg"] = rt + row["source"] = to_json_str(asr_doc.get("source")) + rows.append(row) + + print(f" [ES] 数据处理完成,发现{makee_id_count}条包含makee_id的记录,耗时{(datetime.datetime.now() - process_start).total_seconds():.2f}秒") + + print(f" [ES] 开始排序...") + rows.sort(key=lambda x: parse_time(x.get("_time")) or datetime.datetime.min, reverse=True) + print(f" [ES] 音频数据处理完成,总耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") + + return rows + + +def get_pg_conn() -> 
def get_mysql_conn(database: str) -> Any:
    """Open a MySQL connection to ``database``.

    The connection settings come from environment variables. ``vala_user``
    reads the ``*_online`` variants (``MYSQL_HOST_online``, ``MYSQL_PORT_online``,
    ``MYSQL_USERNAME_online``, ``MYSQL_PASSWORD_online``); every other database
    name reads the un-suffixed variables. The port defaults to 3306.

    Args:
        database: Name of the database to select on the connection.

    Returns:
        A ``pymysql`` connection that uses ``DictCursor`` and ``utf8mb4``.

    Raises:
        RuntimeError: If ``pymysql`` is unavailable or the host variable is unset.
    """
    if pymysql is None:
        raise RuntimeError("缺少pymysql依赖,请安装后再运行。")

    # Only the variable suffix differs between the two configurations.
    env_suffix = "_online" if database == "vala_user" else ""

    host = os.getenv(f"MYSQL_HOST{env_suffix}")
    port = int(os.getenv(f"MYSQL_PORT{env_suffix}", "3306"))
    user = os.getenv(f"MYSQL_USERNAME{env_suffix}")
    password = os.getenv(f"MYSQL_PASSWORD{env_suffix}")
    if not host:
        raise RuntimeError(f"MySQL数据库环境变量未配置完整(缺少MYSQL_HOST{env_suffix})")

    return pymysql.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        database=database,
        charset="utf8mb4",
        cursorclass=pymysql.cursors.DictCursor,
    )
def get_component_name(c_type: str, component_config: Optional[Dict[str, Any]]) -> str:
    """Return the display name of an interaction component.

    Args:
        c_type: Component type code, e.g. ``"mid_sentence_dialogue"``.
        component_config: Parsed component configuration. It is only consulted
            for ``mid_sentence_dialogue``, whose name depends on
            ``question.mode``.

    Returns:
        ``"对话互动-表达"`` or ``"对话互动-朗读"`` for dialogue components whose
        mode is ``"express"`` or ``"read"``. Otherwise the
        ``COMPONENT_TYPE_NAMES`` entry for ``c_type``, or ``""`` when ``c_type``
        is empty or unknown.
    """
    if not c_type:
        return ""

    # Dialogue components split into two named variants by mode. The type
    # guards replace a blanket try/except that used to hide malformed configs
    # (a non-dict config, or a null or non-dict "question").
    if c_type == "mid_sentence_dialogue" and isinstance(component_config, dict):
        question = component_config.get("question")
        mode = question.get("mode", "") if isinstance(question, dict) else ""
        if mode == "express":
            return "对话互动-表达"
        if mode == "read":
            return "对话互动-朗读"

    return COMPONENT_TYPE_NAMES.get(c_type, "")
c_id不存在") + + for idx, row in enumerate(rows): + c_type = row.get("c_type", "") + c_id = row.get("c_id") + key = f"{c_type}_{c_id}" + + if idx < 3: # 输出前3条的详细信息 + print(f" [MySQL] [样例{idx+1}] id={c_id}, c_type={c_type}, key={key}") + print(f" [MySQL] [样例{idx+1}] title={row.get('title', '')[:50]}") + + # 解析component_config + component_config = row.get("component_config") + if isinstance(component_config, str): + try: + component_config = json.loads(component_config) + except Exception as e: + print(f" [MySQL] [警告] 解析component_config失败 (id={c_id}): {e}") + component_config = {} + + # 提取question字段作为摘要 + summary = "" + if isinstance(component_config, dict): + question = component_config.get("question") + summary = to_json_str(question) if question else "" + if idx < 3 and question: + print(f" [MySQL] [样例{idx+1}] 提取到question字段,长度: {len(summary)}") + + # 解析kp_relation_info + kp_relation_info = row.get("kp_relation_info") + if isinstance(kp_relation_info, str): + try: + kp_relation_info = json.loads(kp_relation_info) + except Exception: + kp_relation_info = [] + + config_map[key] = { + "title": row.get("title", ""), + "component_config": component_config, + "summary": summary, + "kp_relation_info": to_json_str(kp_relation_info), + } + + print(f" [MySQL] 中互动组件配置已加入config_map,当前map大小: {len(config_map)}") + except Exception as e: + print(f" [MySQL] [错误] 查询中互动组件配置失败: {e}") + import traceback + traceback.print_exc() + + # 批量查询core_interaction_component + if core_c_ids: + try: + with mysql_conn.cursor() as cur: + placeholders = ','.join(['%s'] * len(core_c_ids)) + sql = f""" + SELECT c_id, c_type, title, component_config, kp_relation_info + FROM core_interaction_component + WHERE c_id IN ({placeholders}) AND deleted_at IS NULL + """ + print(f" [MySQL] 执行核心互动组件查询,查询条件: c_id IN ({len(core_c_ids)}个ID)") + cur.execute(sql, tuple(core_c_ids)) + rows = cur.fetchall() or [] + print(f" [MySQL] 查询到{len(rows)}条核心互动组件配置") + + if len(rows) == 0 and len(core_c_ids) > 0: + print(f" [MySQL] 
def calculate_accuracy(question_list: Any) -> float:
    """Compute the percentage of questions answered correctly.

    Args:
        question_list: A list of question dicts, or a JSON string encoding such
            a list. A question counts as correct when its ``isRight`` value
            equals ``True``.

    Returns:
        The accuracy as a percentage rounded to two decimals. ``0.0`` when the
        input is not a list, is an empty list, or is a string that is not
        valid JSON.
    """
    if isinstance(question_list, str):
        try:
            question_list = json.loads(question_list)
        except ValueError:
            # json.JSONDecodeError is a ValueError: unparsable input scores zero.
            return 0.0

    if not isinstance(question_list, list) or not question_list:
        return 0.0

    # Entries that are not dicts are counted as wrong answers. Previously one
    # such entry raised inside the generator and zeroed the whole result.
    # ``== True`` is kept on purpose so a numeric 1 still counts as correct.
    correct = sum(
        1
        for question in question_list
        if isinstance(question, dict) and question.get("isRight") == True  # noqa: E712
    )
    return round(correct / len(question_list) * 100, 2)
def fetch_pg_unit_review(user_id: str, conn: Any, id_2_unit_index: Dict[int, int], chapter_id_to_lesson_id: Dict[int, int]) -> List[Dict[str, Any]]:
    """Fetch the lesson-review records of one character from PostgreSQL.

    Reads ``user_unit_review_question_result`` for ``user_id`` ordered by
    ``updated_at`` descending and enriches every row.

    Args:
        user_id: Character id used in the ``WHERE user_id = %s`` filter.
        conn: PostgreSQL connection. It is shared with the other fetch
            functions during an export.
        id_2_unit_index: Mapping from ``story_id`` to unit id.
        chapter_id_to_lesson_id: Mapping from ``chapter_id`` to lesson id.

    Returns:
        One dict per record with the selected columns plus ``unit_id``,
        ``lesson_id`` and ``正确率``. ``question_list`` is serialised with
        ``to_json_str`` and a timezone-aware ``updated_at`` is made naive.
        An empty list when the query fails.
    """
    print(f" [PG] 开始查询课程巩固记录...")
    start_time = datetime.datetime.now()

    sql = (
        "SELECT user_id, story_id, chapter_id, question_list, updated_at "
        "FROM user_unit_review_question_result WHERE user_id = %s ORDER BY updated_at DESC"
    )
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        try:
            cur.execute(sql, (user_id,))
            rows = cur.fetchall() or []
        except Exception as e:
            print(f" [PG] 课程巩固记录查询失败: {e}")
            rows = []
            # A failed statement leaves the transaction aborted. Without a
            # rollback every later query on this shared connection would fail
            # too, so reset it here (best effort).
            try:
                conn.rollback()
            except Exception as rollback_error:
                print(f" [PG] 事务回滚失败: {rollback_error}")

    out: List[Dict[str, Any]] = []
    for r in rows:
        d = dict(r)

        # A falsy story_id / chapter_id (None or 0) is left unmapped.
        story_id = d.get("story_id")
        d["unit_id"] = id_2_unit_index.get(story_id) if story_id else None

        chapter_id = d.get("chapter_id")
        d["lesson_id"] = chapter_id_to_lesson_id.get(chapter_id) if chapter_id else None

        # Accuracy is computed from the raw value before it is serialised.
        question_list = d.get("question_list")
        d["正确率"] = calculate_accuracy(question_list)
        d["question_list"] = to_json_str(question_list)

        # Excel cannot store timezone-aware datetimes, so drop the tzinfo
        # (the wall-clock value is kept as is, not converted).
        upd = d.get("updated_at")
        if isinstance(upd, datetime.datetime):
            try:
                if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None:
                    d["updated_at"] = upd.replace(tzinfo=None)
            except Exception:
                d["updated_at"] = str(upd)
        out.append(d)

    print(f" [PG] 课程巩固记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")
    return out
+ "SELECT user_id, story_id, category, score_text, question_list, updated_at " + "FROM user_unit_challenge_question_result WHERE user_id = %s ORDER BY updated_at DESC" + ) + with conn.cursor(cursor_factory=RealDictCursor) as cur: + try: + cur.execute(sql, (user_id,)) + rows = cur.fetchall() or [] + except Exception as e: + print(f" [PG] 单元挑战记录查询失败: {e}") + rows = [] + out: List[Dict[str, Any]] = [] + for r in rows: + d = dict(r) + + # 映射 story_id 到 unit_id + story_id = d.get("story_id") + unit_id = id_2_unit_index.get(story_id) if story_id else None + d["unit_id"] = unit_id + + d["question_list"] = to_json_str(d.get("question_list")) + upd = d.get("updated_at") + if isinstance(upd, datetime.datetime): + try: + if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: + d["updated_at"] = upd.replace(tzinfo=None) + except Exception: + d["updated_at"] = str(upd) + out.append(d) + + print(f" [PG] 单元挑战记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") + return out + + +def fetch_pg_unit_summary(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]: + """ + 查询单元总结知识点结果数据 + + Args: + user_id: 用户ID(角色ID) + conn: PostgreSQL数据库连接 + id_2_unit_index: story_id到unit_id的映射字典 + + Returns: + 单元总结记录列表 + """ + print(f" [PG] 开始查询单元总结记录...") + start_time = datetime.datetime.now() + + sql = ( + "SELECT id, user_id, story_id, updated_at, km_id, km_type, play_time " + "FROM user_unit_summary_km_result WHERE user_id = %s AND deleted_at IS NULL ORDER BY updated_at DESC" + ) + with conn.cursor(cursor_factory=RealDictCursor) as cur: + try: + cur.execute(sql, (user_id,)) + rows = cur.fetchall() or [] + except Exception as e: + print(f" [PG] 单元总结记录查询失败: {e}") + rows = [] + + out: List[Dict[str, Any]] = [] + for r in rows: + d = dict(r) + # 映射 story_id 到 unit_id + story_id = d.get("story_id") + unit_id = id_2_unit_index.get(story_id) if story_id else None + d["unit_id"] = unit_id + + # 转换 play_time (毫秒) 为秒 (整数) + play_time 
# Result labels a play_result may carry, in the column order of the stats sheets.
_RESULT_LEVELS = ("Perfect", "Good", "Failed", "Pass", "Oops")


def _parse_play_result(raw: Any) -> Any:
    """Return the result label carried by a ``play_result`` value, or ``""``."""
    if isinstance(raw, dict):
        return raw.get("result", "")
    if not isinstance(raw, str):
        return ""
    stripped = raw.strip()
    if stripped in _RESULT_LEVELS:
        return stripped
    try:
        parsed = json.loads(raw)
    except (ValueError, RecursionError):
        return ""
    return parsed.get("result", "") if isinstance(parsed, dict) else ""


def _new_level_counter() -> Dict[str, int]:
    """Return a zeroed counter with one slot per result label plus ``total``."""
    counter = {level: 0 for level in _RESULT_LEVELS}
    counter["total"] = 0
    return counter


def _level_columns(stats: Dict[str, int]) -> Dict[str, Any]:
    """Build the count and percentage columns shared by both pass-rate sheets."""
    total = stats["total"]
    columns: Dict[str, Any] = {"总数量": total}
    for level in _RESULT_LEVELS:
        columns[f"{level}数量"] = stats[level]
    for level in _RESULT_LEVELS:
        columns[f"{level}比例(%)"] = round(stats[level] / total * 100, 2) if total > 0 else 0
    return columns


def generate_statistics(sheet2_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]]) -> tuple:
    """Build the three summary-statistics tables of the export.

    Args:
        sheet2_rows: Interaction-component play records. The keys read are
            ``互动组件名称``, ``play_result``, ``c_type`` and ``知识点``.
        sheet5_rows: Unit-summary records. The keys read are ``unit_id`` and
            ``play_time_seconds``.

    Returns:
        A tuple of three DataFrames: result counts and ratios per component
        name, result counts and ratios per knowledge point (records whose
        ``c_type`` starts with ``mid`` only), and total play time per unit.

    Raises:
        RuntimeError: If pandas is unavailable.
    """
    if pd is None:
        raise RuntimeError("缺少pandas依赖,请安装后再运行。")

    print(f" [统计] 开始生成汇总统计数据...")
    start_time = datetime.datetime.now()

    from collections import defaultdict

    # ---- a. results per component name ----
    component_stats = defaultdict(_new_level_counter)
    sample_results = []
    parse_error_count = 0

    for idx, record in enumerate(sheet2_rows):
        component_name = record.get("互动组件名称", "")
        if not component_name:
            continue

        play_result_str = record.get("play_result", "")
        try:
            result = _parse_play_result(play_result_str)
            if idx < 3:
                sample_results.append({
                    "component": component_name,
                    "raw": str(play_result_str)[:100],
                    "result": result
                })
        except Exception as e:
            # Anything the parser does not expect is logged, not fatal.
            parse_error_count += 1
            if parse_error_count <= 3:
                print(f" [统计] [警告] 解析play_result失败 (第{idx+1}条): {e}, 原始值: {str(play_result_str)[:100]}")
            result = ""

        component_stats[component_name]["total"] += 1
        if result in _RESULT_LEVELS:
            component_stats[component_name][result] += 1

    print(f" [统计] play_result解析样例(前3条):")
    for s in sample_results:
        print(f" [统计] - 组件: {s['component']}, 结果: {s['result']}, 原始: {s['raw']}")
    if parse_error_count > 0:
        print(f" [统计] play_result解析失败总数: {parse_error_count}")

    component_stats_data = [
        {"互动组件名称": component_name, **_level_columns(component_stats[component_name])}
        for component_name in sorted(component_stats.keys())
    ]

    # ---- b. results per knowledge point (mid components only) ----
    # Keys are (id, type, title) tuples of strings. Joining them with "|" and
    # splitting later dropped every knowledge point whose title contains "|".
    kp_stats = defaultdict(_new_level_counter)
    mid_count = 0
    has_kp_count = 0
    sample_kp_records = []

    for record in sheet2_rows:
        c_type = record.get("c_type", "")
        if not c_type or not c_type.startswith("mid"):
            continue
        mid_count += 1

        kp_relation_info_str = record.get("知识点", "")
        if not kp_relation_info_str:
            continue
        has_kp_count += 1

        try:
            if isinstance(kp_relation_info_str, str):
                kp_relation_info = json.loads(kp_relation_info_str)
            else:
                kp_relation_info = kp_relation_info_str
            if not isinstance(kp_relation_info, list):
                continue

            if len(sample_kp_records) < 3:
                sample_kp_records.append({
                    "c_type": c_type,
                    "kp_count": len(kp_relation_info),
                    "kp_info": str(kp_relation_info)[:200]
                })

            result = _parse_play_result(record.get("play_result", ""))

            for kp in kp_relation_info:
                if not isinstance(kp, dict):
                    continue
                kp_id = kp.get("kpId", "")
                if not kp_id:
                    continue
                kp_key = (f"{kp_id}", f"{kp.get('kpType', '')}", f"{kp.get('kpTitle', '')}")
                kp_stats[kp_key]["total"] += 1
                if result in _RESULT_LEVELS:
                    kp_stats[kp_key][result] += 1
        except Exception as e:
            if len(sample_kp_records) < 5:
                print(f" [统计] [警告] 解析知识点失败: {e}, 原始值: {str(kp_relation_info_str)[:100]}")
            continue

    print(f" [统计] 中互动组件统计: 总数={mid_count}, 有知识点={has_kp_count}, 知识点条目数={len(kp_stats)}")
    if sample_kp_records:
        print(f" [统计] 知识点样例(前3条):")
        for s in sample_kp_records:
            print(f" [统计] - c_type={s['c_type']}, 知识点数量={s['kp_count']}, 内容={s['kp_info']}")

    # Sorting on the "|"-joined form keeps the row order of the old string keys.
    kp_stats_data = [
        {
            "知识点ID": kp_key[0],
            "知识点类型": kp_key[1],
            "知识点标题": kp_key[2],
            **_level_columns(kp_stats[kp_key]),
        }
        for kp_key in sorted(kp_stats, key="|".join)
    ]

    # ---- c. total play time per unit ----
    unit_time_stats = defaultdict(int)
    for record in sheet5_rows:
        unit_id = record.get("unit_id")
        if unit_id is not None:
            # A missing or null duration contributes nothing.
            unit_time_stats[unit_id] += record.get("play_time_seconds", 0) or 0

    unit_time_stats_data = [
        {
            "单元ID": f"unit_{unit_id}",
            "总时长(秒)": unit_time_stats[unit_id],
            "总时长(分钟)": int(unit_time_stats[unit_id] / 60),
        }
        for unit_id in sorted(unit_time_stats.keys())
    ]

    print(f" [统计] 汇总统计数据生成完成,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")
    print(f" [统计] 生成了{len(component_stats_data)}条组件统计, {len(kp_stats_data)}条知识点统计, {len(unit_time_stats_data)}条单元时长统计")

    return (
        pd.DataFrame(component_stats_data),
        pd.DataFrame(kp_stats_data),
        pd.DataFrame(unit_time_stats_data)
    )
def get_date_str() -> str:
    """Return the current local date formatted as ``YYYYMMDD``."""
    today = datetime.datetime.now()
    return f"{today:%Y%m%d}"
id_2_unit_index) + + # 检查是否有有效数据 + total_records = len(sheet1_rows) + len(sheet2_rows) + len(sheet3_rows) + len(sheet4_rows) + len(sheet5_rows) + print(f" [统计] 数据汇总:") + print(f" - 全部音频数据: {len(sheet1_rows)}条") + print(f" - 互动组件学习记录: {len(sheet2_rows)}条") + print(f" - 课程巩固记录: {len(sheet3_rows)}条") + print(f" - 单元挑战记录: {len(sheet4_rows)}条") + print(f" - 单元总结记录: {len(sheet5_rows)}条") + print(f" - 总计: {total_records}条") + + if total_records == 0: + print(f"[WARN] 角色id={user_id} 没有找到任何有效记录,跳过导出") + return False + + # 生成汇总统计数据 + stats_component_df, stats_kp_df, stats_unit_time_df = generate_statistics(sheet2_rows, sheet5_rows) + + # 写入Excel + write_excel(output_path, sheet1_rows, sheet2_rows, sheet3_rows, sheet4_rows, sheet5_rows, stats_component_df, stats_kp_df, stats_unit_time_df) + + total_time = (datetime.datetime.now() - total_start_time).total_seconds() + print(f"[INFO] 角色id={user_id} 导出成功") + print(f"[INFO] 文件路径: {output_path}") + print(f"[INFO] 总耗时: {total_time:.2f}秒") + print(f"[INFO] ========== 完成 ==========\n") + return True + + except Exception as e: + print(f"[ERROR] 角色id={user_id} 导出失败: {e}") + import traceback + traceback.print_exc() + return False + + +def main(): + load_env() + + # 确定运行模式并收集需要导出的角色id列表 + user_id_list: List[tuple] = [] # [(user_id, account_id or None), ...] 
+ date_str = get_date_str() + + # 检查三种模式的配置 + has_user_id = USER_ID is not None + has_user_id_list = USER_ID_LIST is not None and len(USER_ID_LIST) > 0 + has_account_id_list = ACCOUNT_ID_LIST is not None and len(ACCOUNT_ID_LIST) > 0 + + # 验证只能配置一种模式 + mode_count = sum([has_user_id, has_user_id_list, has_account_id_list]) + if mode_count == 0: + raise RuntimeError("请配置 USER_ID、USER_ID_LIST 或 ACCOUNT_ID_LIST 中的一个") + if mode_count > 1: + raise RuntimeError("USER_ID、USER_ID_LIST、ACCOUNT_ID_LIST 只能配置一个,请检查配置") + + # 模式1:单个角色id + if has_user_id: + user_id_list = [(str(USER_ID), None)] + print(f"[INFO] 运行模式:单个角色id") + + # 模式2:角色id列表 + elif has_user_id_list: + user_id_list = [(str(uid), None) for uid in USER_ID_LIST] + print(f"[INFO] 运行模式:角色id列表,共{len(user_id_list)}个角色") + + # 模式3:账户id列表 + elif has_account_id_list: + print(f"[INFO] 运行模式:账户id列表,共{len(ACCOUNT_ID_LIST)}个账户") + mysql_conn = None + try: + mysql_conn = get_mysql_conn("vala_user") # 查询用户表,使用 vala_user 数据库 + for account_id in ACCOUNT_ID_LIST: + account_id_str = str(account_id) + print(f"[INFO] 查询账户id={account_id_str}对应的角色id...") + character_ids = fetch_character_ids_by_account(account_id_str, mysql_conn) + if not character_ids: + print(f"[WARN] 账户id={account_id_str} 未找到关联的角色id,跳过") + continue + print(f"[INFO] 账户id={account_id_str} 找到{len(character_ids)}个角色id: {character_ids}") + for cid in character_ids: + user_id_list.append((cid, account_id_str)) + finally: + if mysql_conn: + try: + mysql_conn.close() + except Exception: + pass + + if not user_id_list: + print("[WARN] 没有需要导出的角色id,程序退出") + return + + # 初始化连接 + es_cfg = get_es_config() + pg_conn = get_pg_conn() + + # 获取映射表(只需要查询一次,所有角色共用) + print(f"\n[INFO] ===== 准备工作:获取映射表 =====") + mysql_conn = None + id_2_unit_index = {} + chapter_id_to_lesson_id = {} + try: + print(f"[INFO] 正在连接MySQL数据库(vala_test)...") + mysql_conn = get_mysql_conn("vala_test") # 查询游戏配置表,使用 vala_test 数据库 + print(f"[INFO] 正在获取 story_id 到 unit_id 的映射...") + id_2_unit_index = 
get_id_2_unit_index(mysql_conn) + print(f"[INFO] 成功获取 {len(id_2_unit_index)} 个 story_id 映射") + print(f"[INFO] 正在获取 chapter_id 到 lesson_id 的映射...") + chapter_id_to_lesson_id = get_chapter_id_to_lesson_id(mysql_conn) + print(f"[INFO] 成功获取 {len(chapter_id_to_lesson_id)} 个 chapter_id 映射") + except Exception as e: + print(f"[ERROR] 获取映射表失败: {e}") + import traceback + traceback.print_exc() + if pg_conn: + try: + pg_conn.close() + except Exception: + pass + if mysql_conn: + try: + mysql_conn.close() + except Exception: + pass + return + + try: + # 统计信息 + success_count = 0 + skip_count = 0 + + print(f"\n[INFO] ===== 开始批量导出 =====") + print(f"[INFO] 共需导出{len(user_id_list)}个角色\n") + batch_start_time = datetime.datetime.now() + + # 循环处理每个角色id + for idx, (user_id, account_id) in enumerate(user_id_list, 1): + print(f"\n{'='*60}") + print(f"[INFO] 进度: {idx}/{len(user_id_list)} ({idx*100//len(user_id_list)}%)") + print(f"{'='*60}") + + # 生成输出文件名 + if account_id is None: + # 模式1和模式2:角色id_{}_导出时间_{}.xlsx + filename = f"角色id_{user_id}_导出时间_{date_str}.xlsx" + else: + # 模式3:账户id_{}_角色id_{}_导出时间_{}.xlsx + filename = f"账户id_{account_id}_角色id_{user_id}_导出时间_{date_str}.xlsx" + + output_path = os.path.join(OUTPUT_DIR, filename) + + # 导出单个角色的数据 + result = export_single_user(user_id, es_cfg, pg_conn, mysql_conn, output_path, id_2_unit_index, chapter_id_to_lesson_id) + if result: + success_count += 1 + else: + skip_count += 1 + + # 输出统计信息 + batch_total_time = (datetime.datetime.now() - batch_start_time).total_seconds() + print(f"\n{'='*60}") + print(f"[INFO] ===== 全部导出完成 =====") + print(f"[INFO] 总计: {len(user_id_list)}个角色") + print(f"[INFO] 成功: {success_count}个") + print(f"[INFO] 跳过: {skip_count}个") + print(f"[INFO] 总耗时: {batch_total_time:.2f}秒 ({batch_total_time/60:.2f}分钟)") + if success_count > 0: + print(f"[INFO] 平均每个角色: {batch_total_time/success_count:.2f}秒") + print(f"{'='*60}\n") + + finally: + if pg_conn: + try: + pg_conn.close() + except Exception: + pass + if mysql_conn: + try: + 
mysql_conn.close() + except Exception: + pass + + +if __name__ == "__main__": + main() diff --git a/export_user_id_data_debug.py b/export_user_id_data_debug.py new file mode 100644 index 0000000..4be3cb8 --- /dev/null +++ b/export_user_id_data_debug.py @@ -0,0 +1,1845 @@ +""" +初版需求v1.0: 2025.11.18 + +导出 一个userId的多表数据, 最终按照不同sheet,输出到一个 excel文件中。 + +1. 第一个sheet:"全部音频数据" +es相关配置通过以下环境变量 +ES_HOST=xxx +ES_PORT=9200 +ES_SCHEME=https +ES_USER=elastic +ES_PASSWORD=xxx + +index: user-audio + +脚本思路: +过滤字段: +userId == xxxx + +输出该userId的全部记录 按时间倒序排序 +包含以下字段内容: + +userId +userMsg +userName +soeData +audioUrl +asrStatus +componentId +componentType +dataVersion + +2. 第二个sheet:"互动组件学习记录" +在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 +数据库相关配置 从.env中读取: +PG_DB_HOST = xxx +PG_DB_PORT = xxx +PG_DB_USER = xxx +PG_DB_PASSWORD = xxx +PG_DB_DATABASE = xxx + +读取以下数据表: +user_component_play_record_0 ~ user_component_play_record_7 + +输出以下字段: +user_id, +component_unique_code, +session_id, +c_type, +c_id, +play_result, +user_behavior_info, +updated_at + +3.第三个sheet:"课程巩固记录" +在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 + +数据表:user_unit_review_question_result + +输出以下字段: +user_id +story_id +chapter_id +question_list +updated_at + +4.第四个sheet:"单元挑战记录" +在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 + +数据表:user_unit_challenge_question_result + +输出以下字段: +user_id +story_id +category +score_text, +question_list +updated_at +------------ + +需求补充v1.1: +"全部音频数据"这个sheet +输出字段 添加timeStr 并按时间倒序排列 最新的记录 在最上面 + +------------ +需求补充v1.2: +"全部音频数据"这个sheet +如果userMsg字段内容 包含 ”makee_id“ 要进行以下处理: + +从userMsg字段中提取出具体的makee_id: +此时的字段样例: +``` +asr msg信息为:{ + "time_ms": 358, + "time_ms_api": 357, + "hot_words_str": "{\n \"context_type\": \"dialog_ctx\",\n \"context_data\": [\n {\n \"text\": \"planet Walla\"\n },\n {\n \"text\": \"Walla\"\n }\n ]\n}", + "makee_id": "d208c617-902f-4f81-8255-b5fb73599546", + "volcano_fast_x_tt_logid": "202511151541355DF72BE5EBFE73795BFD", + "api_name": "volcano-fast" +} 
+``` +然后基于makee_id 去另一个表里查记录: index:llm_asr_log +将查询到的记录的 result_text 字段内容 回填到 userMsg。 +将source字段内容 输出 到 source。 + +如果userMsg字段内容 不包含 ”makee_id“ 保持之前的逻辑。 + +-------------- +需求补充 v1.3 +当前输入 只支持配置单个 userId (业务侧名称为角色id) + + +期望扩展为以下逻辑: +1. 改为配置 角色id list , 分别 导出 多份excel文件。命名格式为 角色id_{}_导出时间_{}.xlsx +2. 改为配置 账户id list , 分别 导出 多份excel文件。命名格式为 账户id_{}_角色id_{}_导出时间_{}.xlsx + +关于 账户 id 到角色id 的映射逻辑, +首先 读取 mysql 表 vala_app_character +筛选 account_id字段值 == 账户id 的 记录, 其中 该记录 的 id值,则为角色id 一个 账户id 可以对应多个角色id + +本次需求只针对输入侧调整, 数据抽取聚合逻辑部分和之前保持一致 + +--------------- +需求补充 v1.4 + +增加一个sheet "单元总结记录", +导出对应角色id的单元总结记录。 参考 export_unit_summary.py 中的原始数据提取方案即可(不必关注其中的数据统计部分)。 + +其他已有逻辑保持不动哦。 + +---------------- +需求补充 v1.5 + +1."互动组件学习记录"sheet 增加以下字段 +"互动组件名称"、"组件标题"、"组件配置摘要"、"知识点": +字段取值规则: +根据 c_type 及组件配置(从mysql表获取) 进行映射和处理: +``` +1).如果 c_type 开头为"mid" + +则读取下表:表名:middle_interaction_component + +获取以下字段值: +title (作为组件标题) +component_config (完整的组件配置) 获取其中 的 question 字段值 作为 组件配置摘要; +kp_relation_info 字段值 作为 知识点 + +"互动组件名称"规则: + +"物品互动": "mid_vocab_item", +"图片互动": "mid_vocab_image", +"填词互动": "mid_vocab_fillBlank", +"指令互动": "mid_vocab_instruction" +"对话互动-表达": "mid_sentence_dialogue", 且 component_config->question->mode == "express" +"对话互动-朗读": "mid_sentence_dialogue", 且 component_config->question->mode == "read" +"语音互动": "mid_sentence_voice", +"材料互动": "mid_sentence_material", +"造句互动": "mid_sentence_makeSentence" +"挖空互动": "mid_grammar_cloze", +"组句互动": "mid_grammar_sentence" +"发音互动": "mid_pron_pron" + + +2). 
如果 c_type 开头为"core" +则读取下表:表名:core_interaction_component + +获取以下字段值: +title (作为组件标题) +component_config (完整的组件配置) 获取其中 的 taskInfo 字段值 作为 组件配置摘要 +kp_relation_info 字段值 作为 知识点 + +"互动组件名称"规则: +"口语快答": "core_speaking_reply", +"口语妙问": "core_speaking_inquiry", +"口语探讨": "core_speaking_explore", +"口语独白": "core_speaking_monologue" +"合作阅读": "core_reading_order", +"合作听力": "core_listening_order", +"看图组句": "core_writing_imgMakeSentence", +"看图撰写": "core_writing_imgWrite", +"问题组句": "core_writing_questionMakeSentence", +"问题撰写": "core_writing_questionWrite", +``` + +2."课程巩固记录" sheet 增加以下字段 +"正确率": 参考 export_lesson_review.py 中的计算逻辑 + +3. 新增一个"汇总统计"sheet +统计并展示以下内容 请以 可读性 比较好的方式排列、展示 + +a. "所有互动-按互动组件类型-通过情况统计" +以每种"互动组件名称"进行聚合 +统计play_result的取值分布情况,算以下指标: +总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 + +b. "中互动组件-按知识点-通过情况统计" +以每个知识点进行聚合 + +其中 知识点配置格式如下: +``` +[{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_meaning","kpSkillName":"语义"}] +``` +一个组件可以绑定多个知识点,以每个知识点的 kpId + kpType + kpTitle 进行 展示及聚合 + +对所有绑定了某个知识点的中互动组件(c_type以mid开头) +统计play_result的取值分布情况,算以下指标: +总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 + +c. 
"单元总结-按单元统计时长" + +将"单元总结记录"中的"play_time_seconds"字段值 以每个单元id 进行聚合 进行 累加 统计,并增加一列 转换为分钟为单位 取整数 + + +""" +# ==== 可直接修改的脚本变量(不使用命令行传参) ==== +# 三种模式互斥,只能配置一个: +# 模式1:单个角色id +USER_ID = None # 单个角色ID,示例:2911 + +# 模式2:角色id列表(多个角色id批量导出) +USER_ID_LIST = None # 角色ID列表,示例:[2911, 2912, 2913] + +# 模式3:账户id列表(通过账户id查询对应的角色id后批量导出) +ACCOUNT_ID_LIST = [9343] # 账户ID列表,示例:[100, 101, 102] + +OUTPUT_DIR = "output/" # 输出目录,默认为output文件夹 +# ==== 变量结束 ==== +import os +import json +import re +from typing import Any, Dict, List, Optional + +import datetime + +try: + import requests +except Exception: + requests = None + +try: + import psycopg2 + from psycopg2.extras import RealDictCursor +except Exception: + psycopg2 = None + RealDictCursor = None + +try: + import pymysql + import pymysql.cursors +except Exception: + pymysql = None + +try: + import pandas as pd +except Exception: + pd = None + +try: + import urllib3 +except Exception: + urllib3 = None + + +SHEET1_COLUMNS = [ + "userId", + "userMsg", + "source", + "userName", + "soeData", + "audioUrl", + "asrStatus", + "componentId", + "componentType", + "dataVersion", + "timeStr", +] + +SHEET2_COLUMNS = [ + "user_id", + "component_unique_code", + "session_id", + "c_type", + "c_id", + "互动组件名称", + "组件标题", + "组件配置摘要", + "知识点", + "play_result", + "user_behavior_info", + "updated_at", +] + +SHEET3_COLUMNS = [ + "user_id", + "unit_id", + "lesson_id", + "question_list", + "正确率", + "updated_at", +] + +SHEET4_COLUMNS = [ + "user_id", + "unit_id", + "category", + "score_text", + "question_list", + "updated_at", +] + +SHEET5_COLUMNS = [ + "id", + "user_id", + "unit_id", + "updated_at", + "km_id", + "km_type", + "play_time_seconds", +] + + +def _load_env_file(path: str) -> None: + if not os.path.exists(path): + return + try: + with open(path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + if "=" not in line: + continue + k, v = line.split("=", 1) + k = k.strip() + v = 
v.strip().strip('"').strip("'") + if k and (os.getenv(k) is None): + os.environ[k] = v + except Exception: + pass + + +def load_env() -> None: + _load_env_file(os.path.join(os.getcwd(), ".env")) + _load_env_file(os.path.join(os.getcwd(), ".env.local")) + + +def to_json_str(v: Any) -> Any: + if isinstance(v, (dict, list)): + try: + return json.dumps(v, ensure_ascii=False) + except Exception: + return str(v) + return v + + +def parse_time(value: Any) -> Optional[datetime.datetime]: + if value is None: + return None + if isinstance(value, (int, float)): + try: + v = float(value) + # 兼容毫秒级时间戳 + if v > 1e11: + v = v / 1000.0 + return datetime.datetime.fromtimestamp(v) + except Exception: + return None + if isinstance(value, str): + fmts = [ + "%Y-%m-%dT%H:%M:%S.%fZ", + "%Y-%m-%dT%H:%M:%S.%f%z", + "%Y-%m-%dT%H:%M:%S%z", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d", + ] + for fmt in fmts: + try: + return datetime.datetime.strptime(value, fmt) + except Exception: + continue + try: + return datetime.datetime.fromisoformat(value) + except Exception: + return None + return None + + +def pick_time(source: Dict[str, Any]) -> Optional[datetime.datetime]: + candidates = [ + "updated_at", + "created_at", + "@timestamp", + "timestamp", + "updatedAt", + "createdAt", + "time", + "ts", + "timeStr", + "update_time", + "create_time", + ] + for key in candidates: + if key in source: + t = parse_time(source.get(key)) + if t is not None: + return t + # 宽松匹配:尝试扫描所有可能的时间相关字段 + for k, v in source.items(): + lk = str(k).lower() + if any(s in lk for s in ["time", "date", "_at", "timestamp"]): + t = parse_time(v) + if t is not None: + return t + return None + + +def extract_makee_id_from_user_msg(user_msg: Any) -> Optional[str]: + # 支持dict或字符串形式 + if isinstance(user_msg, dict): + mk = user_msg.get("makee_id") + if isinstance(mk, str) and mk: + return mk + if isinstance(user_msg, str) and user_msg: + # 1) 尝试整体解析为JSON + try: + obj = json.loads(user_msg) + mk = obj.get("makee_id") + if isinstance(mk, str) 
and mk: + return mk + except Exception: + pass + # 2) 尝试截取大括号中的JSON + try: + start = user_msg.find("{") + end = user_msg.rfind("}") + if start != -1 and end != -1 and end > start: + candidate = user_msg[start : end + 1] + obj = json.loads(candidate) + mk = obj.get("makee_id") + if isinstance(mk, str) and mk: + return mk + except Exception: + pass + # 3) 正则匹配 makee_id + m = re.search(r"\bmakee_id\b\s*:\s*\"([^\"]+)\"", user_msg) + if m: + return m.group(1) + return None + + +def fetch_es_asr_log(makee_id: str, es_cfg: Dict[str, Any]) -> Optional[Dict[str, Any]]: + if requests is None: + raise RuntimeError("缺少requests依赖,请安装后再运行。") + host = es_cfg.get("host") + port = es_cfg.get("port") + scheme = es_cfg.get("scheme", "http") + user = es_cfg.get("user") + password = es_cfg.get("password") + index = "llm_asr_log" + if not host: + return None + base = f"{scheme}://{host}:{port}" + url = f"{base}/{index}/_search" + headers = {"Content-Type": "application/json"} + body = { + "query": { + "bool": { + "should": [ + {"term": {"makee_id": {"value": str(makee_id)}}}, + {"term": {"makee_id.keyword": {"value": str(makee_id)}}}, + ], + "minimum_should_match": 1, + } + }, + "size": 10, + "_source": [ + "makee_id", + "result_text", + "source", + "updated_at", + "created_at", + "@timestamp", + "timestamp", + "updatedAt", + "createdAt", + "time", + "ts", + "timeStr", + "update_time", + "create_time", + ], + } + auth = (user, password) if user and password else None + try: + if scheme == "https" and urllib3 is not None: + try: + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + except Exception: + pass + resp = requests.post(url, headers=headers, json=body, auth=auth, timeout=20, verify=False if scheme == "https" else True) + resp.raise_for_status() + data = resp.json() + except Exception: + return None + hits = data.get("hits", {}).get("hits", []) + if not hits: + return None + # 选最新的 + chosen = None + best_t = None + for h in hits: + src = h.get("_source", {}) or 
{} + t = pick_time(src) + if t is None: + continue + if best_t is None or t > best_t: + best_t = t + chosen = src + if chosen is None: + # 如果都没有时间,选第一条 + chosen = (hits[0].get("_source", {}) or {}) + return chosen + + +def get_es_config() -> Dict[str, Any]: + return { + "host": os.getenv("ES_HOST"), + "port": os.getenv("ES_PORT", "9200"), + "scheme": os.getenv("ES_SCHEME", "http"), + "user": os.getenv("ES_USER"), + "password": os.getenv("ES_PASSWORD"), + "index": "user-audio", + } + + +def fetch_es_user_audio(user_id: str, es_cfg: Dict[str, Any]) -> List[Dict[str, Any]]: + if requests is None: + raise RuntimeError("缺少requests依赖,请安装后再运行。") + + print(f" [ES] 开始查询user-audio索引...") + start_time = datetime.datetime.now() + + host = es_cfg.get("host") + port = es_cfg.get("port") + scheme = es_cfg.get("scheme", "http") + user = es_cfg.get("user") + password = es_cfg.get("password") + index = es_cfg.get("index", "user-audio") + + if not host: + return [] + + base = f"{scheme}://{host}:{port}" + url = f"{base}/{index}/_search" + headers = {"Content-Type": "application/json"} + + body = { + "query": { + "bool": { + "should": [ + {"term": {"userId": {"value": str(user_id)}}}, + {"term": {"userId.keyword": {"value": str(user_id)}}}, + ], + "minimum_should_match": 1, + } + }, + "size": 10000, + "_source": [ + "userId", + "userMsg", + "userName", + "soeData", + "audioUrl", + "asrStatus", + "componentId", + "componentType", + "dataVersion", + "updated_at", + "created_at", + "@timestamp", + "timestamp", + "updatedAt", + "createdAt", + "time", + "ts", + "timeStr", + "update_time", + "create_time", + ], + } + + auth = (user, password) if user and password else None + + try: + # 抑制自签证书下的HTTPS不安全警告 + if scheme == "https" and urllib3 is not None: + try: + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + except Exception: + pass + resp = requests.post(url, headers=headers, json=body, auth=auth, timeout=30, verify=False if scheme == "https" else True) + 
resp.raise_for_status() + data = resp.json() + except Exception as e: + raise RuntimeError(f"ES查询失败: {e}") + + hits = data.get("hits", {}).get("hits", []) + print(f" [ES] 查询完成,获得{len(hits)}条记录,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") + + if not hits: + return [] + + print(f" [ES] 开始处理音频数据...") + process_start = datetime.datetime.now() + + rows: List[Dict[str, Any]] = [] + asr_cache: Dict[str, Dict[str, Any]] = {} + makee_id_count = 0 + + for idx, h in enumerate(hits, 1): + # 每处理100条显示一次进度 + if idx % 100 == 0 or idx == len(hits): + print(f" [ES] 处理进度: {idx}/{len(hits)} ({idx*100//len(hits)}%)") + + src = h.get("_source", {}) or {} + row = { + "userId": src.get("userId"), + "userMsg": src.get("userMsg"), + "source": None, + "userName": src.get("userName"), + "soeData": to_json_str(src.get("soeData")), + "audioUrl": src.get("audioUrl"), + "asrStatus": src.get("asrStatus"), + "componentId": src.get("componentId"), + "componentType": src.get("componentType"), + "dataVersion": src.get("dataVersion"), + } + t = pick_time(src) + row["_time"] = t.isoformat() if t else None + row["timeStr"] = t.strftime("%Y-%m-%d %H:%M:%S") if t else None + # v1.2: 当userMsg包含makee_id时,补充查询llm_asr_log并回填 + mk = extract_makee_id_from_user_msg(row.get("userMsg")) + if mk: + makee_id_count += 1 + asr_doc = asr_cache.get(mk) + if asr_doc is None: + asr_doc = fetch_es_asr_log(mk, es_cfg) + if asr_doc is not None: + asr_cache[mk] = asr_doc + if asr_doc is not None: + rt = asr_doc.get("result_text") + if rt: + row["userMsg"] = rt + row["source"] = to_json_str(asr_doc.get("source")) + rows.append(row) + + print(f" [ES] 数据处理完成,发现{makee_id_count}条包含makee_id的记录,耗时{(datetime.datetime.now() - process_start).total_seconds():.2f}秒") + + print(f" [ES] 开始排序...") + rows.sort(key=lambda x: parse_time(x.get("_time")) or datetime.datetime.min, reverse=True) + print(f" [ES] 音频数据处理完成,总耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") + + return rows + + +def get_pg_conn() -> 
Any: + if psycopg2 is None: + raise RuntimeError("缺少psycopg2依赖,请安装后再运行。") + host = os.getenv("PG_DB_HOST") + port = int(os.getenv("PG_DB_PORT", "5432")) + user = os.getenv("PG_DB_USER") + password = os.getenv("PG_DB_PASSWORD") + dbname = os.getenv("PG_DB_DATABASE") + if not host or not dbname: + raise RuntimeError("PG数据库环境变量未配置完整") + conn = psycopg2.connect(host=host, port=port, user=user, password=password, dbname=dbname) + return conn + + +def get_mysql_conn(database: str) -> Any: + """ + 获取MySQL数据库连接 + + Args: + database: 数据库名,可选值:'vala_user' 或 'vala_test' + vala_user 使用 online 配置(环境变量后缀 _online) + vala_test 使用默认配置 + + Returns: + MySQL连接对象 + """ + if pymysql is None: + raise RuntimeError("缺少pymysql依赖,请安装后再运行。") + + # 根据数据库选择不同的环境变量配置 + if database == "vala_user": + # vala_user 数据库使用 online 配置 + host = os.getenv("MYSQL_HOST_online") + port = int(os.getenv("MYSQL_PORT_online", "3306")) + user = os.getenv("MYSQL_USERNAME_online") + password = os.getenv("MYSQL_PASSWORD_online") + if not host: + raise RuntimeError("MySQL数据库环境变量未配置完整(缺少MYSQL_HOST_online)") + else: + # vala_test 等其他数据库使用默认配置 + host = os.getenv("MYSQL_HOST") + port = int(os.getenv("MYSQL_PORT", "3306")) + user = os.getenv("MYSQL_USERNAME") + password = os.getenv("MYSQL_PASSWORD") + if not host: + raise RuntimeError("MySQL数据库环境变量未配置完整(缺少MYSQL_HOST)") + + conn = pymysql.connect( + host=host, + port=port, + user=user, + password=password, + database=database, # 直接使用传入的数据库名 + charset="utf8mb4", + cursorclass=pymysql.cursors.DictCursor, + ) + return conn + + +def get_id_2_unit_index(conn: Any) -> Dict[int, int]: + """ + 从MySQL获取 story_id 到 unit_id 的映射关系 + + Args: + conn: MySQL数据库连接 + + Returns: + 映射字典 {story_id: unit_id} + """ + sql = """ + SELECT * + FROM `vala_game_info` + WHERE id > 0 + AND `vala_game_info`.`deleted_at` IS NULL + ORDER BY season_package_id asc, `index` asc + """ + try: + with conn.cursor() as cur: + cur.execute(sql) + rows = cur.fetchall() or [] + # 构建映射表:按查询结果的顺序,索引即为unit_id + 
id_2_unit_index = {} + for index, row in enumerate(rows): + id_2_unit_index[row["id"]] = index + return id_2_unit_index + except Exception as e: + print(f"[ERROR] 获取story_id到unit_id映射失败: {e}") + return {} + + +def get_chapter_id_to_lesson_id(conn: Any) -> Dict[int, int]: + """ + 从MySQL获取 chapter_id 到 lesson_id 的映射关系 + + Args: + conn: MySQL数据库连接 + + Returns: + 映射字典 {chapter_id: lesson_id} + """ + sql = """ + SELECT id, `index` + FROM `vala_game_chapter` + WHERE deleted_at IS NULL + """ + try: + with conn.cursor() as cur: + cur.execute(sql) + rows = cur.fetchall() or [] + # 构建映射表:chapter的index字段即为lesson_id + chapter_id_to_lesson_id = {} + for row in rows: + chapter_id_to_lesson_id[row["id"]] = row["index"] + return chapter_id_to_lesson_id + except Exception as e: + print(f"[ERROR] 获取chapter_id到lesson_id映射失败: {e}") + return {} + + +# 组件类型到组件名称的映射 +COMPONENT_TYPE_NAMES = { + "mid_vocab_item": "物品互动", + "mid_vocab_image": "图片互动", + "mid_vocab_fillBlank": "填词互动", + "mid_vocab_instruction": "指令互动", + "mid_sentence_dialogue": "对话互动", # 需要根据mode进一步判断 + "mid_sentence_voice": "语音互动", + "mid_sentence_material": "材料互动", + "mid_sentence_makeSentence": "造句互动", + "mid_grammar_cloze": "挖空互动", + "mid_grammar_sentence": "组句互动", + "mid_pron_pron": "发音互动", + "core_speaking_reply": "口语快答", + "core_speaking_inquiry": "口语妙问", + "core_speaking_explore": "口语探讨", + "core_speaking_monologue": "口语独白", + "core_reading_order": "合作阅读", + "core_listening_order": "合作听力", + "core_writing_imgMakeSentence": "看图组句", + "core_writing_imgWrite": "看图撰写", + "core_writing_questionMakeSentence": "问题组句", + "core_writing_questionWrite": "问题撰写", +} + + +def get_component_name(c_type: str, component_config: Optional[Dict[str, Any]]) -> str: + """ + 根据c_type和组件配置获取组件名称 + + Args: + c_type: 组件类型 + component_config: 组件配置(用于判断对话互动的mode) + + Returns: + 组件名称 + """ + if not c_type: + return "" + + # 特殊处理:对话互动需要根据mode判断 + if c_type == "mid_sentence_dialogue" and component_config: + try: + question = 
component_config.get("question", {}) + mode = question.get("mode", "") + if mode == "express": + return "对话互动-表达" + elif mode == "read": + return "对话互动-朗读" + except Exception: + pass + + return COMPONENT_TYPE_NAMES.get(c_type, "") + + +def batch_fetch_component_configs(play_records: List[Dict[str, Any]], mysql_conn: Any) -> Dict[str, Dict[str, Any]]: + """ + 批量查询组件配置信息 + + Args: + play_records: 播放记录列表 + mysql_conn: MySQL连接 + + Returns: + 组件配置映射 {c_type_c_id: {title, component_config, kp_relation_info}} + """ + print(f" [MySQL] 开始批量查询组件配置...") + start_time = datetime.datetime.now() + + # 收集需要查询的c_type和c_id + mid_c_ids = set() + core_c_ids = set() + mid_type_id_pairs = [] # 用于调试日志 + core_type_id_pairs = [] + + for record in play_records: + c_type = record.get("c_type", "") + c_id = record.get("c_id") + if c_type and c_id: + if c_type.startswith("mid"): + mid_c_ids.add(c_id) + mid_type_id_pairs.append((c_type, c_id)) + elif c_type.startswith("core"): + core_c_ids.add(c_id) + core_type_id_pairs.append((c_type, c_id)) + + print(f" [MySQL] 需要查询中互动组件: {len(mid_c_ids)}个, 核心互动组件: {len(core_c_ids)}个") + if mid_c_ids: + print(f" [MySQL] 中互动组件ID列表(前10个): {sorted(list(mid_c_ids))[:10]}") + if core_c_ids: + print(f" [MySQL] 核心互动组件ID列表(前10个): {sorted(list(core_c_ids))[:10]}") + + config_map = {} + + # 批量查询middle_interaction_component + if mid_c_ids: + try: + with mysql_conn.cursor() as cur: + placeholders = ','.join(['%s'] * len(mid_c_ids)) + sql = f""" + SELECT c_id, c_type, title, component_config, kp_relation_info + FROM middle_interaction_component + WHERE c_id IN ({placeholders}) AND deleted_at IS NULL + """ + print(f" [MySQL] 执行中互动组件查询,查询条件: c_id IN ({len(mid_c_ids)}个ID)") + cur.execute(sql, tuple(mid_c_ids)) + rows = cur.fetchall() or [] + print(f" [MySQL] 查询到{len(rows)}条中互动组件配置") + + if len(rows) == 0 and len(mid_c_ids) > 0: + print(f" [MySQL] [警告] 查询结果为空!可能的原因:") + print(f" [MySQL] - 数据库中没有匹配的c_id记录") + print(f" [MySQL] - deleted_at字段不为NULL") + print(f" [MySQL] - 
c_id不存在") + + for idx, row in enumerate(rows): + c_type = row.get("c_type", "") + c_id = row.get("c_id") + key = f"{c_type}_{c_id}" + + if idx < 3: # 输出前3条的详细信息 + print(f" [MySQL] [样例{idx+1}] id={c_id}, c_type={c_type}, key={key}") + print(f" [MySQL] [样例{idx+1}] title={row.get('title', '')[:50]}") + + # 解析component_config + component_config = row.get("component_config") + if isinstance(component_config, str): + try: + component_config = json.loads(component_config) + except Exception as e: + print(f" [MySQL] [警告] 解析component_config失败 (id={c_id}): {e}") + component_config = {} + + # 提取question字段作为摘要 + summary = "" + if isinstance(component_config, dict): + question = component_config.get("question") + summary = to_json_str(question) if question else "" + if idx < 3 and question: + print(f" [MySQL] [样例{idx+1}] 提取到question字段,长度: {len(summary)}") + + # 解析kp_relation_info + kp_relation_info = row.get("kp_relation_info") + if isinstance(kp_relation_info, str): + try: + kp_relation_info = json.loads(kp_relation_info) + except Exception: + kp_relation_info = [] + + config_map[key] = { + "title": row.get("title", ""), + "component_config": component_config, + "summary": summary, + "kp_relation_info": to_json_str(kp_relation_info), + } + + print(f" [MySQL] 中互动组件配置已加入config_map,当前map大小: {len(config_map)}") + except Exception as e: + print(f" [MySQL] [错误] 查询中互动组件配置失败: {e}") + import traceback + traceback.print_exc() + + # 批量查询core_interaction_component + if core_c_ids: + try: + with mysql_conn.cursor() as cur: + placeholders = ','.join(['%s'] * len(core_c_ids)) + sql = f""" + SELECT c_id, c_type, title, component_config, kp_relation_info + FROM core_interaction_component + WHERE c_id IN ({placeholders}) AND deleted_at IS NULL + """ + print(f" [MySQL] 执行核心互动组件查询,查询条件: c_id IN ({len(core_c_ids)}个ID)") + cur.execute(sql, tuple(core_c_ids)) + rows = cur.fetchall() or [] + print(f" [MySQL] 查询到{len(rows)}条核心互动组件配置") + + if len(rows) == 0 and len(core_c_ids) > 0: + print(f" [MySQL] 
[警告] 查询结果为空!可能的原因:") + print(f" [MySQL] - 数据库中没有匹配的c_id记录") + print(f" [MySQL] - deleted_at字段不为NULL") + print(f" [MySQL] - c_id不存在") + + for idx, row in enumerate(rows): + c_type = row.get("c_type", "") + c_id = row.get("c_id") + key = f"{c_type}_{c_id}" + + if idx < 3: # 输出前3条的详细信息 + print(f" [MySQL] [样例{idx+1}] id={c_id}, c_type={c_type}, key={key}") + print(f" [MySQL] [样例{idx+1}] title={row.get('title', '')[:50]}") + + # 解析component_config + component_config = row.get("component_config") + if isinstance(component_config, str): + try: + component_config = json.loads(component_config) + except Exception as e: + print(f" [MySQL] [警告] 解析component_config失败 (id={c_id}): {e}") + component_config = {} + + # 提取taskInfo字段作为摘要 + summary = "" + if isinstance(component_config, dict): + task_info = component_config.get("taskInfo") + summary = to_json_str(task_info) if task_info else "" + if idx < 3 and task_info: + print(f" [MySQL] [样例{idx+1}] 提取到taskInfo字段,长度: {len(summary)}") + + # 解析kp_relation_info + kp_relation_info = row.get("kp_relation_info") + if isinstance(kp_relation_info, str): + try: + kp_relation_info = json.loads(kp_relation_info) + except Exception: + kp_relation_info = [] + + config_map[key] = { + "title": row.get("title", ""), + "component_config": component_config, + "summary": summary, + "kp_relation_info": to_json_str(kp_relation_info), + } + + print(f" [MySQL] 核心互动组件配置已加入config_map,当前map大小: {len(config_map)}") + except Exception as e: + print(f" [MySQL] [错误] 查询核心互动组件配置失败: {e}") + import traceback + traceback.print_exc() + + print(f" [MySQL] 组件配置查询完成,共{len(config_map)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") + return config_map + + +def calculate_accuracy(question_list: Any) -> float: + """ + 计算问题列表的正确率 + + Args: + question_list: 问题列表(可能是JSON字符串或list) + + Returns: + 正确率(百分比,保留2位小数) + """ + try: + if isinstance(question_list, str): + question_list = json.loads(question_list) + + if not isinstance(question_list, list) or 
len(question_list) == 0: + return 0.0 + + total = len(question_list) + correct = sum(1 for q in question_list if q.get('isRight') == True) + accuracy = round(correct / total * 100, 2) if total > 0 else 0.0 + + return accuracy + except Exception: + return 0.0 + + + +def fetch_character_ids_by_account(account_id: str, conn: Any) -> List[str]: + """根据账户id查询对应的角色id列表""" + sql = "SELECT id FROM vala_app_character WHERE account_id = %s" + try: + with conn.cursor() as cur: + cur.execute(sql, (account_id,)) + rows = cur.fetchall() or [] + return [str(row["id"]) for row in rows if row.get("id")] + except Exception as e: + print(f"[ERROR] 查询账户id={account_id}的角色id失败: {e}") + return [] + + +def fetch_pg_play_records(user_id: str, conn: Any, mysql_conn: Any) -> List[Dict[str, Any]]: + """ + 查询互动组件学习记录并补充组件配置信息 + + Args: + user_id: 用户ID(角色ID) + conn: PostgreSQL数据库连接 + mysql_conn: MySQL数据库连接 + + Returns: + 互动组件学习记录列表 + """ + print(f" [PG] 开始查询互动组件学习记录(8张分表)...") + start_time = datetime.datetime.now() + + tables = [f"user_component_play_record_{i}" for i in range(8)] + rows: List[Dict[str, Any]] = [] + with conn.cursor(cursor_factory=RealDictCursor) as cur: + for t in tables: + try: + sql = f""" + SELECT user_id, component_unique_code, session_id, c_type, c_id, + play_result, user_behavior_info, updated_at + FROM {t} + WHERE user_id = %s + ORDER BY updated_at DESC + """ + print(f" [PG_DEBUG] 准备查询表 {t},SQL:{sql.strip()},参数:{user_id}") + cur.execute(sql, (user_id,)) + part = cur.fetchall() or [] + if part: + print(f" [PG] 表{t}查到{len(part)}条记录") + for r in part: + r = dict(r) + r["play_result"] = to_json_str(r.get("play_result")) + r["user_behavior_info"] = to_json_str(r.get("user_behavior_info")) + # 将带时区的时间转换为无时区,避免Excel写入报错 + upd = r.get("updated_at") + if isinstance(upd, datetime.datetime): + try: + if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: + r["updated_at"] = upd.replace(tzinfo=None) + except Exception: + # 回退为字符串 + r["updated_at"] = str(upd) + 
rows.append(r) + except Exception as e: + print(f" [PG] 表{t}查询失败: {e}") + continue + + rows.sort(key=lambda x: parse_time(x.get("updated_at")) or datetime.datetime.min, reverse=True) + print(f" [PG] 互动组件学习记录查询完成,共{len(rows)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") + + # 批量查询组件配置 + if rows and mysql_conn: + config_map = batch_fetch_component_configs(rows, mysql_conn) + + # 补充组件信息 + print(f" [PG] 开始补充组件配置信息...") + filled_count = 0 + empty_count = 0 + sample_keys = [] + sample_mode_check = [] # 检查对话互动的mode + + for r in rows: + c_type = r.get("c_type", "") + c_id = r.get("c_id") + key = f"{c_type}_{c_id}" if c_type and c_id else "" + + config = config_map.get(key, {}) + component_config = config.get("component_config", {}) + + component_name = get_component_name(c_type, component_config) + r["互动组件名称"] = component_name + r["组件标题"] = config.get("title", "") + r["组件配置摘要"] = config.get("summary", "") + r["知识点"] = config.get("kp_relation_info", "") + + # 统计填充情况 + if config: + filled_count += 1 + if len(sample_keys) < 3: + sample_keys.append((key, component_name, r["组件标题"][:30] if r["组件标题"] else "")) + + # 检查对话互动的mode + if c_type == "mid_sentence_dialogue" and len(sample_mode_check) < 3: + mode = "" + if isinstance(component_config, dict): + question = component_config.get("question", {}) + if isinstance(question, dict): + mode = question.get("mode", "") + sample_mode_check.append({ + "key": key, + "mode": mode, + "component_name": component_name + }) + else: + empty_count += 1 + if empty_count <= 5: # 输出前5个未匹配的key + print(f" [PG] [警告] 未找到组件配置: key={key}") + + print(f" [PG] 组件配置信息补充完成") + print(f" [PG] 匹配到配置: {filled_count}条, 未匹配: {empty_count}条") + if sample_keys: + print(f" [PG] 样例数据(前3条):") + for key, name, title in sample_keys: + print(f" [PG] - key={key}, 名称={name}, 标题={title}") + + if sample_mode_check: + print(f" [PG] 对话互动mode检查(前3条):") + for s in sample_mode_check: + print(f" [PG] - key={s['key']}, mode={s['mode']}, 
最终名称={s['component_name']}") + + return rows + + +def fetch_pg_unit_review(user_id: str, conn: Any, id_2_unit_index: Dict[int, int], chapter_id_to_lesson_id: Dict[int, int]) -> List[Dict[str, Any]]: + """ + 查询课程巩固记录 + + Args: + user_id: 用户ID(角色ID) + conn: PostgreSQL数据库连接 + id_2_unit_index: story_id到unit_id的映射字典 + chapter_id_to_lesson_id: chapter_id到lesson_id的映射字典 + + Returns: + 课程巩固记录列表 + """ + print(f" [PG] 开始查询课程巩固记录...") + start_time = datetime.datetime.now() + + sql = ( + "SELECT user_id, story_id, chapter_id, question_list, updated_at " + "FROM user_unit_review_question_result WHERE user_id = %s ORDER BY updated_at DESC" + ) + with conn.cursor(cursor_factory=RealDictCursor) as cur: + try: + cur.execute(sql, (user_id,)) + rows = cur.fetchall() or [] + except Exception as e: + print(f" [PG] 课程巩固记录查询失败: {e}") + rows = [] + out: List[Dict[str, Any]] = [] + for r in rows: + d = dict(r) + + # 映射 story_id 到 unit_id + story_id = d.get("story_id") + unit_id = id_2_unit_index.get(story_id) if story_id else None + d["unit_id"] = unit_id + + # 映射 chapter_id 到 lesson_id + chapter_id = d.get("chapter_id") + lesson_id = chapter_id_to_lesson_id.get(chapter_id) if chapter_id else None + d["lesson_id"] = lesson_id + + # 计算正确率 + question_list = d.get("question_list") + d["正确率"] = calculate_accuracy(question_list) + + d["question_list"] = to_json_str(question_list) + upd = d.get("updated_at") + if isinstance(upd, datetime.datetime): + try: + if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: + d["updated_at"] = upd.replace(tzinfo=None) + except Exception: + d["updated_at"] = str(upd) + out.append(d) + + print(f" [PG] 课程巩固记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") + return out + + +def fetch_pg_unit_challenge(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]: + """ + 查询单元挑战记录 + + Args: + user_id: 用户ID(角色ID) + conn: PostgreSQL数据库连接 + id_2_unit_index: story_id到unit_id的映射字典 + + Returns: + 
单元挑战记录列表 + """ + print(f" [PG] 开始查询单元挑战记录...") + start_time = datetime.datetime.now() + + sql = ( + "SELECT user_id, story_id, category, score_text, question_list, updated_at " + "FROM user_unit_challenge_question_result WHERE user_id = %s ORDER BY updated_at DESC" + ) + with conn.cursor(cursor_factory=RealDictCursor) as cur: + try: + cur.execute(sql, (user_id,)) + rows = cur.fetchall() or [] + except Exception as e: + print(f" [PG] 单元挑战记录查询失败: {e}") + rows = [] + out: List[Dict[str, Any]] = [] + for r in rows: + d = dict(r) + + # 映射 story_id 到 unit_id + story_id = d.get("story_id") + unit_id = id_2_unit_index.get(story_id) if story_id else None + d["unit_id"] = unit_id + + d["question_list"] = to_json_str(d.get("question_list")) + upd = d.get("updated_at") + if isinstance(upd, datetime.datetime): + try: + if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: + d["updated_at"] = upd.replace(tzinfo=None) + except Exception: + d["updated_at"] = str(upd) + out.append(d) + + print(f" [PG] 单元挑战记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") + return out + + +def fetch_pg_unit_summary(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]: + """ + 查询单元总结知识点结果数据 + + Args: + user_id: 用户ID(角色ID) + conn: PostgreSQL数据库连接 + id_2_unit_index: story_id到unit_id的映射字典 + + Returns: + 单元总结记录列表 + """ + print(f" [PG] 开始查询单元总结记录...") + start_time = datetime.datetime.now() + + sql = ( + "SELECT id, user_id, story_id, updated_at, km_id, km_type, play_time " + "FROM user_unit_summary_km_result WHERE user_id = %s AND deleted_at IS NULL ORDER BY updated_at DESC" + ) + with conn.cursor(cursor_factory=RealDictCursor) as cur: + try: + cur.execute(sql, (user_id,)) + rows = cur.fetchall() or [] + except Exception as e: + print(f" [PG] 单元总结记录查询失败: {e}") + rows = [] + + out: List[Dict[str, Any]] = [] + for r in rows: + d = dict(r) + # 映射 story_id 到 unit_id + story_id = d.get("story_id") + unit_id = 
id_2_unit_index.get(story_id) if story_id else None + d["unit_id"] = unit_id + + # 转换 play_time (毫秒) 为秒 (整数) + play_time = d.get("play_time") + d["play_time_seconds"] = play_time // 1000 if play_time else 0 + + # 移除时区信息 + upd = d.get("updated_at") + if isinstance(upd, datetime.datetime): + try: + if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: + d["updated_at"] = upd.replace(tzinfo=None) + except Exception: + d["updated_at"] = str(upd) + out.append(d) + + print(f" [PG] 单元总结记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") + return out + + +def generate_statistics(sheet2_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]]) -> tuple: + """ + 生成汇总统计数据 + + Args: + sheet2_rows: 互动组件学习记录 + sheet5_rows: 单元总结记录 + + Returns: + (组件统计DataFrame, 知识点统计DataFrame, 单元时长统计DataFrame) + """ + if pd is None: + raise RuntimeError("缺少pandas依赖,请安装后再运行。") + + print(f" [统计] 开始生成汇总统计数据...") + start_time = datetime.datetime.now() + + from collections import defaultdict + + # ============ a. 
所有互动-按互动组件类型-通过情况统计 ============ + component_stats_data = [] + component_stats = defaultdict(lambda: {"Perfect": 0, "Good": 0, "Failed": 0, "Pass": 0, "Oops": 0, "total": 0}) + + # 用于调试 + sample_results = [] + parse_error_count = 0 + + for idx, record in enumerate(sheet2_rows): + component_name = record.get("互动组件名称", "") + if not component_name: + continue + + play_result_str = record.get("play_result", "") + + # 解析play_result + result = "" + try: + # 先判断是否是简单的字符串(Perfect/Good/Failed/Pass/Oops) + if isinstance(play_result_str, str): + # 去除空格后检查 + stripped = play_result_str.strip() + if stripped in ["Perfect", "Good", "Failed", "Pass", "Oops"]: + # 直接使用 + result = stripped + else: + # 尝试JSON解析 + try: + play_result = json.loads(play_result_str) + if isinstance(play_result, dict): + result = play_result.get("result", "") + else: + result = "" + except: + result = "" + else: + # 如果不是字符串,尝试当dict处理 + if isinstance(play_result_str, dict): + result = play_result_str.get("result", "") + else: + result = "" + + # 收集前3个样例 + if idx < 3: + sample_results.append({ + "component": component_name, + "raw": str(play_result_str)[:100], + "result": result + }) + except Exception as e: + parse_error_count += 1 + if parse_error_count <= 3: + print(f" [统计] [警告] 解析play_result失败 (第{idx+1}条): {e}, 原始值: {str(play_result_str)[:100]}") + result = "" + + component_stats[component_name]["total"] += 1 + if result in ["Perfect", "Good", "Failed", "Pass", "Oops"]: + component_stats[component_name][result] += 1 + + print(f" [统计] play_result解析样例(前3条):") + for s in sample_results: + print(f" [统计] - 组件: {s['component']}, 结果: {s['result']}, 原始: {s['raw']}") + if parse_error_count > 0: + print(f" [统计] play_result解析失败总数: {parse_error_count}") + + # 生成统计数据行 + for component_name in sorted(component_stats.keys()): + stats = component_stats[component_name] + total = stats["total"] + perfect = stats["Perfect"] + good = stats["Good"] + failed = stats["Failed"] + pass_count = stats["Pass"] + oops = stats["Oops"] 
+ + perfect_ratio = round(perfect / total * 100, 2) if total > 0 else 0 + good_ratio = round(good / total * 100, 2) if total > 0 else 0 + failed_ratio = round(failed / total * 100, 2) if total > 0 else 0 + pass_ratio = round(pass_count / total * 100, 2) if total > 0 else 0 + oops_ratio = round(oops / total * 100, 2) if total > 0 else 0 + + component_stats_data.append({ + "互动组件名称": component_name, + "总数量": total, + "Perfect数量": perfect, + "Good数量": good, + "Failed数量": failed, + "Pass数量": pass_count, + "Oops数量": oops, + "Perfect比例(%)": perfect_ratio, + "Good比例(%)": good_ratio, + "Failed比例(%)": failed_ratio, + "Pass比例(%)": pass_ratio, + "Oops比例(%)": oops_ratio, + }) + + # ============ b. 中互动组件-按知识点-通过情况统计 ============ + kp_stats_data = [] + kp_stats = defaultdict(lambda: {"Perfect": 0, "Good": 0, "Failed": 0, "Pass": 0, "Oops": 0, "total": 0}) + + # 调试信息 + mid_count = 0 + has_kp_count = 0 + sample_kp_records = [] + + for idx, record in enumerate(sheet2_rows): + c_type = record.get("c_type", "") + if not c_type or not c_type.startswith("mid"): + continue + + mid_count += 1 + kp_relation_info_str = record.get("知识点", "") + + if not kp_relation_info_str: + continue + + has_kp_count += 1 + + # 解析知识点 + try: + if isinstance(kp_relation_info_str, str): + kp_relation_info = json.loads(kp_relation_info_str) + else: + kp_relation_info = kp_relation_info_str + + if not isinstance(kp_relation_info, list): + continue + + # 收集样例 + if len(sample_kp_records) < 3: + sample_kp_records.append({ + "c_type": c_type, + "kp_count": len(kp_relation_info), + "kp_info": str(kp_relation_info)[:200] + }) + + # 解析play_result(使用相同的逻辑) + play_result_str = record.get("play_result", "") + result = "" + if isinstance(play_result_str, str): + stripped = play_result_str.strip() + if stripped in ["Perfect", "Good", "Failed", "Pass", "Oops"]: + result = stripped + else: + try: + play_result = json.loads(play_result_str) + if isinstance(play_result, dict): + result = play_result.get("result", "") + except: 
+ pass + elif isinstance(play_result_str, dict): + result = play_result_str.get("result", "") + + # 为每个知识点统计 + for kp in kp_relation_info: + if not isinstance(kp, dict): + continue + + kp_id = kp.get("kpId", "") + kp_type = kp.get("kpType", "") + kp_title = kp.get("kpTitle", "") + + if not kp_id: + continue + + kp_key = f"{kp_id}|{kp_type}|{kp_title}" + kp_stats[kp_key]["total"] += 1 + if result in ["Perfect", "Good", "Failed", "Pass", "Oops"]: + kp_stats[kp_key][result] += 1 + + except Exception as e: + if len(sample_kp_records) < 5: + print(f" [统计] [警告] 解析知识点失败: {e}, 原始值: {str(kp_relation_info_str)[:100]}") + continue + + print(f" [统计] 中互动组件统计: 总数={mid_count}, 有知识点={has_kp_count}, 知识点条目数={len(kp_stats)}") + if sample_kp_records: + print(f" [统计] 知识点样例(前3条):") + for s in sample_kp_records: + print(f" [统计] - c_type={s['c_type']}, 知识点数量={s['kp_count']}, 内容={s['kp_info']}") + + # 生成知识点统计数据行 + for kp_key in sorted(kp_stats.keys()): + parts = kp_key.split("|") + if len(parts) != 3: + continue + + kp_id, kp_type, kp_title = parts + stats = kp_stats[kp_key] + total = stats["total"] + perfect = stats["Perfect"] + good = stats["Good"] + failed = stats["Failed"] + pass_count = stats["Pass"] + oops = stats["Oops"] + + perfect_ratio = round(perfect / total * 100, 2) if total > 0 else 0 + good_ratio = round(good / total * 100, 2) if total > 0 else 0 + failed_ratio = round(failed / total * 100, 2) if total > 0 else 0 + pass_ratio = round(pass_count / total * 100, 2) if total > 0 else 0 + oops_ratio = round(oops / total * 100, 2) if total > 0 else 0 + + kp_stats_data.append({ + "知识点ID": kp_id, + "知识点类型": kp_type, + "知识点标题": kp_title, + "总数量": total, + "Perfect数量": perfect, + "Good数量": good, + "Failed数量": failed, + "Pass数量": pass_count, + "Oops数量": oops, + "Perfect比例(%)": perfect_ratio, + "Good比例(%)": good_ratio, + "Failed比例(%)": failed_ratio, + "Pass比例(%)": pass_ratio, + "Oops比例(%)": oops_ratio, + }) + + # ============ c. 
单元总结-按单元统计时长 ============ + unit_time_stats_data = [] + unit_time_stats = defaultdict(int) + + for record in sheet5_rows: + unit_id = record.get("unit_id") + play_time_seconds = record.get("play_time_seconds", 0) + + if unit_id is not None: + unit_time_stats[unit_id] += play_time_seconds + + # 生成单元时长统计数据行 + for unit_id in sorted(unit_time_stats.keys()): + total_seconds = unit_time_stats[unit_id] + total_minutes = int(total_seconds / 60) + + unit_time_stats_data.append({ + "单元ID": f"unit_{unit_id}", + "总时长(秒)": total_seconds, + "总时长(分钟)": total_minutes, + }) + + print(f" [统计] 汇总统计数据生成完成,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") + print(f" [统计] 生成了{len(component_stats_data)}条组件统计, {len(kp_stats_data)}条知识点统计, {len(unit_time_stats_data)}条单元时长统计") + + return ( + pd.DataFrame(component_stats_data), + pd.DataFrame(kp_stats_data), + pd.DataFrame(unit_time_stats_data) + ) + + + +def write_excel(path: str, sheet1_rows: List[Dict[str, Any]], sheet2_rows: List[Dict[str, Any]], sheet3_rows: List[Dict[str, Any]], sheet4_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]], stats_component_df: Any, stats_kp_df: Any, stats_unit_time_df: Any) -> None: + if pd is None: + raise RuntimeError("缺少pandas依赖,请安装后再运行。") + + print(f" [Excel] 开始写入Excel文件: {path}") + start_time = datetime.datetime.now() + + out_dir = os.path.dirname(path) or "." 
+ os.makedirs(out_dir, exist_ok=True) + with pd.ExcelWriter(path, engine="openpyxl") as writer: + pd.DataFrame(sheet1_rows, columns=SHEET1_COLUMNS).to_excel(writer, sheet_name="全部音频数据", index=False) + pd.DataFrame(sheet2_rows, columns=SHEET2_COLUMNS).to_excel(writer, sheet_name="互动组件学习记录", index=False) + pd.DataFrame(sheet3_rows, columns=SHEET3_COLUMNS).to_excel(writer, sheet_name="课程巩固记录", index=False) + pd.DataFrame(sheet4_rows, columns=SHEET4_COLUMNS).to_excel(writer, sheet_name="单元挑战记录", index=False) + pd.DataFrame(sheet5_rows, columns=SHEET5_COLUMNS).to_excel(writer, sheet_name="单元总结记录", index=False) + stats_component_df.to_excel(writer, sheet_name="统计-互动组件通过情况", index=False) + stats_kp_df.to_excel(writer, sheet_name="统计-知识点通过情况", index=False) + stats_unit_time_df.to_excel(writer, sheet_name="统计-单元总结时长", index=False) + + print(f" [Excel] 写入完成,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") + + +def get_date_str() -> str: + """获取当前日期字符串 格式:YYYYMMDD""" + return datetime.datetime.now().strftime("%Y%m%d") + + +def export_single_user(user_id: str, es_cfg: Dict[str, Any], pg_conn: Any, mysql_conn: Any, output_path: str, id_2_unit_index: Dict[int, int], chapter_id_to_lesson_id: Dict[int, int]) -> bool: + """ + 导出单个角色id的数据 + + Args: + user_id: 角色ID + es_cfg: ES配置 + pg_conn: PostgreSQL连接 + mysql_conn: MySQL连接 + output_path: 输出路径 + id_2_unit_index: story_id到unit_id的映射字典 + chapter_id_to_lesson_id: chapter_id到lesson_id的映射字典 + + Returns: + True表示成功,False表示失败 + """ + try: + print(f"\n[INFO] ========== 开始导出角色id={user_id} ==========") + total_start_time = datetime.datetime.now() + + # 查询ES数据 + sheet1_rows = fetch_es_user_audio(user_id, es_cfg) + + # 查询PG数据 + sheet2_rows = fetch_pg_play_records(user_id, pg_conn, mysql_conn) + sheet3_rows = fetch_pg_unit_review(user_id, pg_conn, id_2_unit_index, chapter_id_to_lesson_id) + sheet4_rows = fetch_pg_unit_challenge(user_id, pg_conn, id_2_unit_index) + sheet5_rows = fetch_pg_unit_summary(user_id, pg_conn, 
id_2_unit_index) + + # 检查是否有有效数据 + total_records = len(sheet1_rows) + len(sheet2_rows) + len(sheet3_rows) + len(sheet4_rows) + len(sheet5_rows) + print(f" [统计] 数据汇总:") + print(f" - 全部音频数据: {len(sheet1_rows)}条") + print(f" - 互动组件学习记录: {len(sheet2_rows)}条") + print(f" - 课程巩固记录: {len(sheet3_rows)}条") + print(f" - 单元挑战记录: {len(sheet4_rows)}条") + print(f" - 单元总结记录: {len(sheet5_rows)}条") + print(f" - 总计: {total_records}条") + + if total_records == 0: + print(f"[WARN] 角色id={user_id} 没有找到任何有效记录,跳过导出") + return False + + # 生成汇总统计数据 + stats_component_df, stats_kp_df, stats_unit_time_df = generate_statistics(sheet2_rows, sheet5_rows) + + # 写入Excel + write_excel(output_path, sheet1_rows, sheet2_rows, sheet3_rows, sheet4_rows, sheet5_rows, stats_component_df, stats_kp_df, stats_unit_time_df) + + total_time = (datetime.datetime.now() - total_start_time).total_seconds() + print(f"[INFO] 角色id={user_id} 导出成功") + print(f"[INFO] 文件路径: {output_path}") + print(f"[INFO] 总耗时: {total_time:.2f}秒") + print(f"[INFO] ========== 完成 ==========\n") + return True + + except Exception as e: + print(f"[ERROR] 角色id={user_id} 导出失败: {e}") + import traceback + traceback.print_exc() + return False + + +def main(): + load_env() + + # 确定运行模式并收集需要导出的角色id列表 + user_id_list: List[tuple] = [] # [(user_id, account_id or None), ...] 
+ date_str = get_date_str() + + # 检查三种模式的配置 + has_user_id = USER_ID is not None + has_user_id_list = USER_ID_LIST is not None and len(USER_ID_LIST) > 0 + has_account_id_list = ACCOUNT_ID_LIST is not None and len(ACCOUNT_ID_LIST) > 0 + + # 验证只能配置一种模式 + mode_count = sum([has_user_id, has_user_id_list, has_account_id_list]) + if mode_count == 0: + raise RuntimeError("请配置 USER_ID、USER_ID_LIST 或 ACCOUNT_ID_LIST 中的一个") + if mode_count > 1: + raise RuntimeError("USER_ID、USER_ID_LIST、ACCOUNT_ID_LIST 只能配置一个,请检查配置") + + # 模式1:单个角色id + if has_user_id: + user_id_list = [(str(USER_ID), None)] + print(f"[INFO] 运行模式:单个角色id") + + # 模式2:角色id列表 + elif has_user_id_list: + user_id_list = [(str(uid), None) for uid in USER_ID_LIST] + print(f"[INFO] 运行模式:角色id列表,共{len(user_id_list)}个角色") + + # 模式3:账户id列表 + elif has_account_id_list: + print(f"[INFO] 运行模式:账户id列表,共{len(ACCOUNT_ID_LIST)}个账户") + mysql_conn = None + try: + mysql_conn = get_mysql_conn("vala_user") # 查询用户表,使用 vala_user 数据库 + for account_id in ACCOUNT_ID_LIST: + account_id_str = str(account_id) + print(f"[INFO] 查询账户id={account_id_str}对应的角色id...") + character_ids = fetch_character_ids_by_account(account_id_str, mysql_conn) + if not character_ids: + print(f"[WARN] 账户id={account_id_str} 未找到关联的角色id,跳过") + continue + print(f"[INFO] 账户id={account_id_str} 找到{len(character_ids)}个角色id: {character_ids}") + for cid in character_ids: + user_id_list.append((cid, account_id_str)) + finally: + if mysql_conn: + try: + mysql_conn.close() + except Exception: + pass + + if not user_id_list: + print("[WARN] 没有需要导出的角色id,程序退出") + return + + # 初始化连接 + es_cfg = get_es_config() + pg_conn = get_pg_conn() + + # 获取映射表(只需要查询一次,所有角色共用) + print(f"\n[INFO] ===== 准备工作:获取映射表 =====") + mysql_conn = None + id_2_unit_index = {} + chapter_id_to_lesson_id = {} + try: + print(f"[INFO] 正在连接MySQL数据库(vala_test)...") + mysql_conn = get_mysql_conn("vala_test") # 查询游戏配置表,使用 vala_test 数据库 + print(f"[INFO] 正在获取 story_id 到 unit_id 的映射...") + id_2_unit_index = 
get_id_2_unit_index(mysql_conn) + print(f"[INFO] 成功获取 {len(id_2_unit_index)} 个 story_id 映射") + print(f"[INFO] 正在获取 chapter_id 到 lesson_id 的映射...") + chapter_id_to_lesson_id = get_chapter_id_to_lesson_id(mysql_conn) + print(f"[INFO] 成功获取 {len(chapter_id_to_lesson_id)} 个 chapter_id 映射") + except Exception as e: + print(f"[ERROR] 获取映射表失败: {e}") + import traceback + traceback.print_exc() + if pg_conn: + try: + pg_conn.close() + except Exception: + pass + if mysql_conn: + try: + mysql_conn.close() + except Exception: + pass + return + + try: + # 统计信息 + success_count = 0 + skip_count = 0 + + print(f"\n[INFO] ===== 开始批量导出 =====") + print(f"[INFO] 共需导出{len(user_id_list)}个角色\n") + batch_start_time = datetime.datetime.now() + + # 循环处理每个角色id + for idx, (user_id, account_id) in enumerate(user_id_list, 1): + print(f"\n{'='*60}") + print(f"[INFO] 进度: {idx}/{len(user_id_list)} ({idx*100//len(user_id_list)}%)") + print(f"{'='*60}") + + # 生成输出文件名 + if account_id is None: + # 模式1和模式2:角色id_{}_导出时间_{}.xlsx + filename = f"角色id_{user_id}_导出时间_{date_str}.xlsx" + else: + # 模式3:账户id_{}_角色id_{}_导出时间_{}.xlsx + filename = f"账户id_{account_id}_角色id_{user_id}_导出时间_{date_str}.xlsx" + + output_path = os.path.join(OUTPUT_DIR, filename) + + # 导出单个角色的数据 + result = export_single_user(user_id, es_cfg, pg_conn, mysql_conn, output_path, id_2_unit_index, chapter_id_to_lesson_id) + if result: + success_count += 1 + else: + skip_count += 1 + + # 输出统计信息 + batch_total_time = (datetime.datetime.now() - batch_start_time).total_seconds() + print(f"\n{'='*60}") + print(f"[INFO] ===== 全部导出完成 =====") + print(f"[INFO] 总计: {len(user_id_list)}个角色") + print(f"[INFO] 成功: {success_count}个") + print(f"[INFO] 跳过: {skip_count}个") + print(f"[INFO] 总耗时: {batch_total_time:.2f}秒 ({batch_total_time/60:.2f}分钟)") + if success_count > 0: + print(f"[INFO] 平均每个角色: {batch_total_time/success_count:.2f}秒") + print(f"{'='*60}\n") + + finally: + if pg_conn: + try: + pg_conn.close() + except Exception: + pass + if mysql_conn: + try: + 
mysql_conn.close() + except Exception: + pass + + +if __name__ == "__main__": + main() diff --git a/export_user_id_data_latest.py b/export_user_id_data_latest.py new file mode 100644 index 0000000..22495f5 --- /dev/null +++ b/export_user_id_data_latest.py @@ -0,0 +1,1846 @@ +""" +初版需求v1.0: 2025.11.18 + +导出 一个userId的多表数据, 最终按照不同sheet,输出到一个 excel文件中。 + +1. 第一个sheet:"全部音频数据" +es相关配置通过以下环境变量 +ES_HOST=xxx +ES_PORT=9200 +ES_SCHEME=https +ES_USER=elastic +ES_PASSWORD=xxx + +index: user-audio + +脚本思路: +过滤字段: +userId == xxxx + +输出该userId的全部记录 按时间倒序排序 +包含以下字段内容: + +userId +userMsg +userName +soeData +audioUrl +asrStatus +componentId +componentType +dataVersion + +2. 第二个sheet:"互动组件学习记录" +在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 +数据库相关配置 从.env中读取: +PG_DB_HOST = xxx +PG_DB_PORT = xxx +PG_DB_USER = xxx +PG_DB_PASSWORD = xxx +PG_DB_DATABASE = xxx + +读取以下数据表: +user_component_play_record_0 ~ user_component_play_record_7 + +输出以下字段: +user_id, +component_unique_code, +session_id, +c_type, +c_id, +play_result, +user_behavior_info, +updated_at + +3.第三个sheet:"课程巩固记录" +在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 + +数据表:user_unit_review_question_result + +输出以下字段: +user_id +story_id +chapter_id +question_list +updated_at + +4.第四个sheet:"单元挑战记录" +在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 + +数据表:user_unit_challenge_question_result + +输出以下字段: +user_id +story_id +category +score_text, +question_list +updated_at +------------ + +需求补充v1.1: +"全部音频数据"这个sheet +输出字段 添加timeStr 并按时间倒序排列 最新的记录 在最上面 + +------------ +需求补充v1.2: +"全部音频数据"这个sheet +如果userMsg字段内容 包含 ”makee_id“ 要进行以下处理: + +从userMsg字段中提取出具体的makee_id: +此时的字段样例: +``` +asr msg信息为:{ + "time_ms": 358, + "time_ms_api": 357, + "hot_words_str": "{\n \"context_type\": \"dialog_ctx\",\n \"context_data\": [\n {\n \"text\": \"planet Walla\"\n },\n {\n \"text\": \"Walla\"\n }\n ]\n}", + "makee_id": "d208c617-902f-4f81-8255-b5fb73599546", + "volcano_fast_x_tt_logid": "202511151541355DF72BE5EBFE73795BFD", + "api_name": "volcano-fast" 
+} +``` +然后基于makee_id 去另一个表里查记录: index:llm_asr_log +将查询到的记录的 result_text 字段内容 回填到 userMsg。 +将source字段内容 输出 到 source。 + +如果userMsg字段内容 不包含 ”makee_id“ 保持之前的逻辑。 + +-------------- +需求补充 v1.3 +当前输入 只支持配置单个 userId (业务侧名称为角色id) + + +期望扩展为以下逻辑: +1. 改为配置 角色id list , 分别 导出 多份excel文件。命名格式为 角色id_{}_导出时间_{}.xlsx +2. 改为配置 账户id list , 分别 导出 多份excel文件。命名格式为 账户id_{}_角色id_{}_导出时间_{}.xlsx + +关于 账户 id 到角色id 的映射逻辑, +首先 读取 mysql 表 vala_app_character +筛选 account_id字段值 == 账户id 的 记录, 其中 该记录 的 id值,则为角色id 一个 账户id 可以对应多个角色id + +本次需求只针对输入侧调整, 数据抽取聚合逻辑部分和之前保持一致 + +--------------- +需求补充 v1.4 + +增加一个sheet "单元总结记录", +导出对应角色id的单元总结记录。 参考 export_unit_summary.py 中的原始数据提取方案即可(不必关注其中的数据统计部分)。 + +其他已有逻辑保持不动哦。 + +---------------- +需求补充 v1.5 + +1."互动组件学习记录"sheet 增加以下字段 +"互动组件名称"、"组件标题"、"组件配置摘要"、"知识点": +字段取值规则: +根据 c_type 及组件配置(从mysql表获取) 进行映射和处理: +``` +1).如果 c_type 开头为"mid" + +则读取下表:表名:middle_interaction_component + +获取以下字段值: +title (作为组件标题) +component_config (完整的组件配置) 获取其中 的 question 字段值 作为 组件配置摘要; +kp_relation_info 字段值 作为 知识点 + +"互动组件名称"规则: + +"物品互动": "mid_vocab_item", +"图片互动": "mid_vocab_image", +"填词互动": "mid_vocab_fillBlank", +"指令互动": "mid_vocab_instruction" +"对话互动-表达": "mid_sentence_dialogue", 且 component_config->question->mode == "express" +"对话互动-朗读": "mid_sentence_dialogue", 且 component_config->question->mode == "read" +"语音互动": "mid_sentence_voice", +"材料互动": "mid_sentence_material", +"造句互动": "mid_sentence_makeSentence" +"挖空互动": "mid_grammar_cloze", +"组句互动": "mid_grammar_sentence" +"发音互动": "mid_pron_pron" + + +2). 
如果 c_type 开头为"core" +则读取下表:表名:core_interaction_component + +获取以下字段值: +title (作为组件标题) +component_config (完整的组件配置) 获取其中 的 taskInfo 字段值 作为 组件配置摘要 +kp_relation_info 字段值 作为 知识点 + +"互动组件名称"规则: +"口语快答": "core_speaking_reply", +"口语妙问": "core_speaking_inquiry", +"口语探讨": "core_speaking_explore", +"口语独白": "core_speaking_monologue" +"合作阅读": "core_reading_order", +"合作听力": "core_listening_order", +"看图组句": "core_writing_imgMakeSentence", +"看图撰写": "core_writing_imgWrite", +"问题组句": "core_writing_questionMakeSentence", +"问题撰写": "core_writing_questionWrite", +``` + +2."课程巩固记录" sheet 增加以下字段 +"正确率": 参考 export_lesson_review.py 中的计算逻辑 + +3. 新增一个"汇总统计"sheet +统计并展示以下内容 请以 可读性 比较好的方式排列、展示 + +a. "所有互动-按互动组件类型-通过情况统计" +以每种"互动组件名称"进行聚合 +统计play_result的取值分布情况,算以下指标: +总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 + +b. "中互动组件-按知识点-通过情况统计" +以每个知识点进行聚合 + +其中 知识点配置格式如下: +``` +[{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_meaning","kpSkillName":"语义"}] +``` +一个组件可以绑定多个知识点,以每个知识点的 kpId + kpType + kpTitle 进行 展示及聚合 + +对所有绑定了某个知识点的中互动组件(c_type以mid开头) +统计play_result的取值分布情况,算以下指标: +总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 + +c. 
"单元总结-按单元统计时长" + +将"单元总结记录"中的"play_time_seconds"字段值 以每个单元id 进行聚合 进行 累加 统计,并增加一列 转换为分钟为单位 取整数 + + +""" +# ==== 可直接修改的脚本变量(不使用命令行传参) ==== +# 三种模式互斥,只能配置一个: +# 模式1:单个角色id +USER_ID = None # 单个角色ID,示例:2911 + +# 模式2:角色id列表(多个角色id批量导出) +USER_ID_LIST = None # 角色ID列表,示例:[2911, 2912, 2913] + +# 模式3:账户id列表(通过账户id查询对应的角色id后批量导出) +ACCOUNT_ID_LIST = [9343] # 5095[7232] # [1783,5375,5371,5345,5303,5293,5095,4289,4494,4473,4460,4452,4386,4388,4236,4043,2758,2841,2756,2750,2692,1781,1693,2256,2234,2373] # 账户ID列表,示例:[100, 101, 102] + +OUTPUT_DIR = "output_latest/" # 输出目录,默认为output文件夹 +# ==== 变量结束 ==== +import os +import json +import re +from typing import Any, Dict, List, Optional + +import datetime + +try: + import requests +except Exception: + requests = None + +try: + import psycopg2 + from psycopg2.extras import RealDictCursor +except Exception: + psycopg2 = None + RealDictCursor = None + +try: + import pymysql + import pymysql.cursors +except Exception: + pymysql = None + +try: + import pandas as pd +except Exception: + pd = None + +try: + import urllib3 +except Exception: + urllib3 = None + + +SHEET1_COLUMNS = [ + "userId", + "userMsg", + "source", + "userName", + "soeData", + "audioUrl", + "asrStatus", + "componentId", + "componentType", + "dataVersion", + "timeStr", +] + +SHEET2_COLUMNS = [ + "user_id", + "component_unique_code", + "session_id", + "c_type", + "c_id", + "互动组件名称", + "组件标题", + "组件配置摘要", + "知识点", + "play_result", + "user_behavior_info", + "updated_at", +] + +SHEET3_COLUMNS = [ + "user_id", + "unit_id", + "lesson_id", + "question_list", + "正确率", + "updated_at", +] + +SHEET4_COLUMNS = [ + "user_id", + "unit_id", + "category", + "score_text", + "question_list", + "updated_at", +] + +SHEET5_COLUMNS = [ + "id", + "user_id", + "unit_id", + "updated_at", + "km_id", + "km_type", + "play_time_seconds", +] + + +def _load_env_file(path: str) -> None: + if not os.path.exists(path): + return + try: + with open(path, "r", encoding="utf-8") as f: + for line in f: + line = 
line.strip() + if not line or line.startswith("#"): + continue + if "=" not in line: + continue + k, v = line.split("=", 1) + k = k.strip() + v = v.strip().strip('"').strip("'") + if k and (os.getenv(k) is None): + os.environ[k] = v + except Exception: + pass + + +def load_env() -> None: + _load_env_file(os.path.join(os.getcwd(), ".env")) + _load_env_file(os.path.join(os.getcwd(), ".env.local")) + + +def to_json_str(v: Any) -> Any: + if isinstance(v, (dict, list)): + try: + return json.dumps(v, ensure_ascii=False) + except Exception: + return str(v) + return v + + +def parse_time(value: Any) -> Optional[datetime.datetime]: + if value is None: + return None + if isinstance(value, (int, float)): + try: + v = float(value) + # 兼容毫秒级时间戳 + if v > 1e11: + v = v / 1000.0 + return datetime.datetime.fromtimestamp(v) + except Exception: + return None + if isinstance(value, str): + fmts = [ + "%Y-%m-%dT%H:%M:%S.%fZ", + "%Y-%m-%dT%H:%M:%S.%f%z", + "%Y-%m-%dT%H:%M:%S%z", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d", + ] + for fmt in fmts: + try: + return datetime.datetime.strptime(value, fmt) + except Exception: + continue + try: + return datetime.datetime.fromisoformat(value) + except Exception: + return None + return None + + +def pick_time(source: Dict[str, Any]) -> Optional[datetime.datetime]: + candidates = [ + "updated_at", + "created_at", + "@timestamp", + "timestamp", + "updatedAt", + "createdAt", + "time", + "ts", + "timeStr", + "update_time", + "create_time", + ] + for key in candidates: + if key in source: + t = parse_time(source.get(key)) + if t is not None: + return t + # 宽松匹配:尝试扫描所有可能的时间相关字段 + for k, v in source.items(): + lk = str(k).lower() + if any(s in lk for s in ["time", "date", "_at", "timestamp"]): + t = parse_time(v) + if t is not None: + return t + return None + + +def extract_makee_id_from_user_msg(user_msg: Any) -> Optional[str]: + # 支持dict或字符串形式 + if isinstance(user_msg, dict): + mk = user_msg.get("makee_id") + if isinstance(mk, str) and mk: + return mk + if 
isinstance(user_msg, str) and user_msg: + # 1) 尝试整体解析为JSON + try: + obj = json.loads(user_msg) + mk = obj.get("makee_id") + if isinstance(mk, str) and mk: + return mk + except Exception: + pass + # 2) 尝试截取大括号中的JSON + try: + start = user_msg.find("{") + end = user_msg.rfind("}") + if start != -1 and end != -1 and end > start: + candidate = user_msg[start : end + 1] + obj = json.loads(candidate) + mk = obj.get("makee_id") + if isinstance(mk, str) and mk: + return mk + except Exception: + pass + # 3) 正则匹配 makee_id + m = re.search(r"\bmakee_id\b\s*:\s*\"([^\"]+)\"", user_msg) + if m: + return m.group(1) + return None + + +def fetch_es_asr_log(makee_id: str, es_cfg: Dict[str, Any]) -> Optional[Dict[str, Any]]: + if requests is None: + raise RuntimeError("缺少requests依赖,请安装后再运行。") + host = es_cfg.get("host") + port = es_cfg.get("port") + scheme = es_cfg.get("scheme", "http") + user = es_cfg.get("user") + password = es_cfg.get("password") + index = "llm_asr_log" + if not host: + return None + base = f"{scheme}://{host}:{port}" + url = f"{base}/{index}/_search" + headers = {"Content-Type": "application/json"} + body = { + "query": { + "bool": { + "should": [ + {"term": {"makee_id": {"value": str(makee_id)}}}, + {"term": {"makee_id.keyword": {"value": str(makee_id)}}}, + ], + "minimum_should_match": 1, + } + }, + "size": 10, + "_source": [ + "makee_id", + "result_text", + "source", + "updated_at", + "created_at", + "@timestamp", + "timestamp", + "updatedAt", + "createdAt", + "time", + "ts", + "timeStr", + "update_time", + "create_time", + ], + } + auth = (user, password) if user and password else None + try: + if scheme == "https" and urllib3 is not None: + try: + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + except Exception: + pass + resp = requests.post(url, headers=headers, json=body, auth=auth, timeout=20, verify=False if scheme == "https" else True) + resp.raise_for_status() + data = resp.json() + except Exception: + return None + hits = 
def get_es_config() -> Dict[str, Any]:
    """Build the Elasticsearch settings dict from ``ES_*`` environment variables.

    ``port`` defaults to ``"9200"`` and ``scheme`` to ``"http"``; the index is
    fixed to ``user-audio``.
    """
    env = os.getenv
    return {
        "host": env("ES_HOST"),
        "port": env("ES_PORT", "9200"),
        "scheme": env("ES_SCHEME", "http"),
        "user": env("ES_USER"),
        "password": env("ES_PASSWORD"),
        "index": "user-audio",
    }


def fetch_es_user_audio(user_id: str, es_cfg: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Fetch all ``user-audio`` documents of *user_id*, newest first.

    Each returned row holds the exported audio fields plus ``_time`` (ISO
    string) and ``timeStr`` (``YYYY-MM-DD HH:MM:SS``).  When ``userMsg``
    carries a ``makee_id`` the matching ``llm_asr_log`` document is looked up
    and its ``result_text`` / ``source`` are copied into ``userMsg`` /
    ``source``.

    Args:
        user_id: Character id to filter on (``userId`` / ``userId.keyword``).
        es_cfg: Settings as produced by ``get_es_config``.

    Returns:
        Rows sorted by time descending; rows without a time sort last.
        Empty list when no host is configured or nothing matches.

    Raises:
        RuntimeError: if ``requests`` is missing or the search request fails.
    """
    if requests is None:
        raise RuntimeError("缺少requests依赖,请安装后再运行。")

    print(f" [ES] 开始查询user-audio索引...")
    start_time = datetime.datetime.now()

    host = es_cfg.get("host")
    if not host:
        return []
    scheme = es_cfg.get("scheme", "http")
    user = es_cfg.get("user")
    password = es_cfg.get("password")
    index = es_cfg.get("index", "user-audio")
    url = f"{scheme}://{host}:{es_cfg.get('port')}/{index}/_search"

    max_hits = 10000  # single-request cap; results beyond it are not fetched
    body = {
        "query": {
            "bool": {
                "should": [
                    {"term": {"userId": {"value": str(user_id)}}},
                    {"term": {"userId.keyword": {"value": str(user_id)}}},
                ],
                "minimum_should_match": 1,
            }
        },
        "size": max_hits,
        "_source": [
            "userId",
            "userMsg",
            "userName",
            "soeData",
            "audioUrl",
            "asrStatus",
            "componentId",
            "componentType",
            "dataVersion",
            "updated_at",
            "created_at",
            "@timestamp",
            "timestamp",
            "updatedAt",
            "createdAt",
            "time",
            "ts",
            "timeStr",
            "update_time",
            "create_time",
        ],
    }

    use_tls = scheme == "https"
    try:
        # Silence the insecure-HTTPS warning caused by self-signed certificates.
        if use_tls and urllib3 is not None:
            try:
                urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
            except Exception:
                pass
        # NOTE(security): certificate verification is disabled for https.
        resp = requests.post(
            url,
            headers={"Content-Type": "application/json"},
            json=body,
            auth=(user, password) if user and password else None,
            timeout=30,
            verify=not use_tls,
        )
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        raise RuntimeError(f"ES查询失败: {e}") from e

    hits = data.get("hits", {}).get("hits", [])
    print(f" [ES] 查询完成,获得{len(hits)}条记录,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")
    if not hits:
        return []
    if len(hits) >= max_hits:
        print(f" [ES] [警告] 命中数达到单次查询上限{max_hits},导出结果可能不完整")

    print(f" [ES] 开始处理音频数据...")
    process_start = datetime.datetime.now()

    def _sort_key(t: Optional[datetime.datetime]) -> datetime.datetime:
        # Naive and aware datetimes cannot be ordered against each other, so
        # aware values are normalised to naive UTC; missing times sort last.
        if t is None:
            return datetime.datetime.min
        if t.tzinfo is not None and t.utcoffset() is not None:
            return t.astimezone(datetime.timezone.utc).replace(tzinfo=None)
        return t.replace(tzinfo=None)

    keyed_rows = []
    # makee_id -> ASR document, or None when the lookup found nothing/failed.
    # Misses are cached too, so each id costs at most one request.
    asr_cache: Dict[str, Optional[Dict[str, Any]]] = {}
    makee_id_count = 0
    total = len(hits)

    for idx, hit in enumerate(hits, 1):
        # Progress line every 100 records and on the last one.
        if idx % 100 == 0 or idx == total:
            print(f" [ES] 处理进度: {idx}/{total} ({idx*100//total}%)")

        src = hit.get("_source") or {}
        row = {
            "userId": src.get("userId"),
            "userMsg": src.get("userMsg"),
            "source": None,
            "userName": src.get("userName"),
            "soeData": to_json_str(src.get("soeData")),
            "audioUrl": src.get("audioUrl"),
            "asrStatus": src.get("asrStatus"),
            "componentId": src.get("componentId"),
            "componentType": src.get("componentType"),
            "dataVersion": src.get("dataVersion"),
        }
        t = pick_time(src)
        row["_time"] = t.isoformat() if t else None
        row["timeStr"] = t.strftime("%Y-%m-%d %H:%M:%S") if t else None

        # v1.2: back-fill from llm_asr_log when userMsg carries a makee_id.
        mk = extract_makee_id_from_user_msg(row.get("userMsg"))
        if mk:
            makee_id_count += 1
            if mk not in asr_cache:
                asr_cache[mk] = fetch_es_asr_log(mk, es_cfg)
            asr_doc = asr_cache[mk]
            if asr_doc is not None:
                result_text = asr_doc.get("result_text")
                if result_text:
                    row["userMsg"] = result_text
                row["source"] = to_json_str(asr_doc.get("source"))
        keyed_rows.append((_sort_key(t), row))

    print(f" [ES] 数据处理完成,发现{makee_id_count}条包含makee_id的记录,耗时{(datetime.datetime.now() - process_start).total_seconds():.2f}秒")

    print(f" [ES] 开始排序...")
    keyed_rows.sort(key=lambda pair: pair[0], reverse=True)
    rows: List[Dict[str, Any]] = [row for _, row in keyed_rows]
    print(f" [ES] 音频数据处理完成,总耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")

    return rows
def get_pg_conn() -> Any:
    """Open a PostgreSQL connection configured by the ``PG_DB_*`` variables.

    Raises:
        RuntimeError: if ``psycopg2`` is missing, or ``PG_DB_HOST`` /
            ``PG_DB_DATABASE`` is not set.
    """
    if psycopg2 is None:
        raise RuntimeError("缺少psycopg2依赖,请安装后再运行。")
    host = os.getenv("PG_DB_HOST")
    port = int(os.getenv("PG_DB_PORT", "5432"))
    user = os.getenv("PG_DB_USER")
    password = os.getenv("PG_DB_PASSWORD")
    dbname = os.getenv("PG_DB_DATABASE")
    if not (host and dbname):
        raise RuntimeError("PG数据库环境变量未配置完整")
    return psycopg2.connect(host=host, port=port, user=user, password=password, dbname=dbname)


def get_mysql_conn(database: str) -> Any:
    """Open a MySQL connection to *database* using a ``DictCursor``.

    Args:
        database: Database name.  ``"vala_user"`` reads the ``*_online``
            variables (``MYSQL_HOST_online`` ...); any other name reads the
            un-suffixed ones (``MYSQL_HOST`` ...).

    Returns:
        A ``pymysql`` connection.

    Raises:
        RuntimeError: if ``pymysql`` is missing or the host variable is unset.
    """
    if pymysql is None:
        raise RuntimeError("缺少pymysql依赖,请安装后再运行。")

    # Only the environment-variable suffix differs between the two setups.
    suffix = "_online" if database == "vala_user" else ""
    host = os.getenv(f"MYSQL_HOST{suffix}")
    port = int(os.getenv(f"MYSQL_PORT{suffix}", "3306"))
    user = os.getenv(f"MYSQL_USERNAME{suffix}")
    password = os.getenv(f"MYSQL_PASSWORD{suffix}")
    if not host:
        raise RuntimeError(f"MySQL数据库环境变量未配置完整(缺少MYSQL_HOST{suffix})")

    return pymysql.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        database=database,
        charset="utf8mb4",
        cursorclass=pymysql.cursors.DictCursor,
    )


def get_id_2_unit_index(conn: Any) -> Dict[int, int]:
    """Map each ``vala_game_info`` id (story_id) to its unit index.

    Non-deleted rows are ordered by ``season_package_id`` then ``index``; the
    0-based position of a row in that ordering is used as its unit index.

    Args:
        conn: MySQL connection whose cursor yields dict rows.

    Returns:
        ``{story_id: unit_index}``, or an empty dict when the query fails
        (the error is printed).
    """
    # Only the id is needed; the ordering columns do not have to be selected.
    sql = """
        SELECT id
        FROM `vala_game_info`
        WHERE id > 0
          AND `vala_game_info`.`deleted_at` IS NULL
        ORDER BY season_package_id asc, `index` asc
    """
    try:
        with conn.cursor() as cur:
            cur.execute(sql)
            rows = cur.fetchall() or []
        return {row["id"]: position for position, row in enumerate(rows)}
    except Exception as e:
        print(f"[ERROR] 获取story_id到unit_id映射失败: {e}")
        return {}
def get_chapter_id_to_lesson_id(conn: Any) -> Dict[int, int]:
    """Map each non-deleted ``vala_game_chapter`` id to its ``index`` column.

    The chapter's ``index`` value is what the export calls the lesson id.

    Args:
        conn: MySQL connection whose cursor yields dict rows.

    Returns:
        ``{chapter_id: lesson_id}``; an empty dict (after printing the error)
        when the query fails.
    """
    query = """
        SELECT id, `index`
        FROM `vala_game_chapter`
        WHERE deleted_at IS NULL
    """
    try:
        with conn.cursor() as cursor:
            cursor.execute(query)
            fetched = cursor.fetchall() or []
            return {chapter["id"]: chapter["index"] for chapter in fetched}
    except Exception as e:
        print(f"[ERROR] 获取chapter_id到lesson_id映射失败: {e}")
        return {}


# Display name for every known component type (c_type).
COMPONENT_TYPE_NAMES = {
    "mid_vocab_item": "物品互动",
    "mid_vocab_image": "图片互动",
    "mid_vocab_fillBlank": "填词互动",
    "mid_vocab_instruction": "指令互动",
    "mid_sentence_dialogue": "对话互动",  # refined by question.mode, see get_component_name
    "mid_sentence_voice": "语音互动",
    "mid_sentence_material": "材料互动",
    "mid_sentence_makeSentence": "造句互动",
    "mid_grammar_cloze": "挖空互动",
    "mid_grammar_sentence": "组句互动",
    "mid_pron_pron": "发音互动",
    "core_speaking_reply": "口语快答",
    "core_speaking_inquiry": "口语妙问",
    "core_speaking_explore": "口语探讨",
    "core_speaking_monologue": "口语独白",
    "core_reading_order": "合作阅读",
    "core_listening_order": "合作听力",
    "core_writing_imgMakeSentence": "看图组句",
    "core_writing_imgWrite": "看图撰写",
    "core_writing_questionMakeSentence": "问题组句",
    "core_writing_questionWrite": "问题撰写",
}


def get_component_name(c_type: str, component_config: Optional[Dict[str, Any]]) -> str:
    """Return the display name of a component.

    Args:
        c_type: Component type code.
        component_config: Parsed component configuration; only consulted for
            ``mid_sentence_dialogue``, whose ``question.mode`` selects the
            "express" or "read" variant.

    Returns:
        The display name, or ``""`` for an empty or unknown *c_type*.
    """
    if not c_type:
        return ""

    is_dialogue = c_type == "mid_sentence_dialogue"
    if is_dialogue and component_config:
        # Any malformed config simply falls through to the generic name.
        try:
            dialogue_mode = component_config.get("question", {}).get("mode", "")
            if dialogue_mode == "express":
                return "对话互动-表达"
            if dialogue_mode == "read":
                return "对话互动-朗读"
        except Exception:
            pass

    return COMPONENT_TYPE_NAMES.get(c_type, "")
def _component_config_entry(row: Dict[str, Any], summary_field: str) -> Dict[str, Any]:
    """Turn one component table row into a config_map entry.

    ``component_config`` and ``kp_relation_info`` are JSON-decoded when they
    arrive as strings; the value under *summary_field* of the decoded config
    becomes the ``summary``.
    """
    component_config = row.get("component_config")
    if isinstance(component_config, str):
        try:
            component_config = json.loads(component_config)
        except ValueError as e:
            print(f" [MySQL] [警告] 解析component_config失败 (id={row.get('c_id')}): {e}")
            component_config = {}

    summary = ""
    if isinstance(component_config, dict):
        detail = component_config.get(summary_field)
        summary = to_json_str(detail) if detail else ""

    kp_relation_info = row.get("kp_relation_info")
    if isinstance(kp_relation_info, str):
        try:
            kp_relation_info = json.loads(kp_relation_info)
        except ValueError:
            kp_relation_info = []

    return {
        "title": row.get("title", ""),
        "component_config": component_config,
        "summary": summary,
        "kp_relation_info": to_json_str(kp_relation_info),
    }


def _collect_component_configs(
    mysql_conn: Any,
    table: str,
    label: str,
    c_ids: set,
    summary_field: str,
    config_map: Dict[str, Dict[str, Any]],
) -> None:
    """Query *table* for *c_ids* and add one entry per row to *config_map*.

    Entries are keyed ``"{c_type}_{c_id}"``.  A failure is printed with its
    traceback and leaves the entries collected so far in place.
    """
    try:
        with mysql_conn.cursor() as cur:
            placeholders = ",".join(["%s"] * len(c_ids))
            # *table* is one of two literals chosen by the caller, never user input.
            sql = f"""
                SELECT c_id, c_type, title, component_config, kp_relation_info
                FROM {table}
                WHERE c_id IN ({placeholders}) AND deleted_at IS NULL
            """
            print(f" [MySQL] 执行{label}组件查询,查询条件: c_id IN ({len(c_ids)}个ID)")
            cur.execute(sql, tuple(c_ids))
            rows = cur.fetchall() or []
        print(f" [MySQL] 查询到{len(rows)}条{label}组件配置")

        if not rows:
            print(f" [MySQL] [警告] 查询结果为空!可能的原因:")
            print(f" [MySQL] - 数据库中没有匹配的c_id记录")
            print(f" [MySQL] - deleted_at字段不为NULL")
            print(f" [MySQL] - c_id不存在")

        for idx, row in enumerate(rows):
            c_type = row.get("c_type", "")
            c_id = row.get("c_id")
            key = f"{c_type}_{c_id}"
            entry = _component_config_entry(row, summary_field)

            if idx < 3:  # log the first three rows as samples
                # title may be NULL in the database; never slice None.
                title_preview = str(row.get("title") or "")[:50]
                print(f" [MySQL] [样例{idx+1}] id={c_id}, c_type={c_type}, key={key}")
                print(f" [MySQL] [样例{idx+1}] title={title_preview}")
                if entry["summary"]:
                    print(f" [MySQL] [样例{idx+1}] 提取到{summary_field}字段,长度: {len(str(entry['summary']))}")

            config_map[key] = entry

        print(f" [MySQL] {label}组件配置已加入config_map,当前map大小: {len(config_map)}")
    except Exception as e:
        print(f" [MySQL] [错误] 查询{label}组件配置失败: {e}")
        import traceback
        traceback.print_exc()


def batch_fetch_component_configs(play_records: List[Dict[str, Any]], mysql_conn: Any) -> Dict[str, Dict[str, Any]]:
    """Load the configuration of every component referenced by *play_records*.

    Records whose ``c_type`` starts with ``mid`` are looked up in
    ``middle_interaction_component`` (summary taken from ``question``), those
    starting with ``core`` in ``core_interaction_component`` (summary taken
    from ``taskInfo``).  Each table is queried once with all needed ids.

    Args:
        play_records: Play records carrying ``c_type`` and ``c_id``.
        mysql_conn: MySQL connection whose cursor yields dict rows.

    Returns:
        ``{"{c_type}_{c_id}": {"title", "component_config", "summary",
        "kp_relation_info"}}``.  A table whose query fails contributes only
        the rows processed before the failure.
    """
    print(f" [MySQL] 开始批量查询组件配置...")
    start_time = datetime.datetime.now()

    mid_c_ids = set()
    core_c_ids = set()
    for record in play_records:
        c_type = record.get("c_type", "")
        c_id = record.get("c_id")
        if not (c_type and c_id):
            continue
        if c_type.startswith("mid"):
            mid_c_ids.add(c_id)
        elif c_type.startswith("core"):
            core_c_ids.add(c_id)

    print(f" [MySQL] 需要查询中互动组件: {len(mid_c_ids)}个, 核心互动组件: {len(core_c_ids)}个")
    if mid_c_ids:
        print(f" [MySQL] 中互动组件ID列表(前10个): {sorted(mid_c_ids)[:10]}")
    if core_c_ids:
        print(f" [MySQL] 核心互动组件ID列表(前10个): {sorted(core_c_ids)[:10]}")

    config_map: Dict[str, Dict[str, Any]] = {}
    if mid_c_ids:
        _collect_component_configs(
            mysql_conn, "middle_interaction_component", "中互动", mid_c_ids, "question", config_map
        )
    if core_c_ids:
        _collect_component_configs(
            mysql_conn, "core_interaction_component", "核心互动", core_c_ids, "taskInfo", config_map
        )

    print(f" [MySQL] 组件配置查询完成,共{len(config_map)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")
    return config_map
def calculate_accuracy(question_list: Any) -> float:
    """Return the share of correctly answered questions, as a percentage.

    Args:
        question_list: A list of question dicts, or a JSON string encoding
            one.  A question counts as correct when its ``isRight`` equals
            ``True``.

    Returns:
        ``correct / total * 100`` rounded to 2 decimals.  ``0.0`` when the
        input is empty, is not a list, or cannot be decoded.  Entries that
        are not dicts count as wrong answers instead of voiding the result.
    """
    if isinstance(question_list, str):
        try:
            question_list = json.loads(question_list)
        except ValueError:
            return 0.0

    if not isinstance(question_list, list) or not question_list:
        return 0.0

    correct = sum(
        1
        for question in question_list
        # Equality (not identity) keeps accepting 1 as well as True.
        if isinstance(question, dict) and question.get("isRight") == True  # noqa: E712
    )
    return round(correct / len(question_list) * 100, 2)
def fetch_character_ids_by_account(account_id: str, conn: Any) -> List[str]:
    """Return the character ids (as strings) owned by *account_id*.

    Reads ``vala_app_character`` through *conn*, whose cursor must yield dict
    rows.  Rows with a falsy id are dropped.  On any error the error is
    printed and an empty list is returned.
    """
    sql = "SELECT id FROM vala_app_character WHERE account_id = %s"
    try:
        with conn.cursor() as cur:
            cur.execute(sql, (account_id,))
            rows = cur.fetchall() or []
            return [str(row["id"]) for row in rows if row.get("id")]
    except Exception as e:
        print(f"[ERROR] 查询账户id={account_id}的角色id失败: {e}")
        return []


def fetch_pg_play_records(user_id: str, conn: Any, mysql_conn: Any) -> List[Dict[str, Any]]:
    """Collect a user's component play records and attach component metadata.

    The eight shard tables ``user_component_play_record_0..7`` are queried,
    the results merged and sorted by ``updated_at`` descending.  When
    *mysql_conn* is given, every record is enriched with the component's
    display name, title, config summary and knowledge points.

    Args:
        user_id: Character id.
        conn: PostgreSQL connection.
        mysql_conn: MySQL connection for component configs; falsy to skip
            the enrichment.

    Returns:
        The merged records.  A shard whose query fails is reported and
        skipped.
    """
    print(f" [PG] 开始查询互动组件学习记录(8张分表)...")
    start_time = datetime.datetime.now()

    rows: List[Dict[str, Any]] = []
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        for table in (f"user_component_play_record_{i}" for i in range(8)):
            try:
                cur.execute(
                    f"""
                    SELECT user_id, component_unique_code, session_id, c_type, c_id,
                           play_result, user_behavior_info, updated_at
                    FROM {table}
                    WHERE user_id = %s
                    ORDER BY updated_at DESC
                    """,
                    (user_id,),
                )
                part = cur.fetchall() or []
            except Exception as e:
                print(f" [PG] 表{table}查询失败: {e}")
                # A failed statement aborts the open PG transaction; without a
                # rollback every later query on this connection fails as well.
                try:
                    conn.rollback()
                except Exception:
                    pass
                continue

            if part:
                print(f" [PG] 表{table}查到{len(part)}条记录")
            for raw in part:
                record = dict(raw)
                record["play_result"] = to_json_str(record.get("play_result"))
                record["user_behavior_info"] = to_json_str(record.get("user_behavior_info"))
                # Excel cannot store timezone-aware datetimes: drop tzinfo.
                upd = record.get("updated_at")
                if isinstance(upd, datetime.datetime) and upd.tzinfo is not None:
                    record["updated_at"] = upd.replace(tzinfo=None)
                rows.append(record)

    def _updated_at_key(record: Dict[str, Any]) -> datetime.datetime:
        # updated_at is normally already a datetime; strings are parsed and
        # anything unusable sorts last.
        value = record.get("updated_at")
        parsed = value if isinstance(value, datetime.datetime) else parse_time(value)
        if parsed is None:
            return datetime.datetime.min
        return parsed.replace(tzinfo=None)

    rows.sort(key=_updated_at_key, reverse=True)
    print(f" [PG] 互动组件学习记录查询完成,共{len(rows)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")

    if not (rows and mysql_conn):
        return rows

    config_map = batch_fetch_component_configs(rows, mysql_conn)

    print(f" [PG] 开始补充组件配置信息...")
    filled_count = 0
    empty_count = 0
    sample_keys = []
    sample_mode_check = []  # first few dialogue components, to verify the mode lookup

    for record in rows:
        c_type = record.get("c_type", "")
        c_id = record.get("c_id")
        key = f"{c_type}_{c_id}" if c_type and c_id else ""

        config = config_map.get(key, {})
        component_config = config.get("component_config", {})
        component_name = get_component_name(c_type, component_config)

        record["互动组件名称"] = component_name
        record["组件标题"] = config.get("title", "")
        record["组件配置摘要"] = config.get("summary", "")
        record["知识点"] = config.get("kp_relation_info", "")

        if not config:
            empty_count += 1
            if empty_count <= 5:  # only report the first five misses
                print(f" [PG] [警告] 未找到组件配置: key={key}")
            continue

        filled_count += 1
        if len(sample_keys) < 3:
            sample_keys.append((key, component_name, str(record["组件标题"] or "")[:30]))

        if c_type == "mid_sentence_dialogue" and len(sample_mode_check) < 3:
            mode = ""
            if isinstance(component_config, dict):
                question = component_config.get("question", {})
                if isinstance(question, dict):
                    mode = question.get("mode", "")
            sample_mode_check.append({"key": key, "mode": mode, "component_name": component_name})

    print(f" [PG] 组件配置信息补充完成")
    print(f" [PG] 匹配到配置: {filled_count}条, 未匹配: {empty_count}条")
    if sample_keys:
        print(f" [PG] 样例数据(前3条):")
        for key, name, title in sample_keys:
            print(f" [PG] - key={key}, 名称={name}, 标题={title}")
    if sample_mode_check:
        print(f" [PG] 对话互动mode检查(前3条):")
        for s in sample_mode_check:
            print(f" [PG] - key={s['key']}, mode={s['mode']}, 最终名称={s['component_name']}")

    return rows
def fetch_pg_unit_review(user_id: str, conn: Any, id_2_unit_index: Dict[int, int], chapter_id_to_lesson_id: Dict[int, int]) -> List[Dict[str, Any]]:
    """Fetch a user's lesson review records, newest first.

    Each row of ``user_unit_review_question_result`` is extended with
    ``unit_id`` (looked up from ``story_id``), ``lesson_id`` (looked up from
    ``chapter_id``) and ``正确率`` (accuracy of ``question_list``);
    ``question_list`` itself is serialized to JSON text.

    Args:
        user_id: Character id.
        conn: PostgreSQL connection.
        id_2_unit_index: ``story_id -> unit_id`` mapping.
        chapter_id_to_lesson_id: ``chapter_id -> lesson_id`` mapping.

    Returns:
        The enriched records; empty when the query fails.
    """
    print(f" [PG] 开始查询课程巩固记录...")
    start_time = datetime.datetime.now()

    sql = (
        "SELECT user_id, story_id, chapter_id, question_list, updated_at "
        "FROM user_unit_review_question_result WHERE user_id = %s ORDER BY updated_at DESC"
    )
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        try:
            cur.execute(sql, (user_id,))
            rows = cur.fetchall() or []
        except Exception as e:
            print(f" [PG] 课程巩固记录查询失败: {e}")
            rows = []
            # Clear the aborted transaction so later queries on *conn* still work.
            try:
                conn.rollback()
            except Exception:
                pass

    out: List[Dict[str, Any]] = []
    for raw in rows:
        record = dict(raw)

        story_id = record.get("story_id")
        record["unit_id"] = id_2_unit_index.get(story_id) if story_id else None

        chapter_id = record.get("chapter_id")
        record["lesson_id"] = chapter_id_to_lesson_id.get(chapter_id) if chapter_id else None

        question_list = record.get("question_list")
        record["正确率"] = calculate_accuracy(question_list)
        record["question_list"] = to_json_str(question_list)

        # Excel cannot store timezone-aware datetimes: drop tzinfo.
        upd = record.get("updated_at")
        if isinstance(upd, datetime.datetime) and upd.tzinfo is not None:
            record["updated_at"] = upd.replace(tzinfo=None)
        out.append(record)

    print(f" [PG] 课程巩固记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")
    return out


def fetch_pg_unit_challenge(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]:
    """Fetch a user's unit challenge records, newest first.

    Each row of ``user_unit_challenge_question_result`` is extended with
    ``unit_id`` (looked up from ``story_id``); ``question_list`` is
    serialized to JSON text.

    Args:
        user_id: Character id.
        conn: PostgreSQL connection.
        id_2_unit_index: ``story_id -> unit_id`` mapping.

    Returns:
        The enriched records; empty when the query fails.
    """
    print(f" [PG] 开始查询单元挑战记录...")
    start_time = datetime.datetime.now()

    sql = (
        "SELECT user_id, story_id, category, score_text, question_list, updated_at "
        "FROM user_unit_challenge_question_result WHERE user_id = %s ORDER BY updated_at DESC"
    )
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        try:
            cur.execute(sql, (user_id,))
            rows = cur.fetchall() or []
        except Exception as e:
            print(f" [PG] 单元挑战记录查询失败: {e}")
            rows = []
            # Clear the aborted transaction so later queries on *conn* still work.
            try:
                conn.rollback()
            except Exception:
                pass

    out: List[Dict[str, Any]] = []
    for raw in rows:
        record = dict(raw)

        story_id = record.get("story_id")
        record["unit_id"] = id_2_unit_index.get(story_id) if story_id else None

        record["question_list"] = to_json_str(record.get("question_list"))

        # Excel cannot store timezone-aware datetimes: drop tzinfo.
        upd = record.get("updated_at")
        if isinstance(upd, datetime.datetime) and upd.tzinfo is not None:
            record["updated_at"] = upd.replace(tzinfo=None)
        out.append(record)

    print(f" [PG] 单元挑战记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")
    return out
def fetch_pg_unit_summary(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]:
    """Fetch a user's unit summary knowledge-module results, newest first.

    Non-deleted rows of ``user_unit_summary_km_result`` are extended with
    ``unit_id`` (looked up from ``story_id``) and ``play_time_seconds``
    (``play_time`` in milliseconds, floor-divided by 1000).

    Args:
        user_id: Character id.
        conn: PostgreSQL connection.
        id_2_unit_index: ``story_id -> unit_id`` mapping.

    Returns:
        The enriched records; empty when the query fails.
    """
    print(f" [PG] 开始查询单元总结记录...")
    start_time = datetime.datetime.now()

    sql = (
        "SELECT id, user_id, story_id, updated_at, km_id, km_type, play_time "
        "FROM user_unit_summary_km_result WHERE user_id = %s AND deleted_at IS NULL ORDER BY updated_at DESC"
    )
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        try:
            cur.execute(sql, (user_id,))
            rows = cur.fetchall() or []
        except Exception as e:
            print(f" [PG] 单元总结记录查询失败: {e}")
            rows = []
            # Clear the aborted transaction so later queries on *conn* still work.
            try:
                conn.rollback()
            except Exception:
                pass

    out: List[Dict[str, Any]] = []
    for raw in rows:
        record = dict(raw)

        story_id = record.get("story_id")
        record["unit_id"] = id_2_unit_index.get(story_id) if story_id else None

        # play_time is in milliseconds; export whole seconds.
        play_time = record.get("play_time")
        try:
            record["play_time_seconds"] = int(play_time) // 1000 if play_time else 0
        except (TypeError, ValueError):
            record["play_time_seconds"] = 0

        # Excel cannot store timezone-aware datetimes: drop tzinfo.
        upd = record.get("updated_at")
        if isinstance(upd, datetime.datetime) and upd.tzinfo is not None:
            record["updated_at"] = upd.replace(tzinfo=None)
        out.append(record)

    print(f" [PG] 单元总结记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")
    return out
# Outcome labels that are counted individually in the statistics sheets.
_PLAY_RESULT_LABELS = ("Perfect", "Good", "Failed", "Pass", "Oops")

# Count/ratio columns shared by the component and knowledge-point sheets.
_RESULT_STAT_COLUMNS = (
    ["总数量"]
    + [f"{label}数量" for label in _PLAY_RESULT_LABELS]
    + [f"{label}比例(%)" for label in _PLAY_RESULT_LABELS]
)


def _parse_play_result(raw: Any) -> Any:
    """Extract the outcome label from a ``play_result`` value.

    *raw* is either a bare label, a JSON object string with a ``result`` key,
    or an already decoded dict.  Returns ``""`` when nothing can be extracted.
    """
    if isinstance(raw, str):
        text = raw.strip()
        if text in _PLAY_RESULT_LABELS:
            return text
        try:
            raw = json.loads(raw)
        except ValueError:
            return ""
    if isinstance(raw, dict):
        return raw.get("result", "")
    return ""


def _result_stat_values(stats: Dict[str, int]) -> Dict[str, Any]:
    """Build the ``_RESULT_STAT_COLUMNS`` cells from one counter."""
    total = stats["total"]
    cells: Dict[str, Any] = {"总数量": total}
    for label in _PLAY_RESULT_LABELS:
        cells[f"{label}数量"] = stats[label]
    for label in _PLAY_RESULT_LABELS:
        cells[f"{label}比例(%)"] = round(stats[label] / total * 100, 2) if total > 0 else 0
    return cells


def generate_statistics(sheet2_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]]) -> tuple:
    """Build the three summary tables of the export.

    Args:
        sheet2_rows: Component play records (need ``互动组件名称``, ``c_type``,
            ``play_result`` and ``知识点``).
        sheet5_rows: Unit summary records (need ``unit_id`` and
            ``play_time_seconds``).

    Returns:
        ``(component_df, knowledge_point_df, unit_time_df)``:
          * outcome counts and ratios per component name (records without a
            name are ignored);
          * outcome counts and ratios per knowledge point, over ``mid*``
            components only;
          * total play time per unit, in seconds and whole minutes.
        Each frame always carries its column headers, even when empty.

    Raises:
        RuntimeError: if pandas is not available.
    """
    if pd is None:
        raise RuntimeError("缺少pandas依赖,请安装后再运行。")

    print(f" [统计] 开始生成汇总统计数据...")
    start_time = datetime.datetime.now()

    from collections import defaultdict

    def _new_counter() -> Dict[str, int]:
        counter = {label: 0 for label in _PLAY_RESULT_LABELS}
        counter["total"] = 0
        return counter

    def _tally(counter: Dict[str, int], result: Any) -> None:
        counter["total"] += 1
        if result in _PLAY_RESULT_LABELS:
            counter[result] += 1

    # ---- a. outcomes per component name --------------------------------
    component_stats = defaultdict(_new_counter)
    sample_results = []
    for idx, record in enumerate(sheet2_rows):
        component_name = record.get("互动组件名称", "")
        if not component_name:
            continue
        raw_result = record.get("play_result", "")
        result = _parse_play_result(raw_result)
        if idx < 3:  # keep the first rows as log samples
            sample_results.append(
                {"component": component_name, "raw": str(raw_result)[:100], "result": result}
            )
        _tally(component_stats[component_name], result)

    print(f" [统计] play_result解析样例(前3条):")
    for s in sample_results:
        print(f" [统计] - 组件: {s['component']}, 结果: {s['result']}, 原始: {s['raw']}")

    component_stats_data = []
    for component_name in sorted(component_stats):
        row = {"互动组件名称": component_name}
        row.update(_result_stat_values(component_stats[component_name]))
        component_stats_data.append(row)

    # ---- b. outcomes per knowledge point (mid components only) ----------
    # Keyed by an (id, type, title) tuple so that separator characters inside
    # a title can never corrupt or drop an entry.
    kp_stats = defaultdict(_new_counter)
    mid_count = 0
    has_kp_count = 0
    kp_error_count = 0
    sample_kp_records = []

    for record in sheet2_rows:
        c_type = record.get("c_type", "")
        if not c_type or not c_type.startswith("mid"):
            continue
        mid_count += 1

        kp_raw = record.get("知识点", "")
        if not kp_raw:
            continue
        has_kp_count += 1

        try:
            kp_list = json.loads(kp_raw) if isinstance(kp_raw, str) else kp_raw
        except ValueError as e:
            kp_error_count += 1
            if kp_error_count <= 5:  # only report the first five failures
                print(f" [统计] [警告] 解析知识点失败: {e}, 原始值: {str(kp_raw)[:100]}")
            continue
        if not isinstance(kp_list, list):
            continue

        if len(sample_kp_records) < 3:
            sample_kp_records.append(
                {"c_type": c_type, "kp_count": len(kp_list), "kp_info": str(kp_list)[:200]}
            )

        result = _parse_play_result(record.get("play_result", ""))
        for kp in kp_list:
            if not isinstance(kp, dict):
                continue
            kp_id = kp.get("kpId", "")
            if not kp_id:
                continue
            kp_key = (str(kp_id), str(kp.get("kpType", "")), str(kp.get("kpTitle", "")))
            _tally(kp_stats[kp_key], result)

    print(f" [统计] 中互动组件统计: 总数={mid_count}, 有知识点={has_kp_count}, 知识点条目数={len(kp_stats)}")
    if sample_kp_records:
        print(f" [统计] 知识点样例(前3条):")
        for s in sample_kp_records:
            print(f" [统计] - c_type={s['c_type']}, 知识点数量={s['kp_count']}, 内容={s['kp_info']}")

    kp_stats_data = []
    for kp_key in sorted(kp_stats):
        kp_id, kp_type, kp_title = kp_key
        row = {"知识点ID": kp_id, "知识点类型": kp_type, "知识点标题": kp_title}
        row.update(_result_stat_values(kp_stats[kp_key]))
        kp_stats_data.append(row)

    # ---- c. total summary play time per unit ----------------------------
    unit_time_stats = defaultdict(int)
    for record in sheet5_rows:
        unit_id = record.get("unit_id")
        if unit_id is not None:
            unit_time_stats[unit_id] += record.get("play_time_seconds", 0) or 0

    unit_time_stats_data = []
    for unit_id in sorted(unit_time_stats):
        total_seconds = unit_time_stats[unit_id]
        unit_time_stats_data.append({
            "单元ID": f"unit_{unit_id}",
            "总时长(秒)": total_seconds,
            "总时长(分钟)": int(total_seconds / 60),
        })

    print(f" [统计] 汇总统计数据生成完成,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒")
    print(f" [统计] 生成了{len(component_stats_data)}条组件统计, {len(kp_stats_data)}条知识点统计, {len(unit_time_stats_data)}条单元时长统计")

    return (
        pd.DataFrame(component_stats_data, columns=["互动组件名称"] + _RESULT_STAT_COLUMNS),
        pd.DataFrame(kp_stats_data, columns=["知识点ID", "知识点类型", "知识点标题"] + _RESULT_STAT_COLUMNS),
        pd.DataFrame(unit_time_stats_data, columns=["单元ID", "总时长(秒)", "总时长(分钟)"]),
    )
def write_excel(path: str, sheet1_rows: List[Dict[str, Any]], sheet2_rows: List[Dict[str, Any]], sheet3_rows: List[Dict[str, Any]], sheet4_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]], stats_component_df: Any, stats_kp_df: Any, stats_unit_time_df: Any) -> None:
    """Write all export sheets into one ``.xlsx`` workbook at *path*.

    The five record lists are written first, each restricted to its
    ``SHEETn_COLUMNS`` column list, followed by the three statistics frames.
    The target directory is created when missing.

    Raises:
        RuntimeError: if pandas is not available.
    """
    if pd is None:
        raise RuntimeError("缺少pandas依赖,请安装后再运行。")

    print(f" [Excel] 开始写入Excel文件: {path}")
    began = datetime.datetime.now()

    target_dir = os.path.dirname(path)
    if not target_dir:
        target_dir = "."
    os.makedirs(target_dir, exist_ok=True)

    with pd.ExcelWriter(path, engine="openpyxl") as writer:
        # (sheet title, rows, column order) for the raw record sheets
        record_sheets = (
            ("全部音频数据", sheet1_rows, SHEET1_COLUMNS),
            ("互动组件学习记录", sheet2_rows, SHEET2_COLUMNS),
            ("课程巩固记录", sheet3_rows, SHEET3_COLUMNS),
            ("单元挑战记录", sheet4_rows, SHEET4_COLUMNS),
            ("单元总结记录", sheet5_rows, SHEET5_COLUMNS),
        )
        for title, records, columns in record_sheets:
            frame = pd.DataFrame(records, columns=columns)
            frame.to_excel(writer, sheet_name=title, index=False)

        # (sheet title, ready-made frame) for the statistics sheets
        stat_sheets = (
            ("统计-互动组件通过情况", stats_component_df),
            ("统计-知识点通过情况", stats_kp_df),
            ("统计-单元总结时长", stats_unit_time_df),
        )
        for title, frame in stat_sheets:
            frame.to_excel(writer, sheet_name=title, index=False)

    elapsed = (datetime.datetime.now() - began).total_seconds()
    print(f" [Excel] 写入完成,耗时{elapsed:.2f}秒")
def get_date_str() -> str:
    """Return today's local date formatted as ``YYYYMMDD``."""
    today = datetime.datetime.now()
    return today.strftime("%Y%m%d")


def export_single_user(user_id: str, es_cfg: Dict[str, Any], pg_conn: Any, mysql_conn: Any, output_path: str, id_2_unit_index: Dict[int, int], chapter_id_to_lesson_id: Dict[int, int]) -> bool:
    """Export every data sheet of one character into an Excel workbook.

    Args:
        user_id: Character id.
        es_cfg: Elasticsearch settings.
        pg_conn: PostgreSQL connection.
        mysql_conn: MySQL connection.
        output_path: Destination ``.xlsx`` path.
        id_2_unit_index: ``story_id -> unit_id`` mapping.
        chapter_id_to_lesson_id: ``chapter_id -> lesson_id`` mapping.

    Returns:
        ``True`` when the workbook was written; ``False`` when the character
        has no records at all or any step raised (the traceback is printed).
    """
    try:
        print(f"\n[INFO] ========== 开始导出角色id={user_id} ==========")
        began = datetime.datetime.now()

        # Elasticsearch sheet
        audio_rows = fetch_es_user_audio(user_id, es_cfg)

        # PostgreSQL sheets
        play_rows = fetch_pg_play_records(user_id, pg_conn, mysql_conn)
        review_rows = fetch_pg_unit_review(user_id, pg_conn, id_2_unit_index, chapter_id_to_lesson_id)
        challenge_rows = fetch_pg_unit_challenge(user_id, pg_conn, id_2_unit_index)
        summary_rows = fetch_pg_unit_summary(user_id, pg_conn, id_2_unit_index)

        labelled_sheets = (
            ("全部音频数据", audio_rows),
            ("互动组件学习记录", play_rows),
            ("课程巩固记录", review_rows),
            ("单元挑战记录", challenge_rows),
            ("单元总结记录", summary_rows),
        )
        total_records = sum(len(sheet_rows) for _, sheet_rows in labelled_sheets)

        print(f" [统计] 数据汇总:")
        for label, sheet_rows in labelled_sheets:
            print(f" - {label}: {len(sheet_rows)}条")
        print(f" - 总计: {total_records}条")

        # Nothing to export for this character.
        if total_records == 0:
            print(f"[WARN] 角色id={user_id} 没有找到任何有效记录,跳过导出")
            return False

        component_df, kp_df, unit_time_df = generate_statistics(play_rows, summary_rows)

        write_excel(
            output_path,
            audio_rows,
            play_rows,
            review_rows,
            challenge_rows,
            summary_rows,
            component_df,
            kp_df,
            unit_time_df,
        )

        elapsed = (datetime.datetime.now() - began).total_seconds()
        print(f"[INFO] 角色id={user_id} 导出成功")
        print(f"[INFO] 文件路径: {output_path}")
        print(f"[INFO] 总耗时: {elapsed:.2f}秒")
        print(f"[INFO] ========== 完成 ==========\n")
        return True

    except Exception as e:
        print(f"[ERROR] 角色id={user_id} 导出失败: {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Resolve the configured export mode and export one workbook per role id.

    Exactly one of the module-level USER_ID, USER_ID_LIST or ACCOUNT_ID_LIST
    must be configured. In account mode each account is first resolved to its
    role ids through the ``vala_user`` MySQL database. The story/chapter
    mapping tables are loaded once from ``vala_test`` and shared by all roles.

    Raises:
        RuntimeError: If none, or more than one, of the three modes is configured.
    """

    def _close_quietly(conn):
        # Best-effort close: a failure while closing is deliberately ignored.
        if conn:
            try:
                conn.close()
            except Exception:
                pass

    load_env()

    # Each entry is (role_id, account_id); account_id is None outside account mode.
    targets: List[tuple] = []
    date_str = get_date_str()

    single_role_mode = USER_ID is not None
    role_list_mode = USER_ID_LIST is not None and len(USER_ID_LIST) > 0
    account_list_mode = ACCOUNT_ID_LIST is not None and len(ACCOUNT_ID_LIST) > 0

    # Exactly one mode may be active.
    configured_modes = sum([single_role_mode, role_list_mode, account_list_mode])
    if configured_modes == 0:
        raise RuntimeError("请配置 USER_ID、USER_ID_LIST 或 ACCOUNT_ID_LIST 中的一个")
    if configured_modes > 1:
        raise RuntimeError("USER_ID、USER_ID_LIST、ACCOUNT_ID_LIST 只能配置一个,请检查配置")

    if single_role_mode:
        # Mode 1: a single role id.
        targets = [(str(USER_ID), None)]
        print("[INFO] 运行模式:单个角色id")
    elif role_list_mode:
        # Mode 2: an explicit list of role ids.
        targets = [(str(role_id), None) for role_id in USER_ID_LIST]
        print(f"[INFO] 运行模式:角色id列表,共{len(targets)}个角色")
    else:
        # Mode 3: account ids (the only mode left once the count check passed).
        print(f"[INFO] 运行模式:账户id列表,共{len(ACCOUNT_ID_LIST)}个账户")
        account_conn = None
        try:
            # The user tables live in the vala_user database.
            account_conn = get_mysql_conn("vala_user")
            for account_id in ACCOUNT_ID_LIST:
                account_key = str(account_id)
                print(f"[INFO] 查询账户id={account_key}对应的角色id...")
                character_ids = fetch_character_ids_by_account(account_key, account_conn)
                if not character_ids:
                    print(f"[WARN] 账户id={account_key} 未找到关联的角色id,跳过")
                    continue
                print(f"[INFO] 账户id={account_key} 找到{len(character_ids)}个角色id: {character_ids}")
                targets.extend((cid, account_key) for cid in character_ids)
        finally:
            _close_quietly(account_conn)

    if not targets:
        print("[WARN] 没有需要导出的角色id,程序退出")
        return

    es_cfg = get_es_config()
    pg_conn = get_pg_conn()

    # The mapping tables are queried once and reused for every role.
    print("\n[INFO] ===== 准备工作:获取映射表 =====")
    mysql_conn = None
    id_2_unit_index = {}
    chapter_id_to_lesson_id = {}
    try:
        print("[INFO] 正在连接MySQL数据库(vala_test)...")
        # The game configuration tables live in the vala_test database.
        mysql_conn = get_mysql_conn("vala_test")
        print("[INFO] 正在获取 story_id 到 unit_id 的映射...")
        id_2_unit_index = get_id_2_unit_index(mysql_conn)
        print(f"[INFO] 成功获取 {len(id_2_unit_index)} 个 story_id 映射")
        print("[INFO] 正在获取 chapter_id 到 lesson_id 的映射...")
        chapter_id_to_lesson_id = get_chapter_id_to_lesson_id(mysql_conn)
        print(f"[INFO] 成功获取 {len(chapter_id_to_lesson_id)} 个 chapter_id 映射")
    except Exception as e:
        print(f"[ERROR] 获取映射表失败: {e}")
        import traceback
        traceback.print_exc()
        _close_quietly(pg_conn)
        _close_quietly(mysql_conn)
        return

    total = len(targets)
    banner = "=" * 60
    try:
        success_count = 0
        skip_count = 0

        print("\n[INFO] ===== 开始批量导出 =====")
        print(f"[INFO] 共需导出{total}个角色\n")
        batch_start_time = datetime.datetime.now()

        for position, (user_id, account_id) in enumerate(targets, 1):
            print(f"\n{banner}")
            print(f"[INFO] 进度: {position}/{total} ({position * 100 // total}%)")
            print(banner)

            # Account mode prefixes the file name with the owning account id.
            filename = (
                f"角色id_{user_id}_导出时间_{date_str}.xlsx"
                if account_id is None
                else f"账户id_{account_id}_角色id_{user_id}_导出时间_{date_str}.xlsx"
            )
            output_path = os.path.join(OUTPUT_DIR, filename)

            exported = export_single_user(user_id, es_cfg, pg_conn, mysql_conn, output_path, id_2_unit_index, chapter_id_to_lesson_id)
            if exported:
                success_count += 1
            else:
                skip_count += 1

        batch_total_time = (datetime.datetime.now() - batch_start_time).total_seconds()
        print(f"\n{banner}")
        print("[INFO] ===== 全部导出完成 =====")
        print(f"[INFO] 总计: {total}个角色")
        print(f"[INFO] 成功: {success_count}个")
        print(f"[INFO] 跳过: {skip_count}个")
        print(f"[INFO] 总耗时: {batch_total_time:.2f}秒 ({batch_total_time / 60:.2f}分钟)")
        if success_count > 0:
            print(f"[INFO] 平均每个角色: {batch_total_time / success_count:.2f}秒")
        print(f"{banner}\n")

    finally:
        _close_quietly(pg_conn)
        _close_quietly(mysql_conn)
def _require_env(name: str) -> str:
    """Return environment variable *name*; raise RuntimeError if unset or empty."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"缺少环境变量 {name}")
    return value


def _quote_identifier(name: str, quote: str) -> str:
    """Quote a SQL identifier with *quote*, doubling any embedded quote character."""
    return quote + str(name).replace(quote, quote * 2) + quote


def _print_banner(title: str) -> None:
    """Print the section header shown before each connectivity check."""
    print("\n" + "=" * 60)
    print(title)
    print("=" * 60)


def _close_quietly(conn) -> None:
    """Close *conn* if it was opened; a failure while closing is ignored."""
    if conn is None:
        return
    try:
        conn.close()
    except Exception:
        # The check result is already decided; closing is best-effort.
        pass


def test_postgresql():
    """Check the PostgreSQL connection configured through PG_DB_* variables.

    Connects, lists up to five tables of the ``public`` schema and reads one
    row from the first of them.

    Returns:
        True if every step succeeded, False otherwise (the error is printed).
    """
    _print_banner("测试 PostgreSQL(Online)连接")

    conn = None
    try:
        # Credentials come from the environment so no secret lives in the repo.
        host = _require_env("PG_DB_HOST")
        port = int(_require_env("PG_DB_PORT"))
        user = _require_env("PG_DB_USER")
        password = _require_env("PG_DB_PASSWORD")
        dbname = _require_env("PG_DB_DATABASE")

        conn = psycopg2.connect(
            host=host,
            port=port,
            user=user,
            password=password,
            dbname=dbname,
            connect_timeout=10,
        )
        print("✅ PostgreSQL 连接成功!")

        with conn.cursor() as cur:
            cur.execute("SELECT tablename FROM pg_tables WHERE schemaname = 'public' LIMIT 5")
            tables = cur.fetchall()
            print(f"✅ 查询成功!找到前5个表:{[t[0] for t in tables]}")

            if tables:
                table = tables[0][0]
                # Quote the name: mixed-case or reserved-word tables fail unquoted.
                cur.execute(f"SELECT * FROM {_quote_identifier(table, chr(34))} LIMIT 1")
                row = cur.fetchone()
                print(f"✅ 从表 {table} 读取到1条数据:{row if row else '空表'}")

        return True

    except Exception as e:
        print(f"❌ PostgreSQL 连接/查询失败:{str(e)[:200]}")
        return False

    finally:
        _close_quietly(conn)


def _check_mysql(label: str, env_suffix: str, read_sample_row: bool) -> bool:
    """Run one MySQL connectivity check.

    Args:
        label: Name used in the printed messages, e.g. ``MySQL(Test)``.
        env_suffix: Suffix of the MYSQL_* variable names ("" or "_online").
        read_sample_row: Also read one row from the first table of the first
            database when True.

    Returns:
        True if every step succeeded, False otherwise (the error is printed).
    """
    conn = None
    try:
        host = _require_env(f"MYSQL_HOST{env_suffix}")
        port = int(_require_env(f"MYSQL_PORT{env_suffix}"))
        user = _require_env(f"MYSQL_USERNAME{env_suffix}")
        password = _require_env(f"MYSQL_PASSWORD{env_suffix}")

        conn = pymysql.connect(
            host=host,
            port=port,
            user=user,
            password=password,
            connect_timeout=10,
        )
        print(f"✅ {label}连接成功!")

        with conn.cursor() as cur:
            # SHOW statements accept LIKE/WHERE but not LIMIT, so trim client-side.
            cur.execute("SHOW DATABASES")
            dbs = cur.fetchall()[:5]
            print(f"✅ 查询成功!找到前5个数据库:{[db[0] for db in dbs]}")

            if read_sample_row and dbs:
                cur.execute(f"USE {_quote_identifier(dbs[0][0], '`')}")
                cur.execute("SHOW TABLES")
                table = cur.fetchone()
                if table:
                    cur.execute(f"SELECT * FROM {_quote_identifier(table[0], '`')} LIMIT 1")
                    row = cur.fetchone()
                    print(f"✅ 从表 {table[0]} 读取到1条数据:{row if row else '空表'}")

        return True

    except Exception as e:
        print(f"❌ {label}连接/查询失败:{str(e)[:200]}")
        return False

    finally:
        _close_quietly(conn)


def test_mysql_test():
    """Check the Test MySQL instance configured through MYSQL_* variables."""
    _print_banner("测试 MySQL(Test环境)连接")
    return _check_mysql("MySQL(Test)", "", read_sample_row=True)


def test_mysql_online():
    """Check the Online MySQL instance configured through MYSQL_*_online variables."""
    _print_banner("测试 MySQL(Online)连接")
    return _check_mysql("MySQL(Online)", "_online", read_sample_row=False)


def test_es_online():
    """Check the Elasticsearch cluster configured through ES_* variables.

    Requests the cluster root and, when that answers 200, the index list.

    Returns:
        True if the root request returned HTTP 200, False otherwise.
    """
    _print_banner("测试 Elasticsearch(Online)连接")

    try:
        scheme = os.environ.get("ES_SCHEME", "https")
        host = _require_env("ES_HOST")
        port = os.environ.get("ES_PORT", "9200")
        url = f"{scheme}://{host}:{port}"
        auth = HTTPBasicAuth(_require_env("ES_USER"), _require_env("ES_PASSWORD"))

        # NOTE(review): verify=False disables TLS certificate validation, which
        # exposes the basic-auth credentials to interception. Kept as in the
        # original; prefer passing a CA bundle once one is available.
        response = requests.get(url, auth=auth, verify=False, timeout=10)

        if response.status_code != 200:
            print(f"❌ ES 连接失败:HTTP {response.status_code}")
            return False

        info = response.json()
        print(f"✅ ES 连接成功!集群名称:{info.get('cluster_name')}")

        indices_resp = requests.get(
            f"{url}/_cat/indices?format=json",
            auth=auth,
            verify=False,
            timeout=10,
        )
        if indices_resp.status_code == 200:
            indices = indices_resp.json()
            print(f"✅ 查询成功!索引数量:{len(indices)}")
            if indices:
                print(f" 前3个索引:{[idx['index'] for idx in indices[:3]]}")

        return True

    except Exception as e:
        print(f"❌ ES 连接/查询失败:{str(e)[:200]}")
        return False


if __name__ == "__main__":
    print("开始测试所有数据库连接...")

    results = {}
    results["PostgreSQL(Online)"] = test_postgresql()
    results["MySQL(Test)"] = test_mysql_test()
    results["MySQL(Online)"] = test_mysql_online()
    results["ES(Online)"] = test_es_online()

    print("\n" + "=" * 60)
    print("测试总结")
    print("=" * 60)
    for name, result in results.items():
        status = "✅ 正常" if result else "❌ 异常"
        print(f"{name}: {status}")