diff --git a/.gitignore b/.gitignore index abde2a6..6bb3d29 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,9 @@ __pycache__/ .openclaw/ .clawhub/ +# 归档备份目录 +backup/ + # 日志文件 *.log logs/ diff --git a/.vala_skill_hashes b/.vala_skill_hashes index 3fbe56d..83768e5 100644 --- a/.vala_skill_hashes +++ b/.vala_skill_hashes @@ -10,3 +10,4 @@ smart-auto-model-switch f94a7b1ff6357c287a374366b339cd06b0dde5ca9e965a71a369ff66 vala-component-oops-stat 2e3a3dc8ee252f133256ec5fb8edc031fc25ff602158fd2ddf778ce4d2b961f2 vala-component-practice-stat 8e768e2641019d27bd41f4647d2d90f24182a0554dad5ad9f4136e9ce0bae147 cron-schedule e103cbb1806b28c891b9c856963325086ecaff32edec208f0a841865f26e8f3e +refund-user-learning-analysis 18f23bb62167466319ab667225cad5fc654680fe7820a1f3d87cfe98c30e96fd diff --git a/AGENTS.md b/AGENTS.md index 557d7ae..9456509 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -133,6 +133,34 @@ System: [...] Feishu[xiaobian] group oc_xxx | 发送者 (ou_xxx) [msg:om_xxx] 参与,而非主导。质量 > 数量。 +## 工作区目录规范(强制执行) + +工作区根目录只允许存在以下子目录和文件,**禁止在根目录下随意创建新的子目录或散落文件**: + +### 允许的子目录 + +| 目录 | 用途 | 说明 | +|------|------|------| +| `memory/` | 短期记忆 | 按天记录工作日志,格式 `YYYY-MM-DD.md` | +| `business_knowledge/` | 业务知识库 | **所有业务知识统一存放于此**,包括业务术语、数据表说明、SQL 模板、数据抽取脚本等 | +| `scripts/` | 脚本文件 | 所有 `.py`、`.sh`、`.sql` 等脚本文件必须放在此目录 | +| `output/` | 输出文件 | 所有生成的报表(`.xlsx`、`.csv`)、日志(`.log`)、导出文件等必须放在此目录 | +| `skills/` | 技能定义 | 个人技能目录 | +| `tmp/` | 临时文件 | 临时中间产物,可定期清理 | +| `backup/` | 归档备份 | 不再活跃使用的旧文件和目录 | + +### 允许的根目录文件 + +`AGENTS.md`、`SOUL.md`、`USER.md`、`MEMORY.md`、`TOOLS.md`、`IDENTITY.md`、`HEARTBEAT.md`、`BOOTSTRAP.md`、`secrets.env`、`.env`、`.gitignore` + +### 强制规则 + +1. **脚本文件** → 始终创建在 `scripts/` 目录下,绝不放在根目录 +2. **输出文件**(xlsx/csv/log/报表等)→ 始终创建在 `output/` 目录下,绝不放在根目录 +3. **业务知识** → 统一记录到 `business_knowledge/` 目录 +4. **新增子目录** → 禁止在根目录下随意创建新子目录。如有特殊需要,须经技术负责人确认 +5. **临时文件** → 使用 `tmp/`,用完即清 + ## 工具 Skills 提供你的工具。当你需要某个工具时,查看它的 `SKILL.md`。在 `TOOLS.md` 中保存环境相关的备注(数据库连接、API 配置等)。 diff --git a/ai_member_xiaoxi b/ai_member_xiaoxi deleted file mode 160000 index c7e1952..0000000 --- a/ai_member_xiaoxi +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c7e1952f72fd3e7379b14c10a87540fa2ce84037 diff --git a/ai_member_xiaoxi.bfg-report/2026-03-03/16-01-30/cache-stats.txt b/ai_member_xiaoxi.bfg-report/2026-03-03/16-01-30/cache-stats.txt deleted file mode 100644 index d0b8d79..0000000 --- a/ai_member_xiaoxi.bfg-report/2026-03-03/16-01-30/cache-stats.txt +++ /dev/null @@ -1,4 +0,0 @@ -(apply,CacheStats{hitCount=16, missCount=5, loadSuccessCount=5, loadExceptionCount=0, totalLoadTime=28346537, evictionCount=0}) -(tree,CacheStats{hitCount=5, missCount=8, loadSuccessCount=8, loadExceptionCount=0, totalLoadTime=13193310, evictionCount=0}) -(commit,CacheStats{hitCount=2, missCount=3, loadSuccessCount=3, loadExceptionCount=0, totalLoadTime=27853456, evictionCount=0}) -(tag,CacheStats{hitCount=0, missCount=0, loadSuccessCount=0, loadExceptionCount=0, totalLoadTime=0, evictionCount=0}) diff --git a/ai_member_xiaoxi.bfg-report/2026-03-03/16-01-30/changed-files.txt b/ai_member_xiaoxi.bfg-report/2026-03-03/16-01-30/changed-files.txt deleted file mode 100644 index 32bf58d..0000000 --- a/ai_member_xiaoxi.bfg-report/2026-03-03/16-01-30/changed-files.txt +++ /dev/null @@ -1 +0,0 @@ -8e09023a9bda22c602a1c8dc1eb2f7a0ebdcfb2d 3062188004198c6a73c95042a620fd20f1492c39 TOOLS.md diff --git a/ai_member_xiaoxi.bfg-report/2026-03-03/16-01-30/object-id-map.old-new.txt b/ai_member_xiaoxi.bfg-report/2026-03-03/16-01-30/object-id-map.old-new.txt deleted file mode 100644 index 1013c09..0000000 --- a/ai_member_xiaoxi.bfg-report/2026-03-03/16-01-30/object-id-map.old-new.txt +++ /dev/null @@ -1,3 +0,0 @@ -037a62079890d989cdcd02b38d4e3951a92fc87e 8ab4e0749014d066411ca1b40bc3db69af9a0c6e -2ee12bae8e137cf71fe5ab95293623d2017b7985 339001c1df863b8e5eaaf54ad3fff2d0aa35bce5 -b91ce3a3a73bd86c4ee7f2616c8259fb8ae1bcfc 85f58778f25a6ac5a5959531ef601405e827e902 diff --git a/backup_git b/backup_git deleted file mode 160000 index 2ee12ba..0000000 --- a/backup_git +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2ee12bae8e137cf71fe5ab95293623d2017b7985 diff --git a/bfg.jar b/bfg.jar deleted file mode 100644 index 688fe71..0000000 Binary files a/bfg.jar and /dev/null differ diff --git a/makee_vala/business_rules.md b/business_knowledge/business_rules.md similarity index 100% rename from makee_vala/business_rules.md rename to business_knowledge/business_rules.md diff --git a/database/mysql_online_full.txt b/database/mysql_online_full.txt deleted file mode 100644 index 5434b21..0000000 --- a/database/mysql_online_full.txt +++ /dev/null @@ -1,441 +0,0 @@ -TABLE_NAME COLUMN_NAME DATA_TYPE IS_NULLABLE -casbin_rule id bigint NO -casbin_rule ptype varchar YES -casbin_rule v0 varchar YES -casbin_rule v1 varchar YES -casbin_rule v2 varchar YES -casbin_rule v3 varchar YES -casbin_rule v4 varchar YES -casbin_rule v5 varchar YES -exa_customers id bigint NO -exa_customers created_at datetime YES -exa_customers updated_at datetime YES -exa_customers deleted_at datetime YES -exa_customers customer_name varchar YES -exa_customers customer_phone_data varchar YES -exa_customers sys_user_id bigint YES -exa_customers sys_user_authority_id bigint YES -exa_file_chunks id bigint NO -exa_file_chunks created_at datetime YES -exa_file_chunks updated_at datetime YES -exa_file_chunks deleted_at datetime YES -exa_file_chunks exa_file_id bigint YES -exa_file_chunks file_chunk_number bigint YES -exa_file_chunks file_chunk_path varchar YES -exa_file_upload_and_downloads id bigint NO -exa_file_upload_and_downloads name varchar YES -exa_file_upload_and_downloads url varchar YES -exa_file_upload_and_downloads tag varchar YES -exa_file_upload_and_downloads key varchar YES -exa_file_upload_and_downloads created_at datetime YES -exa_file_upload_and_downloads updated_at datetime YES -exa_file_upload_and_downloads deleted_at datetime YES -exa_files id bigint NO -exa_files created_at datetime YES -exa_files updated_at datetime YES -exa_files deleted_at datetime YES -exa_files file_name varchar YES -exa_files file_md5 varchar YES -exa_files file_path varchar YES -exa_files chunk_total bigint YES -exa_files is_finish tinyint YES -jwt_blacklists id bigint NO -jwt_blacklists created_at datetime YES -jwt_blacklists updated_at datetime YES -jwt_blacklists deleted_at datetime YES -jwt_blacklists jwt text YES -sys_apis id bigint NO -sys_apis created_at datetime YES -sys_apis updated_at datetime YES -sys_apis deleted_at datetime YES -sys_apis path varchar YES -sys_apis description varchar YES -sys_apis api_group varchar YES -sys_apis method varchar YES -sys_authorities created_at datetime YES -sys_authorities updated_at datetime YES -sys_authorities deleted_at datetime YES -sys_authorities authority_id bigint NO -sys_authorities authority_name varchar YES -sys_authorities parent_id bigint YES -sys_authorities default_router varchar YES -sys_authority_btns authority_id bigint YES -sys_authority_btns sys_menu_id bigint YES -sys_authority_btns sys_base_menu_btn_id bigint YES -sys_authority_menus sys_base_menu_id bigint NO -sys_authority_menus sys_authority_authority_id bigint NO -sys_auto_code_histories id bigint NO -sys_auto_code_histories created_at datetime YES -sys_auto_code_histories updated_at datetime YES -sys_auto_code_histories deleted_at datetime YES -sys_auto_code_histories package varchar YES -sys_auto_code_histories business_db varchar YES -sys_auto_code_histories table_name varchar YES -sys_auto_code_histories request_meta text YES -sys_auto_code_histories auto_code_path text YES -sys_auto_code_histories injection_meta text YES -sys_auto_code_histories struct_name varchar YES -sys_auto_code_histories struct_cn_name varchar YES -sys_auto_code_histories api_ids varchar YES -sys_auto_code_histories flag bigint YES -sys_auto_codes id bigint NO -sys_auto_codes created_at datetime YES -sys_auto_codes updated_at datetime YES -sys_auto_codes deleted_at datetime YES -sys_auto_codes package_name varchar YES -sys_auto_codes label varchar YES -sys_auto_codes desc varchar YES -sys_base_menu_btns id bigint NO -sys_base_menu_btns created_at datetime YES -sys_base_menu_btns updated_at datetime YES -sys_base_menu_btns deleted_at datetime YES -sys_base_menu_btns name varchar YES -sys_base_menu_btns desc varchar YES -sys_base_menu_btns sys_base_menu_id bigint YES -sys_base_menu_parameters id bigint NO -sys_base_menu_parameters created_at datetime YES -sys_base_menu_parameters updated_at datetime YES -sys_base_menu_parameters deleted_at datetime YES -sys_base_menu_parameters sys_base_menu_id bigint YES -sys_base_menu_parameters type varchar YES -sys_base_menu_parameters key varchar YES -sys_base_menu_parameters value varchar YES -sys_base_menus id bigint NO -sys_base_menus created_at datetime YES -sys_base_menus updated_at datetime YES -sys_base_menus deleted_at datetime YES -sys_base_menus menu_level bigint YES -sys_base_menus parent_id varchar YES -sys_base_menus path varchar YES -sys_base_menus name varchar YES -sys_base_menus hidden tinyint YES -sys_base_menus component varchar YES -sys_base_menus sort bigint YES -sys_base_menus active_name varchar YES -sys_base_menus keep_alive tinyint YES -sys_base_menus default_menu tinyint YES -sys_base_menus title varchar YES -sys_base_menus icon varchar YES -sys_base_menus close_tab tinyint YES -sys_chat_gpt_options sk varchar YES -sys_data_authority_id sys_authority_authority_id bigint NO -sys_data_authority_id data_authority_id_authority_id bigint NO -sys_dictionaries id bigint NO -sys_dictionaries created_at datetime YES -sys_dictionaries updated_at datetime YES -sys_dictionaries deleted_at datetime YES -sys_dictionaries name varchar YES -sys_dictionaries type varchar YES -sys_dictionaries status tinyint YES -sys_dictionaries desc varchar YES -sys_dictionary_details id bigint NO -sys_dictionary_details created_at datetime YES -sys_dictionary_details updated_at datetime YES -sys_dictionary_details deleted_at datetime YES -sys_dictionary_details label varchar YES -sys_dictionary_details value bigint YES -sys_dictionary_details extend varchar YES -sys_dictionary_details status tinyint YES -sys_dictionary_details sort bigint YES -sys_dictionary_details sys_dictionary_id bigint YES -sys_operation_records id bigint NO -sys_operation_records updated_at datetime YES -sys_operation_records nick_name varchar YES -sys_operation_records body text YES -sys_operation_records path varchar YES -sys_operation_records ip varchar YES -sys_operation_records method varchar YES -sys_operation_records status bigint YES -sys_operation_records latency bigint YES -sys_operation_records agent varchar YES -sys_operation_records error_message varchar YES -sys_operation_records resp text YES -sys_operation_records user_id bigint YES -sys_operation_records deleted_at datetime YES -sys_operation_records created_at datetime YES -sys_user_authority sys_user_id bigint NO -sys_user_authority sys_authority_authority_id bigint NO -sys_users id bigint NO -sys_users uuid varchar YES -sys_users username varchar YES -sys_users password varchar YES -sys_users nick_name varchar YES -sys_users authority_id bigint YES -sys_users side_mode varchar YES -sys_users header_img varchar YES -sys_users base_color varchar YES -sys_users active_color varchar YES -sys_users phone varchar YES -sys_users email varchar YES -sys_users enable bigint YES -sys_users created_at datetime YES -sys_users updated_at datetime YES -sys_users deleted_at datetime YES -vala_app_account id bigint NO -vala_app_account tel varchar YES -vala_app_account tel_encrypt varchar YES -vala_app_account name varchar YES -vala_app_account id_card varchar YES -vala_app_account status tinyint NO -vala_app_account pay_status int YES -vala_app_account login_times int YES -vala_app_account remark text YES -vala_app_account key_from varchar NO -vala_app_account download_channel varchar YES -vala_app_account created_at datetime YES -vala_app_account updated_at datetime YES -vala_app_account deleted_at datetime YES -vala_app_character id bigint NO -vala_app_character account_id bigint YES -vala_app_character nickname varchar YES -vala_app_character gender tinyint YES -vala_app_character birthday varchar YES -vala_app_character avatar text YES -vala_app_character spine_name varchar YES -vala_app_character latest_login datetime YES -vala_app_character reputation char YES -vala_app_character robots_cnt int YES -vala_app_character head_image varchar YES -vala_app_character status tinyint YES -vala_app_character purchase_season_package text YES -vala_app_character total_sp_point int YES -vala_app_character balance_sp_point int YES -vala_app_character point int YES -vala_app_character pay_status tinyint YES -vala_app_character created_at datetime YES -vala_app_character updated_at datetime YES -vala_app_character deleted_at datetime YES -vala_app_character mood int NO -vala_app_character mood_start_time bigint NO -vala_applet_user id bigint NO -vala_applet_user tel varchar YES -vala_applet_user open_id varchar YES -vala_applet_user session_key varchar YES -vala_applet_user union_id varchar YES -vala_applet_user status tinyint NO -vala_applet_user created_at datetime YES -vala_applet_user updated_at datetime YES -vala_applet_user deleted_at datetime YES -vala_card_use_statistics id bigint NO -vala_card_use_statistics user_id int NO -vala_card_use_statistics card_type tinyint YES -vala_card_use_statistics card_id int YES -vala_card_use_statistics count_num int NO -vala_card_use_statistics created_at datetime YES -vala_card_use_statistics updated_at datetime YES -vala_card_use_statistics deleted_at datetime YES -vala_chat id bigint NO -vala_chat user_id bigint NO -vala_chat mod_id bigint NO -vala_chat npc_id bigint NO -vala_chat chat_id varchar NO -vala_chat msg varchar NO -vala_chat assistant text YES -vala_chat created_at datetime YES -vala_chat updated_at datetime YES -vala_chat deleted_at datetime YES -vala_friend_relation id int NO -vala_friend_relation resource_id int NO -vala_friend_relation relation_type tinyint YES -vala_friend_relation chapter_id int NO -vala_friend_relation component_id int NO -vala_friend_relation interact_times int NO -vala_friend_relation favor_level int NO -vala_friend_relation content_show varchar YES -vala_friend_relation created_at datetime YES -vala_friend_relation updated_at datetime YES -vala_friend_relation deleted_at datetime YES -vala_invitation_code id bigint NO -vala_invitation_code code varchar YES -vala_invitation_code owner varchar YES -vala_invitation_code status tinyint NO -vala_invitation_code created_at datetime YES -vala_invitation_code updated_at datetime YES -vala_invitation_code deleted_at datetime YES -vala_invitation_code_log id bigint NO -vala_invitation_code_log code_id bigint NO -vala_invitation_code_log ip_address varchar YES -vala_invitation_code_log city varchar YES -vala_invitation_code_log created_at datetime YES -vala_invitation_code_log updated_at datetime YES -vala_invitation_code_log deleted_at timestamp YES -vala_relation_level id int NO -vala_relation_level level int NO -vala_relation_level upgrade_need_value int NO -vala_relation_level total_require_value int NO -vala_relation_level favorability_lower_limit int NO -vala_relation_level favorability_upper_limit int NO -vala_relation_level created_at datetime YES -vala_relation_level updated_at datetime YES -vala_relation_level deleted_at datetime YES -vala_seasonal_ticket id bigint NO -vala_seasonal_ticket unique_code varchar YES -vala_seasonal_ticket account_id bigint YES -vala_seasonal_ticket account_name varchar YES -vala_seasonal_ticket tel varchar YES -vala_seasonal_ticket give_time int YES -vala_seasonal_ticket expire_time int YES -vala_seasonal_ticket expire_type tinyint NO -vala_seasonal_ticket ticket_type tinyint NO -vala_seasonal_ticket used_time int YES -vala_seasonal_ticket season_package_id bigint YES -vala_seasonal_ticket season_package_name varchar YES -vala_seasonal_ticket status tinyint NO -vala_seasonal_ticket created_at datetime YES -vala_seasonal_ticket updated_at datetime YES -vala_seasonal_ticket deleted_at datetime YES -vala_user_card_component_record id bigint NO -vala_user_card_component_record user_id bigint NO -vala_user_card_component_record game_id bigint YES -vala_user_card_component_record chapter_id bigint YES -vala_user_card_component_record component_type bigint YES -vala_user_card_component_record sub_component_type bigint YES -vala_user_card_component_record component_id bigint NO -vala_user_card_component_record card_content text YES -vala_user_card_component_record is_new tinyint NO -vala_user_card_component_record created_at datetime YES -vala_user_card_component_record updated_at datetime YES -vala_user_card_component_record deleted_at datetime YES -vala_user_card_plot_upgrade id bigint NO -vala_user_card_plot_upgrade user_id bigint NO -vala_user_card_plot_upgrade chapter_id bigint YES -vala_user_card_plot_upgrade plot_card_id bigint NO -vala_user_card_plot_upgrade is_have_upgrade bigint YES -vala_user_card_plot_upgrade white_card_status tinyint NO -vala_user_card_plot_upgrade flash_card_status tinyint NO -vala_user_card_plot_upgrade is_new tinyint YES -vala_user_card_plot_upgrade created_at datetime YES -vala_user_card_plot_upgrade updated_at datetime YES -vala_user_card_plot_upgrade deleted_at datetime YES -vala_user_chapter_and_component_record id bigint NO -vala_user_chapter_and_component_record user_id bigint NO -vala_user_chapter_and_component_record chapter_id text NO -vala_user_chapter_and_component_record component_id text NO -vala_user_chapter_and_component_record created_at datetime YES -vala_user_chapter_and_component_record updated_at datetime YES -vala_user_chapter_and_component_record deleted_at datetime YES -vala_user_chapter_finish_record id bigint NO -vala_user_chapter_finish_record user_id bigint NO -vala_user_chapter_finish_record game_id bigint YES -vala_user_chapter_finish_record chapter_id bigint NO -vala_user_chapter_finish_record created_at datetime YES -vala_user_chapter_finish_record updated_at datetime YES -vala_user_chapter_finish_record deleted_at datetime YES -vala_user_chat_log id bigint NO -vala_user_chat_log user_id bigint NO -vala_user_chat_log game_id bigint YES -vala_user_chat_log chapter_id bigint YES -vala_user_chat_log sub_component_type bigint YES -vala_user_chat_log component_id bigint NO -vala_user_chat_log npc_id bigint NO -vala_user_chat_log session_id varchar NO -vala_user_chat_log dialog_list text YES -vala_user_chat_log dialog_score text YES -vala_user_chat_log tencent_soe text YES -vala_user_chat_log chat_review_resp text YES -vala_user_chat_log cost_time int NO -vala_user_chat_log insert_time datetime YES -vala_user_chat_log log_type tinyint YES -vala_user_chat_log deal_status tinyint YES -vala_user_chat_log created_at datetime YES -vala_user_chat_log updated_at datetime YES -vala_user_chat_log deleted_at datetime YES -vala_user_drama_log id bigint NO -vala_user_drama_log user_id bigint NO -vala_user_drama_log game_id bigint YES -vala_user_drama_log chapter_id bigint YES -vala_user_drama_log component_id bigint YES -vala_user_drama_log npc_id bigint YES -vala_user_drama_log created_at datetime YES -vala_user_drama_log updated_at datetime YES -vala_user_drama_log deleted_at datetime YES -vala_user_friend_info id bigint NO -vala_user_friend_info user_id bigint NO -vala_user_friend_info npc_id bigint NO -vala_user_friend_info favorability_value int YES -vala_user_friend_info level int YES -vala_user_friend_info relation_type tinyint YES -vala_user_friend_info interact_times int YES -vala_user_friend_info story_id int YES -vala_user_friend_info created_at datetime YES -vala_user_friend_info updated_at datetime YES -vala_user_friend_info deleted_at datetime YES -vala_user_game_finish_record id bigint NO -vala_user_game_finish_record user_id bigint NO -vala_user_game_finish_record game_id bigint YES -vala_user_game_finish_record created_at datetime YES -vala_user_game_finish_record updated_at datetime YES -vala_user_game_finish_record deleted_at datetime YES -vala_user_interact_chat_log id bigint NO -vala_user_interact_chat_log user_id bigint NO -vala_user_interact_chat_log npc_id bigint NO -vala_user_interact_chat_log session_id varchar YES -vala_user_interact_chat_log type varchar NO -vala_user_interact_chat_log name varchar NO -vala_user_interact_chat_log text varchar NO -vala_user_interact_chat_log interactive_type text YES -vala_user_interact_chat_log interactive_selected tinyint YES -vala_user_interact_chat_log chat_time datetime YES -vala_user_interact_chat_log created_at datetime YES -vala_user_interact_chat_log updated_at datetime YES -vala_user_interact_chat_log deleted_at datetime YES -vala_user_interact_log id bigint NO -vala_user_interact_log user_id bigint NO -vala_user_interact_log component_type tinyint YES -vala_user_interact_log relation_type tinyint YES -vala_user_interact_log npc_id bigint NO -vala_user_interact_log session_id varchar NO -vala_user_interact_log summary text NO -vala_user_interact_log dialog_list text YES -vala_user_interact_log dialog_score text YES -vala_user_interact_log tencent_soe text YES -vala_user_interact_log chat_review_resp text YES -vala_user_interact_log cost_time int NO -vala_user_interact_log insert_time datetime YES -vala_user_interact_log created_at datetime YES -vala_user_interact_log updated_at datetime YES -vala_user_interact_log deleted_at datetime YES -vala_user_interact_log favor_value int YES -vala_user_learn_record id int NO -vala_user_learn_record user_id bigint NO -vala_user_learn_record game_id bigint NO -vala_user_learn_record chapter_id bigint NO -vala_user_learn_record score float NO -vala_user_learn_record score_level char YES -vala_user_learn_record unique_code varchar YES -vala_user_learn_record source_type tinyint YES -vala_user_learn_record created_at datetime YES -vala_user_learn_record updated_at datetime YES -vala_user_learn_record deleted_at timestamp YES -vala_user_long_term_memory id bigint NO -vala_user_long_term_memory user_id int YES -vala_user_long_term_memory npc_id int YES -vala_user_long_term_memory robot_id int YES -vala_user_long_term_memory config_id int YES -vala_user_long_term_memory session_id varchar YES -vala_user_long_term_memory log_id int YES -vala_user_long_term_memory source varchar YES -vala_user_long_term_memory source_type varchar YES -vala_user_long_term_memory memory_type varchar YES -vala_user_long_term_memory sub_type varchar YES -vala_user_long_term_memory user_content text YES -vala_user_long_term_memory system_content text YES -vala_user_long_term_memory created_at datetime YES -vala_user_long_term_memory updated_at datetime YES -vala_user_long_term_memory deleted_at datetime YES -vala_user_robot id int NO -vala_user_robot user_id bigint NO -vala_user_robot robot_id bigint NO -vala_user_robot robot_level tinyint YES -vala_user_robot created_at datetime YES -vala_user_robot updated_at datetime YES -vala_user_robot deleted_at timestamp YES -vala_user_word_statistics id bigint NO -vala_user_word_statistics user_id int NO -vala_user_word_statistics word_pattern_id varchar YES -vala_user_word_statistics count_num int NO -vala_user_word_statistics content_type char YES -vala_user_word_statistics source_type tinyint YES -vala_user_word_statistics created_at datetime YES -vala_user_word_statistics updated_at datetime YES -vala_user_word_statistics deleted_at datetime YES diff --git a/database/mysql_test_full.txt b/database/mysql_test_full.txt deleted file mode 100644 index e1029bb..0000000 --- a/database/mysql_test_full.txt +++ /dev/null @@ -1,971 +0,0 @@ -TABLE_NAME COLUMN_NAME DATA_TYPE IS_NULLABLE -ai_reviewer_issue_records id int NO -ai_reviewer_issue_records table_name varchar NO -ai_reviewer_issue_records record_key varchar NO -ai_reviewer_issue_records issue_type varchar NO -ai_reviewer_issue_records issue_location varchar YES -ai_reviewer_issue_records issue_description text NO -ai_reviewer_issue_records issue_detail text YES -ai_reviewer_issue_records source_content text YES -ai_reviewer_issue_records raw_llm_output text YES -ai_reviewer_issue_records review_model varchar YES -ai_reviewer_issue_records review_result varchar YES -ai_reviewer_issue_records review_reason text YES -ai_reviewer_issue_records is_notified tinyint YES -ai_reviewer_issue_records is_exported tinyint YES -ai_reviewer_issue_records created_at datetime YES -ai_reviewer_issue_records notified_at datetime YES -ai_reviewer_scan_metadata id int NO -ai_reviewer_scan_metadata table_name varchar NO -ai_reviewer_scan_metadata last_full_scan_at datetime YES -ai_reviewer_scan_metadata full_scan_status varchar NO -ai_reviewer_scan_metadata scan_offset_hours int NO -ai_reviewer_scan_metadata total_records_scanned int YES -ai_reviewer_scan_metadata created_at datetime YES -ai_reviewer_scan_metadata updated_at datetime YES -ai_reviewer_scan_records id int NO -ai_reviewer_scan_records table_name varchar NO -ai_reviewer_scan_records record_key varchar NO -ai_reviewer_scan_records last_updated_at datetime NO -ai_reviewer_scan_records last_scanned_at datetime NO -ai_reviewer_scan_records scan_status varchar NO -ai_reviewer_scan_records has_issues tinyint YES -ai_reviewer_scan_records content_hash varchar YES -ai_reviewer_scan_records created_at datetime YES -ai_reviewer_scan_records updated_at datetime YES -casbin_rule id bigint NO -casbin_rule ptype varchar YES -casbin_rule v0 varchar YES -casbin_rule v1 varchar YES -casbin_rule v2 varchar YES -casbin_rule v3 varchar YES -casbin_rule v4 varchar YES -casbin_rule v5 varchar YES -chapter_task_desc id bigint NO -chapter_task_desc chapter_id bigint NO -chapter_task_desc section_target varchar NO -chapter_task_desc reviewCount int NO -chapter_task_desc task_desc text YES -chapter_task_desc question_list text YES -chapter_task_desc created_at datetime YES -chapter_task_desc updated_at datetime YES -chapter_task_desc deleted_at datetime YES -core_interaction_component id bigint NO -core_interaction_component c_type varchar YES -core_interaction_component c_id varchar YES -core_interaction_component title varchar YES -core_interaction_component component_config text YES -core_interaction_component notice_component_config text YES -core_interaction_component err_msg text YES -core_interaction_component material_img_list text YES -core_interaction_component audio_list text YES -core_interaction_component text_analysis text YES -core_interaction_component related_path text YES -core_interaction_component read_status tinyint YES -core_interaction_component config_status tinyint YES -core_interaction_component lock_status tinyint YES -core_interaction_component kp_relation_info text YES -core_interaction_component created_at datetime YES -core_interaction_component updated_at datetime YES -core_interaction_component deleted_at datetime YES -dev_knowledge_point_config id int NO -dev_knowledge_point_config content varchar NO -dev_knowledge_point_config type varchar NO -dev_knowledge_point_config definition text NO -dev_knowledge_point_config example text YES -dev_knowledge_point_config created_at timestamp NO -dev_knowledge_point_config updated_at timestamp NO -dev_question_system_config id int NO -dev_question_system_config knowledge_point_id int NO -dev_question_system_config question_type varchar NO -dev_question_system_config question_type_name varchar NO -dev_question_system_config content text NO -dev_question_system_config format varchar NO -dev_question_system_config options json YES -dev_question_system_config answer int YES -dev_question_system_config fill_answer text YES -dev_question_system_config created_at timestamp NO -dev_question_system_config updated_at timestamp NO -exa_customers id bigint NO -exa_customers created_at datetime YES -exa_customers updated_at datetime YES -exa_customers deleted_at datetime YES -exa_customers customer_name varchar YES -exa_customers customer_phone_data varchar YES -exa_customers sys_user_id bigint YES -exa_customers sys_user_authority_id bigint YES -exa_file_chunks id bigint NO -exa_file_chunks created_at datetime YES -exa_file_chunks updated_at datetime YES -exa_file_chunks deleted_at datetime YES -exa_file_chunks exa_file_id bigint YES -exa_file_chunks file_chunk_number bigint YES -exa_file_chunks file_chunk_path varchar YES -exa_file_upload_and_downloads id bigint NO -exa_file_upload_and_downloads name varchar YES -exa_file_upload_and_downloads url varchar YES -exa_file_upload_and_downloads tag varchar YES -exa_file_upload_and_downloads key varchar YES -exa_file_upload_and_downloads created_at datetime YES -exa_file_upload_and_downloads updated_at datetime YES -exa_file_upload_and_downloads deleted_at datetime YES -exa_files id bigint NO -exa_files created_at datetime YES -exa_files updated_at datetime YES -exa_files deleted_at datetime YES -exa_files file_name varchar YES -exa_files file_md5 varchar YES -exa_files file_path varchar YES -exa_files chunk_total bigint YES -exa_files is_finish tinyint YES -jwt_blacklists id bigint NO -jwt_blacklists created_at datetime YES -jwt_blacklists updated_at datetime YES -jwt_blacklists deleted_at datetime YES -jwt_blacklists jwt text YES -lesson_config_info id bigint NO -lesson_config_info chapter_id bigint YES -lesson_config_info lock_status tinyint YES -lesson_config_info review_info text YES -lesson_config_info created_at datetime YES -lesson_config_info updated_at datetime YES -lesson_config_info deleted_at datetime YES -llm_words_mapping group text YES -llm_words_mapping wordsMapping text YES -llm_words_mapping updated_at datetime YES -middle_interaction_component id bigint NO -middle_interaction_component c_type varchar YES -middle_interaction_component c_id varchar YES -middle_interaction_component title varchar YES -middle_interaction_component component_config text YES -middle_interaction_component notice_component_config text YES -middle_interaction_component err_msg text YES -middle_interaction_component audio_list text YES -middle_interaction_component text_analysis text YES -middle_interaction_component related_path text YES -middle_interaction_component read_status tinyint YES -middle_interaction_component config_status tinyint YES -middle_interaction_component lock_status tinyint YES -middle_interaction_component kp_relation_info text YES -middle_interaction_component created_at datetime YES -middle_interaction_component updated_at datetime YES -middle_interaction_component deleted_at datetime YES -prompt_templates id int NO -prompt_templates user_id varchar NO -prompt_templates user_name varchar YES -prompt_templates template_name varchar NO -prompt_templates prompt_content text NO -prompt_templates description text YES -prompt_templates model_name varchar YES -prompt_templates model_params json YES -prompt_templates is_public tinyint YES -prompt_templates usage_count int YES -prompt_templates hidden_count int YES -prompt_templates created_at timestamp NO -prompt_templates updated_at timestamp NO -prompt_templates public_at timestamp YES -sys_apis id bigint NO -sys_apis created_at datetime YES -sys_apis updated_at datetime YES -sys_apis deleted_at datetime YES -sys_apis path varchar YES -sys_apis description varchar YES -sys_apis api_group varchar YES -sys_apis method varchar YES -sys_authorities created_at datetime YES -sys_authorities updated_at datetime YES -sys_authorities deleted_at datetime YES -sys_authorities authority_id bigint NO -sys_authorities authority_name varchar YES -sys_authorities parent_id bigint YES -sys_authorities default_router varchar YES -sys_authority_btns authority_id bigint YES -sys_authority_btns sys_menu_id bigint YES -sys_authority_btns sys_base_menu_btn_id bigint YES -sys_authority_menus sys_base_menu_id bigint NO -sys_authority_menus sys_authority_authority_id bigint NO -sys_auto_code_histories id bigint NO -sys_auto_code_histories created_at datetime YES -sys_auto_code_histories updated_at datetime YES -sys_auto_code_histories deleted_at datetime YES -sys_auto_code_histories package varchar YES -sys_auto_code_histories business_db varchar YES -sys_auto_code_histories table_name varchar YES -sys_auto_code_histories request_meta text YES -sys_auto_code_histories auto_code_path text YES -sys_auto_code_histories injection_meta text YES -sys_auto_code_histories struct_name varchar YES -sys_auto_code_histories struct_cn_name varchar YES -sys_auto_code_histories api_ids varchar YES -sys_auto_code_histories flag bigint YES -sys_auto_codes id bigint NO -sys_auto_codes created_at datetime YES -sys_auto_codes updated_at datetime YES -sys_auto_codes deleted_at datetime YES -sys_auto_codes package_name varchar YES -sys_auto_codes label varchar YES -sys_auto_codes desc varchar YES -sys_base_menu_btns id bigint NO -sys_base_menu_btns created_at datetime YES -sys_base_menu_btns updated_at datetime YES -sys_base_menu_btns deleted_at datetime YES -sys_base_menu_btns name varchar YES -sys_base_menu_btns desc varchar YES -sys_base_menu_btns sys_base_menu_id bigint YES -sys_base_menu_parameters id bigint NO -sys_base_menu_parameters created_at datetime YES -sys_base_menu_parameters updated_at datetime YES -sys_base_menu_parameters deleted_at datetime YES -sys_base_menu_parameters sys_base_menu_id bigint YES -sys_base_menu_parameters type varchar YES -sys_base_menu_parameters key varchar YES -sys_base_menu_parameters value varchar YES -sys_base_menus id bigint NO -sys_base_menus created_at datetime YES -sys_base_menus updated_at datetime YES -sys_base_menus deleted_at datetime YES -sys_base_menus menu_level bigint YES -sys_base_menus parent_id varchar YES -sys_base_menus path varchar YES -sys_base_menus name varchar YES -sys_base_menus hidden tinyint YES -sys_base_menus component varchar YES -sys_base_menus sort bigint YES -sys_base_menus active_name varchar YES -sys_base_menus keep_alive tinyint YES -sys_base_menus default_menu tinyint YES -sys_base_menus title varchar YES -sys_base_menus icon varchar YES -sys_base_menus close_tab tinyint YES -sys_chat_gpt_options sk varchar YES -sys_data_authority_id sys_authority_authority_id bigint NO -sys_data_authority_id data_authority_id_authority_id bigint NO -sys_dictionaries id bigint NO -sys_dictionaries created_at datetime YES -sys_dictionaries updated_at datetime YES -sys_dictionaries deleted_at datetime YES -sys_dictionaries name varchar YES -sys_dictionaries type varchar YES -sys_dictionaries status tinyint YES -sys_dictionaries desc varchar YES -sys_dictionary_details id bigint NO -sys_dictionary_details created_at datetime YES -sys_dictionary_details updated_at datetime YES -sys_dictionary_details deleted_at datetime YES -sys_dictionary_details label varchar YES -sys_dictionary_details value bigint YES -sys_dictionary_details extend varchar YES -sys_dictionary_details status tinyint YES -sys_dictionary_details sort bigint YES -sys_dictionary_details sys_dictionary_id bigint YES -sys_operation_records id bigint NO -sys_operation_records updated_at datetime YES -sys_operation_records nick_name varchar YES -sys_operation_records body text YES -sys_operation_records path varchar YES -sys_operation_records ip varchar YES -sys_operation_records method varchar YES -sys_operation_records status bigint YES -sys_operation_records latency bigint YES -sys_operation_records agent varchar YES -sys_operation_records error_message varchar YES -sys_operation_records resp text YES -sys_operation_records user_id bigint YES -sys_operation_records deleted_at datetime YES -sys_operation_records created_at datetime YES -sys_user_authority sys_user_id bigint NO -sys_user_authority sys_authority_authority_id bigint NO -sys_users id bigint NO -sys_users uuid varchar YES -sys_users username varchar YES -sys_users password varchar YES -sys_users nick_name varchar YES -sys_users authority_id bigint YES -sys_users side_mode varchar YES -sys_users header_img varchar YES -sys_users base_color varchar YES -sys_users active_color varchar YES -sys_users phone varchar YES -sys_users email varchar YES -sys_users enable bigint YES -sys_users created_at datetime YES -sys_users updated_at datetime YES -sys_users deleted_at datetime YES -unit_challenge_question id bigint NO -unit_challenge_question category varchar YES -unit_challenge_question type varchar YES -unit_challenge_question question_set_id varchar YES -unit_challenge_question question_content text YES -unit_challenge_question notice_content text YES -unit_challenge_question err_msg text YES -unit_challenge_question related_path text YES -unit_challenge_question material_list text YES -unit_challenge_question created_at datetime YES -unit_challenge_question updated_at datetime YES -unit_challenge_question deleted_at datetime YES -unit_chapter_text_parse id bigint NO -unit_chapter_text_parse chapter_id bigint NO -unit_chapter_text_parse text_parse mediumtext YES -unit_chapter_text_parse created_at datetime YES -unit_chapter_text_parse updated_at datetime YES -unit_chapter_text_parse deleted_at datetime YES -unity_chapter_section id bigint NO -unity_chapter_section chapter_id bigint NO -unity_chapter_section section_id bigint NO -unity_chapter_section section_target varchar NO -unity_chapter_section start_section text YES -unity_chapter_section end_section text YES -unity_chapter_section component_config text YES -unity_chapter_section pre_cartoon_file varchar YES -unity_chapter_section pre_cartoon_desc varchar YES -unity_chapter_section post_cartoon_file varchar YES -unity_chapter_section post_cartoon_desc varchar YES -unity_chapter_section index int NO -unity_chapter_section status int NO -unity_chapter_section mission_id int NO -unity_chapter_section mission_name varchar NO -unity_chapter_section created_at datetime YES -unity_chapter_section updated_at datetime YES -unity_chapter_section deleted_at datetime YES -user_template_preferences id int NO -user_template_preferences user_id varchar NO -user_template_preferences template_id int NO -user_template_preferences is_hidden tinyint YES -user_template_preferences is_favorite tinyint YES -user_template_preferences created_at timestamp NO -user_template_preferences updated_at timestamp NO -vala_activity id int NO -vala_activity code varchar NO -vala_activity name varchar NO -vala_activity stime bigint NO -vala_activity etime bigint NO -vala_activity is_online tinyint NO -vala_activity times int NO -vala_activity scope enum NO -vala_activity config text YES -vala_activity created_at datetime YES -vala_activity updated_at datetime YES -vala_activity deleted_at datetime YES -vala_activity_condition id bigint NO -vala_activity_condition activity_id int NO -vala_activity_condition condition varchar NO -vala_activity_condition created_at datetime YES -vala_app_account id bigint NO -vala_app_account tel varchar YES -vala_app_account tel_encrypt varchar YES -vala_app_account name varchar YES -vala_app_account id_card varchar YES -vala_app_account status tinyint NO -vala_app_account pay_status tinyint YES -vala_app_account login_times int YES -vala_app_account remark text YES -vala_app_account key_from varchar NO -vala_app_account download_channel varchar YES -vala_app_account created_at datetime YES -vala_app_account updated_at datetime YES -vala_app_account deleted_at datetime YES -vala_app_character id bigint NO -vala_app_character account_id bigint YES -vala_app_character nickname varchar YES -vala_app_character gender tinyint YES -vala_app_character birthday varchar YES -vala_app_character avatar text YES -vala_app_character spine_name varchar YES -vala_app_character latest_login datetime YES -vala_app_character reputation char YES -vala_app_character robots_cnt int YES -vala_app_character head_image varchar YES -vala_app_character status tinyint YES -vala_app_character purchase_season_package text YES -vala_app_character total_sp_point int YES -vala_app_character balance_sp_point int YES -vala_app_character point int YES -vala_app_character pay_status tinyint YES -vala_app_character created_at datetime YES -vala_app_character updated_at datetime YES -vala_app_character deleted_at datetime YES -vala_app_character mood int NO -vala_app_character mood_start_time bigint NO -vala_app_log_upload id bigint NO -vala_app_log_upload name varchar YES -vala_app_log_upload url varchar YES -vala_app_log_upload created_at datetime YES -vala_app_log_upload updated_at datetime YES -vala_app_log_upload deleted_at datetime YES -vala_applet_user id bigint NO -vala_applet_user tel varchar YES -vala_applet_user open_id varchar YES -vala_applet_user session_key varchar YES -vala_applet_user union_id varchar YES -vala_applet_user status tinyint NO -vala_applet_user created_at datetime YES -vala_applet_user updated_at datetime YES -vala_applet_user deleted_at datetime YES -vala_area id bigint NO -vala_area cn_name varchar NO -vala_area en_name varchar NO -vala_area desc text YES -vala_area location_num int YES -vala_area is_hide tinyint NO -vala_area created_at datetime YES -vala_area updated_at datetime YES -vala_area deleted_at datetime YES -vala_area_location id bigint NO -vala_area_location location_id int NO -vala_area_location area_id bigint NO -vala_area_location created_at datetime YES -vala_area_location updated_at datetime YES -vala_area_location deleted_at datetime YES -vala_card_package id bigint NO -vala_card_package name varchar YES -vala_card_package icon_name varchar YES -vala_card_package card_face_name varchar YES -vala_card_package card_back_name varchar YES -vala_card_package card_config text YES -vala_card_package relation_story_id int YES -vala_card_package created_at datetime YES -vala_card_package updated_at datetime YES -vala_card_package deleted_at datetime YES -vala_card_plot id bigint NO -vala_card_plot cn_name varchar YES -vala_card_plot en_name varchar YES -vala_card_plot plot_desc varchar YES -vala_card_plot plot_sentence text YES -vala_card_plot card_face_name varchar YES -vala_card_plot card_back_name varchar YES -vala_card_plot white_plot_card_config text YES -vala_card_plot flash_plot_card_config text YES -vala_card_plot relation_story_id int YES -vala_card_plot created_at datetime YES -vala_card_plot updated_at datetime YES -vala_card_plot deleted_at datetime YES -vala_card_plot_new id bigint NO -vala_card_plot_new cn_name varchar YES -vala_card_plot_new en_name varchar YES -vala_card_plot_new rarity char YES -vala_card_plot_new base_card_main_pic varchar YES -vala_card_plot_new advanced_card_main_pic varchar YES -vala_card_plot_new plot_desc varchar YES -vala_card_plot_new detail_title varchar YES -vala_card_plot_new detail_desc text YES -vala_card_plot_new relation_story_id int NO -vala_card_plot_new relation_chapter_id int NO -vala_card_plot_new created_at datetime YES -vala_card_plot_new updated_at datetime YES -vala_card_plot_new deleted_at datetime YES -vala_card_use_statistics id bigint NO -vala_card_use_statistics user_id int NO -vala_card_use_statistics card_type tinyint YES -vala_card_use_statistics card_id int YES -vala_card_use_statistics count_num int NO -vala_card_use_statistics created_at datetime YES -vala_card_use_statistics updated_at datetime YES -vala_card_use_statistics deleted_at datetime YES -vala_chat id bigint NO -vala_chat user_id bigint NO -vala_chat mod_id bigint NO -vala_chat npc_id bigint NO -vala_chat chat_id varchar NO -vala_chat msg varchar NO -vala_chat assistant text YES -vala_chat created_at datetime YES -vala_chat updated_at datetime YES -vala_chat deleted_at datetime YES -vala_chatbot id bigint NO -vala_chatbot type varchar NO -vala_chatbot ver varchar NO -vala_chatbot title varchar NO -vala_chatbot desc varchar NO -vala_chatbot config text NO -vala_chatbot created_at datetime YES -vala_chatbot updated_at datetime YES -vala_chatbot deleted_at datetime YES -vala_chatbot source_type varchar NO -vala_component_chat_config id bigint NO -vala_component_chat_config component_id bigint YES -vala_component_chat_config title varchar NO -vala_component_chat_config type varchar YES -vala_component_chat_config desc varchar NO -vala_component_chat_config config text NO -vala_component_chat_config created_at datetime YES -vala_component_chat_config updated_at datetime YES -vala_component_chat_config deleted_at datetime YES -vala_friend_relation id int NO -vala_friend_relation resource_id int NO -vala_friend_relation relation_type tinyint YES -vala_friend_relation chapter_id int NO -vala_friend_relation component_id int NO -vala_friend_relation interact_times int NO -vala_friend_relation favor_level int NO -vala_friend_relation content_show varchar YES -vala_friend_relation created_at datetime YES -vala_friend_relation updated_at datetime YES -vala_friend_relation deleted_at datetime YES -vala_game_chapter id bigint NO -vala_game_chapter game_id bigint YES -vala_game_chapter season_package_id bigint YES -vala_game_chapter en_name varchar NO -vala_game_chapter cn_name varchar NO -vala_game_chapter scene_id varchar NO -vala_game_chapter lesson_type tinyint YES -vala_game_chapter event_id int YES -vala_game_chapter desc text NO -vala_game_chapter index smallint YES -vala_game_chapter learn_config text YES -vala_game_chapter scene_path text YES -vala_game_chapter self_knowledge_card_config text YES -vala_game_chapter other_knowledge_card_config text YES -vala_game_chapter review_knowledge_card_config text YES -vala_game_chapter chapter_extension_config text YES -vala_game_chapter created_at datetime YES -vala_game_chapter updated_at datetime YES -vala_game_chapter deleted_at datetime YES -vala_game_chapter_section id bigint NO -vala_game_chapter_section chapter_id bigint YES -vala_game_chapter_section section_target varchar YES -vala_game_chapter_section start_section text YES -vala_game_chapter_section end_section text YES -vala_game_chapter_section pre_cartoon_file varchar YES -vala_game_chapter_section pre_cartoon_desc varchar YES -vala_game_chapter_section post_cartoon_file varchar YES -vala_game_chapter_section post_cartoon_desc varchar YES -vala_game_chapter_section special_event_type tinyint YES -vala_game_chapter_section special_event_en_content text YES -vala_game_chapter_section special_event_cn_content text YES -vala_game_chapter_section standard_event_type tinyint YES -vala_game_chapter_section standard_event_en_content text YES -vala_game_chapter_section standard_event_cn_content text YES -vala_game_chapter_section story_remind_type tinyint YES -vala_game_chapter_section story_remind_en_content text YES -vala_game_chapter_section story_remind_cn_content text YES -vala_game_chapter_section component_config text YES -vala_game_chapter_section mission_id int YES -vala_game_chapter_section mission_name varchar YES -vala_game_chapter_section index int YES -vala_game_chapter_section created_at datetime YES -vala_game_chapter_section updated_at datetime YES -vala_game_chapter_section deleted_at datetime YES -vala_game_component_brain_config id bigint NO -vala_game_component_brain_config brain_config text YES -vala_game_component_brain_config created_at datetime YES -vala_game_component_brain_config updated_at datetime YES -vala_game_component_brain_config deleted_at datetime YES -vala_game_info id bigint NO -vala_game_info en_name varchar NO -vala_game_info cn_name varchar NO -vala_game_info game_code varchar NO -vala_game_info game_category tinyint NO -vala_game_info season_package_id bigint YES -vala_game_info level_type tinyint NO -vala_game_info desc text NO -vala_game_info is_hide tinyint NO -vala_game_info card_package_id text YES -vala_game_info card_plot_id text YES -vala_game_info index int YES -vala_game_info plot_card_config text YES -vala_game_info created_at datetime YES -vala_game_info updated_at datetime YES -vala_game_info deleted_at datetime YES -vala_game_season_package id bigint NO -vala_game_season_package cn_name varchar YES -vala_game_season_package en_name varchar YES -vala_game_season_package season_of_year int NO -vala_game_season_package season_of_quarter int NO -vala_game_season_package season_cn_desc text YES -vala_game_season_package season_en_desc text YES -vala_game_season_package is_hide tinyint YES -vala_game_season_package index int YES -vala_game_season_package level char YES -vala_game_season_package created_at datetime YES -vala_game_season_package updated_at datetime YES -vala_game_season_package deleted_at datetime YES -vala_invitation_code id bigint NO -vala_invitation_code code varchar YES -vala_invitation_code owner varchar YES -vala_invitation_code status tinyint NO -vala_invitation_code created_at datetime YES -vala_invitation_code updated_at datetime YES -vala_invitation_code deleted_at datetime YES -vala_invitation_code_log id bigint NO -vala_invitation_code_log code_id bigint NO -vala_invitation_code_log ip_address varchar YES -vala_invitation_code_log city varchar YES -vala_invitation_code_log created_at datetime YES -vala_invitation_code_log updated_at datetime YES -vala_invitation_code_log deleted_at timestamp YES -vala_km id bigint NO -vala_km type varchar YES -vala_km km_id varchar YES -vala_km title varchar YES -vala_km bg_img varchar YES -vala_km kp_set text YES -vala_km sum_explain text YES -vala_km notice_km_content text YES -vala_km err_msg text YES -vala_km material_list text YES -vala_km created_at datetime YES -vala_km updated_at datetime YES -vala_km deleted_at datetime YES -vala_knowledge_module id bigint NO -vala_knowledge_module cn_title varchar NO -vala_knowledge_module en_title varchar NO -vala_knowledge_module module_type tinyint YES -vala_knowledge_module description text NO -vala_knowledge_module word_knowledge_point text YES -vala_knowledge_module phoneme_knowledge_point text YES -vala_knowledge_module grammar_knowledge_point text YES -vala_knowledge_module sentence_knowledge_point text YES -vala_knowledge_module dialogue_knowledge_point text YES -vala_knowledge_module relation_test_question text YES -vala_knowledge_module created_at datetime YES -vala_knowledge_module updated_at datetime YES -vala_knowledge_module deleted_at datetime YES -vala_knowledge_point id bigint NO -vala_knowledge_point title varchar YES -vala_knowledge_point knowledge_point_type tinyint NO -vala_knowledge_point knowledge_point_content text YES -vala_knowledge_point relation_knowledge_point text YES -vala_knowledge_point created_at datetime YES -vala_knowledge_point updated_at datetime YES -vala_knowledge_point deleted_at datetime YES -vala_kp id bigint NO -vala_kp type varchar YES -vala_kp kp_id varchar YES -vala_kp title varchar YES -vala_kp ipa varchar YES -vala_kp vala_level varchar YES -vala_kp cambridge_level varchar YES -vala_kp cefr_level varchar YES -vala_kp nc_level varchar YES -vala_kp en_desc varchar YES -vala_kp scene varchar YES -vala_kp knowledge_point_content text YES -vala_kp notice_kp_content text YES -vala_kp err_msg text YES -vala_kp related_path text YES -vala_kp created_at datetime YES -vala_kp updated_at datetime YES -vala_kp deleted_at datetime YES -vala_kp_card id bigint NO -vala_kp_card kp_id varchar YES -vala_kp_card category varchar YES -vala_kp_card exp_type varchar YES -vala_kp_card card_content text YES -vala_kp_card question text YES -vala_kp_card notice_content text YES -vala_kp_card err_msg varchar YES -vala_kp_card created_at datetime YES -vala_kp_card updated_at datetime YES -vala_kp_card deleted_at datetime YES -vala_kp_question id bigint NO -vala_kp_question kp_id varchar YES -vala_kp_question category varchar YES -vala_kp_question skill varchar YES -vala_kp_question type varchar YES -vala_kp_question question text YES -vala_kp_question created_at datetime YES -vala_kp_question updated_at datetime YES -vala_kp_question deleted_at datetime YES -vala_learn_ability id bigint NO -vala_learn_ability en_name varchar NO -vala_learn_ability cn_name varchar NO -vala_learn_ability season_package_id bigint YES -vala_learn_ability game_id bigint YES -vala_learn_ability chapter_id bigint YES -vala_learn_ability ability_type tinyint NO -vala_learn_ability sub_component_type tinyint NO -vala_learn_ability component_id bigint YES -vala_learn_ability created_at datetime YES -vala_learn_ability updated_at datetime YES -vala_learn_ability deleted_at datetime YES -vala_location id bigint NO -vala_location location_id int NO -vala_location cn_name varchar NO -vala_location en_name varchar NO -vala_location desc text YES -vala_location area_id bigint NO -vala_location is_related tinyint NO -vala_location location_size tinyint NO -vala_location actual_size varchar NO -vala_location resource_url varchar NO -vala_location created_at datetime YES -vala_location updated_at datetime YES -vala_location deleted_at datetime YES -vala_minimaxi_tts id bigint NO -vala_minimaxi_tts local_name varchar NO -vala_minimaxi_tts voice_id varchar NO -vala_minimaxi_tts gender varchar YES -vala_minimaxi_tts gender_desc varchar YES -vala_minimaxi_tts created_at datetime YES -vala_minimaxi_tts updated_at datetime YES -vala_minimaxi_tts deleted_at datetime YES -vala_picture_text_material id bigint NO -vala_picture_text_material material_name varchar YES -vala_picture_text_material material_type varchar YES -vala_picture_text_material material_content text YES -vala_picture_text_material created_at datetime YES -vala_picture_text_material updated_at datetime YES -vala_picture_text_material deleted_at datetime YES -vala_relation_level id int NO -vala_relation_level level int NO -vala_relation_level upgrade_need_value int NO -vala_relation_level total_require_value int NO -vala_relation_level favorability_lower_limit int NO -vala_relation_level favorability_upper_limit int NO -vala_relation_level created_at datetime YES -vala_relation_level updated_at datetime YES -vala_relation_level deleted_at datetime YES -vala_resource_base id bigint NO -vala_resource_base cn_name varchar NO -vala_resource_base en_name varchar NO -vala_resource_base type varchar NO -vala_resource_base voice_list text YES -vala_resource_base voice_id varchar YES -vala_resource_base is_hide tinyint YES -vala_resource_base gender tinyint YES -vala_resource_base age int YES -vala_resource_base half_body varchar YES -vala_resource_base thumbnail varchar YES -vala_resource_base profile text YES -vala_resource_base spine_file_name varchar YES -vala_resource_base is_friend tinyint YES -vala_resource_base nick_name varchar YES -vala_resource_base mbti varchar YES -vala_resource_base interest varchar YES -vala_resource_base birthday varchar YES -vala_resource_base tips varchar YES -vala_resource_base desc text YES -vala_resource_base stream_voice_id varchar YES -vala_resource_base created_at datetime YES -vala_resource_base updated_at datetime YES -vala_resource_base deleted_at datetime YES -vala_scene id bigint NO -vala_scene name varchar NO -vala_scene scene_id varchar NO -vala_scene location_id int NO -vala_scene desc text YES -vala_scene is_related tinyint YES -vala_scene created_at datetime YES -vala_scene updated_at datetime YES -vala_scene deleted_at datetime YES -vala_scene_resource_v3 id int NO -vala_scene_resource_v3 location_id int NO -vala_scene_resource_v3 scene_id varchar NO -vala_scene_resource_v3 resource_id int NO -vala_scene_resource_v3 created_at datetime YES -vala_scene_resource_v3 updated_at datetime YES -vala_scene_resource_v3 deleted_at timestamp YES -vala_seasonal_ticket id bigint NO -vala_seasonal_ticket unique_code varchar YES -vala_seasonal_ticket account_id bigint YES -vala_seasonal_ticket account_name varchar YES -vala_seasonal_ticket tel varchar YES -vala_seasonal_ticket give_time int YES -vala_seasonal_ticket expire_time int YES -vala_seasonal_ticket expire_type tinyint NO -vala_seasonal_ticket ticket_type tinyint NO -vala_seasonal_ticket used_time int YES -vala_seasonal_ticket season_package_id bigint YES -vala_seasonal_ticket season_package_name varchar YES -vala_seasonal_ticket status tinyint NO -vala_seasonal_ticket created_at datetime YES -vala_seasonal_ticket updated_at datetime YES -vala_seasonal_ticket deleted_at datetime YES -vala_tts_voice id bigint NO -vala_tts_voice name varchar NO -vala_tts_voice voice_id varchar NO -vala_tts_voice preview_url varchar YES -vala_tts_voice model_id varchar YES -vala_tts_voice type tinyint NO -vala_tts_voice created_at datetime YES -vala_tts_voice updated_at datetime YES -vala_tts_voice deleted_at datetime YES -vala_user_card_component_record id bigint NO -vala_user_card_component_record user_id bigint NO -vala_user_card_component_record game_id bigint YES -vala_user_card_component_record chapter_id bigint YES -vala_user_card_component_record component_type bigint YES -vala_user_card_component_record sub_component_type bigint YES -vala_user_card_component_record component_id bigint NO -vala_user_card_component_record card_content text YES -vala_user_card_component_record is_new tinyint NO -vala_user_card_component_record created_at datetime YES -vala_user_card_component_record updated_at datetime YES -vala_user_card_component_record deleted_at datetime YES -vala_user_card_plot_upgrade id bigint NO -vala_user_card_plot_upgrade user_id bigint NO -vala_user_card_plot_upgrade chapter_id bigint YES -vala_user_card_plot_upgrade plot_card_id bigint NO -vala_user_card_plot_upgrade is_have_upgrade bigint YES -vala_user_card_plot_upgrade white_card_status tinyint NO -vala_user_card_plot_upgrade flash_card_status tinyint NO -vala_user_card_plot_upgrade is_new tinyint YES -vala_user_card_plot_upgrade created_at datetime YES -vala_user_card_plot_upgrade updated_at datetime YES -vala_user_card_plot_upgrade deleted_at datetime YES -vala_user_chapter_finish_record id bigint NO -vala_user_chapter_finish_record user_id bigint NO -vala_user_chapter_finish_record game_id bigint YES -vala_user_chapter_finish_record chapter_id bigint NO -vala_user_chapter_finish_record created_at datetime YES -vala_user_chapter_finish_record updated_at datetime YES -vala_user_chapter_finish_record deleted_at datetime YES -vala_user_chat_log id bigint NO -vala_user_chat_log user_id bigint NO -vala_user_chat_log game_id bigint YES -vala_user_chat_log chapter_id bigint YES -vala_user_chat_log sub_component_type bigint YES -vala_user_chat_log component_id bigint NO -vala_user_chat_log npc_id bigint NO -vala_user_chat_log session_id varchar NO -vala_user_chat_log dialog_list text YES -vala_user_chat_log dialog_score text YES -vala_user_chat_log tencent_soe text YES -vala_user_chat_log chat_review_resp text YES -vala_user_chat_log cost_time int NO -vala_user_chat_log insert_time datetime YES -vala_user_chat_log log_type tinyint YES -vala_user_chat_log deal_status tinyint YES -vala_user_chat_log created_at datetime YES -vala_user_chat_log updated_at datetime YES -vala_user_chat_log deleted_at datetime YES -vala_user_drama_log id bigint NO -vala_user_drama_log user_id bigint NO -vala_user_drama_log game_id bigint YES -vala_user_drama_log chapter_id bigint YES -vala_user_drama_log component_id bigint YES -vala_user_drama_log npc_id bigint YES -vala_user_drama_log created_at datetime YES -vala_user_drama_log updated_at datetime YES -vala_user_drama_log deleted_at datetime YES -vala_user_friend_info id bigint NO -vala_user_friend_info user_id bigint NO -vala_user_friend_info npc_id bigint NO -vala_user_friend_info favorability_value int YES -vala_user_friend_info level int YES -vala_user_friend_info relation_type tinyint YES -vala_user_friend_info interact_times int YES -vala_user_friend_info story_id int YES -vala_user_friend_info created_at datetime YES -vala_user_friend_info updated_at datetime YES -vala_user_friend_info deleted_at datetime YES -vala_user_game_finish_record id bigint NO -vala_user_game_finish_record user_id bigint NO -vala_user_game_finish_record game_id bigint YES -vala_user_game_finish_record created_at datetime YES -vala_user_game_finish_record updated_at datetime YES -vala_user_game_finish_record deleted_at datetime YES -vala_user_interact_chat_log id bigint NO -vala_user_interact_chat_log user_id bigint NO -vala_user_interact_chat_log npc_id bigint NO -vala_user_interact_chat_log session_id varchar YES -vala_user_interact_chat_log type varchar NO -vala_user_interact_chat_log name varchar NO -vala_user_interact_chat_log text varchar NO -vala_user_interact_chat_log interactive_type text YES -vala_user_interact_chat_log interactive_selected tinyint YES -vala_user_interact_chat_log chat_time datetime YES -vala_user_interact_chat_log created_at datetime YES -vala_user_interact_chat_log updated_at datetime YES -vala_user_interact_chat_log deleted_at datetime YES -vala_user_interact_log id bigint NO -vala_user_interact_log user_id bigint NO -vala_user_interact_log component_type tinyint YES -vala_user_interact_log relation_type tinyint YES -vala_user_interact_log npc_id bigint NO -vala_user_interact_log session_id varchar NO -vala_user_interact_log summary text NO -vala_user_interact_log dialog_list text YES -vala_user_interact_log dialog_score text YES -vala_user_interact_log tencent_soe text YES -vala_user_interact_log chat_review_resp text YES -vala_user_interact_log cost_time int NO -vala_user_interact_log insert_time datetime YES -vala_user_interact_log created_at datetime YES -vala_user_interact_log updated_at datetime YES -vala_user_interact_log deleted_at datetime YES -vala_user_interact_log favor_value int YES -vala_user_learn_record id int NO -vala_user_learn_record user_id bigint NO -vala_user_learn_record game_id bigint NO -vala_user_learn_record chapter_id bigint NO -vala_user_learn_record score float NO -vala_user_learn_record score_level char YES -vala_user_learn_record unique_code varchar YES -vala_user_learn_record source_type tinyint YES -vala_user_learn_record created_at datetime YES -vala_user_learn_record updated_at datetime YES -vala_user_learn_record deleted_at timestamp YES -vala_user_long_term_memory id bigint NO -vala_user_long_term_memory user_id int YES -vala_user_long_term_memory npc_id int YES -vala_user_long_term_memory robot_id int YES -vala_user_long_term_memory config_id int YES -vala_user_long_term_memory session_id varchar YES -vala_user_long_term_memory log_id int YES -vala_user_long_term_memory source varchar YES -vala_user_long_term_memory source_type varchar YES -vala_user_long_term_memory memory_type varchar YES -vala_user_long_term_memory sub_type varchar YES -vala_user_long_term_memory user_content text YES -vala_user_long_term_memory system_content text YES -vala_user_long_term_memory created_at datetime YES -vala_user_long_term_memory updated_at datetime YES -vala_user_long_term_memory deleted_at datetime YES -vala_user_robot id int NO -vala_user_robot user_id bigint NO -vala_user_robot robot_id bigint NO -vala_user_robot robot_level tinyint YES -vala_user_robot created_at datetime YES -vala_user_robot updated_at datetime YES -vala_user_robot deleted_at timestamp YES -vala_user_word_statistics id bigint NO -vala_user_word_statistics user_id int NO -vala_user_word_statistics word_pattern_id varchar YES -vala_user_word_statistics count_num int NO -vala_user_word_statistics content_type char YES -vala_user_word_statistics source_type tinyint YES -vala_user_word_statistics created_at datetime YES -vala_user_word_statistics updated_at datetime YES -vala_user_word_statistics deleted_at datetime YES -vala_wx_chatbot id bigint NO -vala_wx_chatbot type varchar NO -vala_wx_chatbot component_id bigint YES -vala_wx_chatbot chatbot_id bigint YES -vala_wx_chatbot title varchar NO -vala_wx_chatbot desc text NO -vala_wx_chatbot desc_detail text NO -vala_wx_chatbot npc_id bigint NO -vala_wx_chatbot target varchar NO -vala_wx_chatbot index int YES -vala_wx_chatbot complete_story text YES -vala_wx_chatbot limit_type int YES -vala_wx_chatbot limit_data int YES -vala_wx_chatbot created_at datetime YES -vala_wx_chatbot updated_at datetime YES -vala_wx_chatbot deleted_at datetime YES -vala_wxchat_history id bigint NO -vala_wxchat_history user_id bigint YES -vala_wxchat_history wx_chatbot_id bigint YES -vala_wxchat_history chatbot_id bigint YES -vala_wxchat_history session_id varchar YES -vala_wxchat_history chat_content text YES -vala_wxchat_history question_result text YES -vala_wxchat_history status tinyint YES -vala_wxchat_history created_at datetime YES -vala_wxchat_history updated_at datetime YES -vala_wxchat_history deleted_at datetime YES -voice_info id bigint NO -voice_info voice_id varchar YES -voice_info elabs_id varchar YES -voice_info voice_name varchar YES -voice_info des varchar YES -voice_info gender varchar YES -voice_info mini_speed float YES -voice_info mini_volume float YES -voice_info mini_emotion varchar YES -voice_info elabs_speed float YES -voice_info default_platform varchar YES -voice_info created_at datetime YES -voice_info updated_at datetime YES -voice_info deleted_at datetime YES diff --git a/database/pg_online_full.txt b/database/pg_online_full.txt deleted file mode 100644 index e7f6dc2..0000000 --- a/database/pg_online_full.txt +++ /dev/null @@ -1,678 +0,0 @@ - table_name | column_name | data_type | is_nullable -----------------------------------------+-------------------------+--------------------------+------------- - account_activity_count | id | bigint | NO - account_activity_count | time_period | integer | YES - account_activity_count | counts | integer | YES - account_activity_count | stat_date | text | NO - account_activity_count | created_time | timestamp with time zone | NO - account_detail_info | id | bigint | NO - account_detail_info | account_id | integer | YES - account_detail_info | login_time | text | YES - account_detail_info | device | text | YES - account_detail_info | device_os | text | YES - account_detail_info | login_address | text | YES - account_detail_info | login_times | integer | YES - account_detail_info | created_time | timestamp with time zone | NO - account_detail_info | phone_login_times | integer | YES - account_device | id | bigint | NO - account_device | account_id | integer | YES - account_device | device | text | YES - account_device | created_time | timestamp with time zone | NO - account_login | id | bigint | NO - account_login | account_id | integer | YES - account_login | login_date | text | NO - account_login | created_time | timestamp with time zone | NO - account_login | status | integer | NO - account_paid_activity_count | id | integer | NO - account_paid_activity_count | time_period | integer | YES - account_paid_activity_count | counts | integer | YES - account_paid_activity_count | stat_date | text | NO - account_paid_activity_count | created_time | timestamp with time zone | NO - bi_level_unit_lesson | course_level | character varying | YES - bi_level_unit_lesson | course_season | character varying | YES - bi_level_unit_lesson | course_unit | character varying | YES - bi_level_unit_lesson | course_lesson | character varying | YES - bi_level_unit_lesson | id | integer | YES - bi_refund_order | id | bigint | NO - bi_refund_order | account_id | bigint | YES - bi_refund_order | out_trade_no | character varying | YES - bi_refund_order | trade_no | character varying | YES - bi_refund_order | refund_amount | character varying | YES - bi_refund_order | created_at | timestamp with time zone | YES - bi_refund_order | updated_at | timestamp with time zone | YES - bi_refund_order | deleted_at | timestamp with time zone | YES - bi_refund_order | refund_amount_int | integer | NO - bi_refund_order | reason | text | NO - bi_refund_order | sale_channel | integer | NO - bi_refund_order | status | integer | NO - bi_refund_order | is_admin | boolean | NO - bi_refund_order | channel_refund_id | bigint | NO - bi_refund_order | refund_ticket_ids | text | YES - bi_refund_order | refund_type | integer | NO - bi_refund_order | refund_method | integer | NO - bi_refund_order | after_sale_no | text | YES - bi_user_chapter_play_record_0 | id | bigint | NO - bi_user_chapter_play_record_0 | user_id | bigint | YES - bi_user_chapter_play_record_0 | chapter_id | bigint | YES - bi_user_chapter_play_record_0 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_0 | play_status | smallint | YES - bi_user_chapter_play_record_0 | story_id | bigint | YES - bi_user_chapter_play_record_0 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_0 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_0 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_0 | map_position | text | YES - bi_user_chapter_play_record_0 | dialog_info | text | YES - bi_user_chapter_play_record_0 | level | character varying | YES - bi_user_chapter_play_record_1 | id | bigint | NO - bi_user_chapter_play_record_1 | user_id | bigint | YES - bi_user_chapter_play_record_1 | chapter_id | bigint | YES - bi_user_chapter_play_record_1 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_1 | play_status | smallint | YES - bi_user_chapter_play_record_1 | story_id | bigint | YES - bi_user_chapter_play_record_1 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_1 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_1 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_1 | map_position | text | YES - bi_user_chapter_play_record_1 | dialog_info | text | YES - bi_user_chapter_play_record_1 | level | character varying | YES - bi_user_chapter_play_record_2 | id | bigint | NO - bi_user_chapter_play_record_2 | user_id | bigint | YES - bi_user_chapter_play_record_2 | chapter_id | bigint | YES - bi_user_chapter_play_record_2 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_2 | play_status | smallint | YES - bi_user_chapter_play_record_2 | story_id | bigint | YES - bi_user_chapter_play_record_2 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_2 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_2 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_2 | map_position | text | YES - bi_user_chapter_play_record_2 | dialog_info | text | YES - bi_user_chapter_play_record_2 | level | character varying | YES - bi_user_chapter_play_record_3 | id | bigint | NO - bi_user_chapter_play_record_3 | user_id | bigint | YES - bi_user_chapter_play_record_3 | chapter_id | bigint | YES - bi_user_chapter_play_record_3 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_3 | play_status | smallint | YES - bi_user_chapter_play_record_3 | story_id | bigint | YES - bi_user_chapter_play_record_3 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_3 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_3 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_3 | map_position | text | YES - bi_user_chapter_play_record_3 | dialog_info | text | YES - bi_user_chapter_play_record_3 | level | character varying | YES - bi_user_chapter_play_record_4 | id | bigint | NO - bi_user_chapter_play_record_4 | user_id | bigint | YES - bi_user_chapter_play_record_4 | chapter_id | bigint | YES - bi_user_chapter_play_record_4 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_4 | play_status | smallint | YES - bi_user_chapter_play_record_4 | story_id | bigint | YES - bi_user_chapter_play_record_4 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_4 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_4 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_4 | map_position | text | YES - bi_user_chapter_play_record_4 | dialog_info | text | YES - bi_user_chapter_play_record_4 | level | character varying | YES - bi_user_chapter_play_record_5 | id | bigint | NO - bi_user_chapter_play_record_5 | user_id | bigint | YES - bi_user_chapter_play_record_5 | chapter_id | bigint | YES - bi_user_chapter_play_record_5 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_5 | play_status | smallint | YES - bi_user_chapter_play_record_5 | story_id | bigint | YES - bi_user_chapter_play_record_5 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_5 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_5 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_5 | map_position | text | YES - bi_user_chapter_play_record_5 | dialog_info | text | YES - bi_user_chapter_play_record_5 | level | character varying | YES - bi_user_chapter_play_record_6 | id | bigint | NO - bi_user_chapter_play_record_6 | user_id | bigint | YES - bi_user_chapter_play_record_6 | chapter_id | bigint | YES - bi_user_chapter_play_record_6 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_6 | play_status | smallint | YES - bi_user_chapter_play_record_6 | story_id | bigint | YES - bi_user_chapter_play_record_6 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_6 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_6 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_6 | map_position | text | YES - bi_user_chapter_play_record_6 | dialog_info | text | YES - bi_user_chapter_play_record_6 | level | character varying | YES - bi_user_chapter_play_record_7 | id | bigint | NO - bi_user_chapter_play_record_7 | user_id | bigint | YES - bi_user_chapter_play_record_7 | chapter_id | bigint | YES - bi_user_chapter_play_record_7 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_7 | play_status | smallint | YES - bi_user_chapter_play_record_7 | story_id | bigint | YES - bi_user_chapter_play_record_7 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_7 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_7 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_7 | map_position | text | YES - bi_user_chapter_play_record_7 | dialog_info | text | YES - bi_user_chapter_play_record_7 | level | character varying | YES - bi_user_component_play_record_0 | id | bigint | NO - bi_user_component_play_record_0 | user_id | bigint | YES - bi_user_component_play_record_0 | chapter_id | bigint | YES - bi_user_component_play_record_0 | chapter_unique_id | character varying | YES - bi_user_component_play_record_0 | component_id | bigint | YES - bi_user_component_play_record_0 | sub_component_type | smallint | YES - bi_user_component_play_record_0 | component_type | smallint | YES - bi_user_component_play_record_0 | comp_type | character varying | YES - bi_user_component_play_record_0 | play_status | smallint | YES - bi_user_component_play_record_0 | pass_time | integer | YES - bi_user_component_play_record_0 | interval_time | integer | YES - bi_user_component_play_record_0 | read_word_count | integer | YES - bi_user_component_play_record_0 | speak_count | integer | YES - bi_user_component_play_record_0 | listen_sentence_count | integer | YES - bi_user_component_play_record_0 | write_word_count | integer | YES - bi_user_component_play_record_0 | created_at | timestamp with time zone | YES - bi_user_component_play_record_0 | updated_at | timestamp with time zone | YES - bi_user_component_play_record_0 | deleted_at | timestamp with time zone | YES - bi_user_component_play_record_0 | section_id | bigint | YES - bi_user_component_play_record_0 | component_unique_code | character varying | YES - bi_user_component_play_record_0 | play_result | character varying | YES - bi_user_component_play_record_0 | finish_question_count | integer | YES - bi_user_component_play_record_0 | skill_points | integer | YES - bi_user_component_play_record_0 | c_type | character varying | YES - bi_user_component_play_record_0 | c_id | character varying | YES - bi_user_component_play_record_0 | user_behavior_info | text | YES - bi_user_component_play_record_0 | behavior_analysis_info | text | YES - bi_user_component_play_record_0 | session_id | character varying | YES - bi_user_component_play_record_0 | listen_time | integer | YES - bi_user_component_play_record_0 | dialog_list | text | YES - bi_user_component_play_record_0 | level | character varying | YES - bi_user_component_play_record_1 | id | bigint | NO - bi_user_component_play_record_1 | user_id | bigint | YES - bi_user_component_play_record_1 | chapter_id | bigint | YES - bi_user_component_play_record_1 | chapter_unique_id | character varying | YES - bi_user_component_play_record_1 | component_id | bigint | YES - bi_user_component_play_record_1 | sub_component_type | smallint | YES - bi_user_component_play_record_1 | component_type | smallint | YES - bi_user_component_play_record_1 | comp_type | character varying | YES - bi_user_component_play_record_1 | play_status | smallint | YES - bi_user_component_play_record_1 | pass_time | integer | YES - bi_user_component_play_record_1 | interval_time | integer | YES - bi_user_component_play_record_1 | read_word_count | integer | YES - bi_user_component_play_record_1 | speak_count | integer | YES - bi_user_component_play_record_1 | listen_sentence_count | integer | YES - bi_user_component_play_record_1 | write_word_count | integer | YES - bi_user_component_play_record_1 | created_at | timestamp with time zone | YES - bi_user_component_play_record_1 | updated_at | timestamp with time zone | YES - bi_user_component_play_record_1 | deleted_at | timestamp with time zone | YES - bi_user_component_play_record_1 | section_id | bigint | YES - bi_user_component_play_record_1 | component_unique_code | character varying | YES - bi_user_component_play_record_1 | play_result | character varying | YES - bi_user_component_play_record_1 | finish_question_count | integer | YES - bi_user_component_play_record_1 | skill_points | integer | YES - bi_user_component_play_record_1 | c_type | character varying | YES - bi_user_component_play_record_1 | c_id | character varying | YES - bi_user_component_play_record_1 | user_behavior_info | text | YES - bi_user_component_play_record_1 | behavior_analysis_info | text | YES - bi_user_component_play_record_1 | session_id | character varying | YES - bi_user_component_play_record_1 | listen_time | integer | YES - bi_user_component_play_record_1 | dialog_list | text | YES - bi_user_component_play_record_1 | level | character varying | YES - bi_user_component_play_record_2 | id | bigint | NO - bi_user_component_play_record_2 | user_id | bigint | YES - bi_user_component_play_record_2 | chapter_id | bigint | YES - bi_user_component_play_record_2 | chapter_unique_id | character varying | YES - bi_user_component_play_record_2 | component_id | bigint | YES - bi_user_component_play_record_2 | sub_component_type | smallint | YES - bi_user_component_play_record_2 | component_type | smallint | YES - bi_user_component_play_record_2 | comp_type | character varying | YES - bi_user_component_play_record_2 | play_status | smallint | YES - bi_user_component_play_record_2 | pass_time | integer | YES - bi_user_component_play_record_2 | interval_time | integer | YES - bi_user_component_play_record_2 | read_word_count | integer | YES - bi_user_component_play_record_2 | speak_count | integer | YES - bi_user_component_play_record_2 | listen_sentence_count | integer | YES - bi_user_component_play_record_2 | write_word_count | integer | YES - bi_user_component_play_record_2 | created_at | timestamp with time zone | YES - bi_user_component_play_record_2 | updated_at | timestamp with time zone | YES - bi_user_component_play_record_2 | deleted_at | timestamp with time zone | YES - bi_user_component_play_record_2 | section_id | bigint | YES - bi_user_component_play_record_2 | component_unique_code | character varying | YES - bi_user_component_play_record_2 | play_result | character varying | YES - bi_user_component_play_record_2 | finish_question_count | integer | YES - bi_user_component_play_record_2 | skill_points | integer | YES - bi_user_component_play_record_2 | c_type | character varying | YES - bi_user_component_play_record_2 | c_id | character varying | YES - bi_user_component_play_record_2 | user_behavior_info | text | YES - bi_user_component_play_record_2 | behavior_analysis_info | text | YES - bi_user_component_play_record_2 | session_id | character varying | YES - bi_user_component_play_record_2 | listen_time | integer | YES - bi_user_component_play_record_2 | dialog_list | text | YES - bi_user_component_play_record_2 | level | character varying | YES - bi_user_component_play_record_3 | id | bigint | NO - bi_user_component_play_record_3 | user_id | bigint | YES - bi_user_component_play_record_3 | chapter_id | bigint | YES - bi_user_component_play_record_3 | chapter_unique_id | character varying | YES - bi_user_component_play_record_3 | component_id | bigint | YES - bi_user_component_play_record_3 | sub_component_type | smallint | YES - bi_user_component_play_record_3 | component_type | smallint | YES - bi_user_component_play_record_3 | comp_type | character varying | YES - bi_user_component_play_record_3 | play_status | smallint | YES - bi_user_component_play_record_3 | pass_time | integer | YES - bi_user_component_play_record_3 | interval_time | integer | YES - bi_user_component_play_record_3 | read_word_count | integer | YES - bi_user_component_play_record_3 | speak_count | integer | YES - bi_user_component_play_record_3 | listen_sentence_count | integer | YES - bi_user_component_play_record_3 | write_word_count | integer | YES - bi_user_component_play_record_3 | created_at | timestamp with time zone | YES - bi_user_component_play_record_3 | updated_at | timestamp with time zone | YES - bi_user_component_play_record_3 | deleted_at | timestamp with time zone | YES - bi_user_component_play_record_3 | section_id | bigint | YES - bi_user_component_play_record_3 | component_unique_code | character varying | YES - bi_user_component_play_record_3 | play_result | character varying | YES - bi_user_component_play_record_3 | finish_question_count | integer | YES - bi_user_component_play_record_3 | skill_points | integer | YES - bi_user_component_play_record_3 | c_type | character varying | YES - bi_user_component_play_record_3 | c_id | character varying | YES - bi_user_component_play_record_3 | user_behavior_info | text | YES - bi_user_component_play_record_3 | behavior_analysis_info | text | YES - bi_user_component_play_record_3 | session_id | character varying | YES - bi_user_component_play_record_3 | listen_time | integer | YES - bi_user_component_play_record_3 | dialog_list | text | YES - bi_user_component_play_record_3 | level | character varying | YES - bi_user_component_play_record_4 | id | bigint | NO - bi_user_component_play_record_4 | user_id | bigint | YES - bi_user_component_play_record_4 | chapter_id | bigint | YES - bi_user_component_play_record_4 | chapter_unique_id | character varying | YES - bi_user_component_play_record_4 | component_id | bigint | YES - bi_user_component_play_record_4 | sub_component_type | smallint | YES - bi_user_component_play_record_4 | component_type | smallint | YES - bi_user_component_play_record_4 | comp_type | character varying | YES - bi_user_component_play_record_4 | play_status | smallint | YES - bi_user_component_play_record_4 | pass_time | integer | YES - bi_user_component_play_record_4 | interval_time | integer | YES - bi_user_component_play_record_4 | read_word_count | integer | YES - bi_user_component_play_record_4 | speak_count | integer | YES - bi_user_component_play_record_4 | listen_sentence_count | integer | YES - bi_user_component_play_record_4 | write_word_count | integer | YES - bi_user_component_play_record_4 | created_at | timestamp with time zone | YES - bi_user_component_play_record_4 | updated_at | timestamp with time zone | YES - bi_user_component_play_record_4 | deleted_at | timestamp with time zone | YES - bi_user_component_play_record_4 | section_id | bigint | YES - bi_user_component_play_record_4 | component_unique_code | character varying | YES - bi_user_component_play_record_4 | play_result | character varying | YES - bi_user_component_play_record_4 | finish_question_count | integer | YES - bi_user_component_play_record_4 | skill_points | integer | YES - bi_user_component_play_record_4 | c_type | character varying | YES - bi_user_component_play_record_4 | c_id | character varying | YES - bi_user_component_play_record_4 | user_behavior_info | text | YES - bi_user_component_play_record_4 | behavior_analysis_info | text | YES - bi_user_component_play_record_4 | session_id | character varying | YES - bi_user_component_play_record_4 | listen_time | integer | YES - bi_user_component_play_record_4 | dialog_list | text | YES - bi_user_component_play_record_4 | level | character varying | YES - bi_user_component_play_record_5 | id | bigint | NO - bi_user_component_play_record_5 | user_id | bigint | YES - bi_user_component_play_record_5 | chapter_id | bigint | YES - bi_user_component_play_record_5 | chapter_unique_id | character varying | YES - bi_user_component_play_record_5 | component_id | bigint | YES - bi_user_component_play_record_5 | sub_component_type | smallint | YES - bi_user_component_play_record_5 | component_type | smallint | YES - bi_user_component_play_record_5 | comp_type | character varying | YES - bi_user_component_play_record_5 | play_status | smallint | YES - bi_user_component_play_record_5 | pass_time | integer | YES - bi_user_component_play_record_5 | interval_time | integer | YES - bi_user_component_play_record_5 | read_word_count | integer | YES - bi_user_component_play_record_5 | speak_count | integer | YES - bi_user_component_play_record_5 | listen_sentence_count | integer | YES - bi_user_component_play_record_5 | write_word_count | integer | YES - bi_user_component_play_record_5 | created_at | timestamp with time zone | YES - bi_user_component_play_record_5 | updated_at | timestamp with time zone | YES - bi_user_component_play_record_5 | deleted_at | timestamp with time zone | YES - bi_user_component_play_record_5 | section_id | bigint | YES - bi_user_component_play_record_5 | component_unique_code | character varying | YES - bi_user_component_play_record_5 | play_result | character varying | YES - bi_user_component_play_record_5 | finish_question_count | integer | YES - bi_user_component_play_record_5 | skill_points | integer | YES - bi_user_component_play_record_5 | c_type | character varying | YES - bi_user_component_play_record_5 | c_id | character varying | YES - bi_user_component_play_record_5 | user_behavior_info | text | YES - bi_user_component_play_record_5 | behavior_analysis_info | text | YES - bi_user_component_play_record_5 | session_id | character varying | YES - bi_user_component_play_record_5 | listen_time | integer | YES - bi_user_component_play_record_5 | dialog_list | text | YES - bi_user_component_play_record_5 | level | character varying | YES - bi_user_component_play_record_6 | id | bigint | NO - bi_user_component_play_record_6 | user_id | bigint | YES - bi_user_component_play_record_6 | chapter_id | bigint | YES - bi_user_component_play_record_6 | chapter_unique_id | character varying | YES - bi_user_component_play_record_6 | component_id | bigint | YES - bi_user_component_play_record_6 | sub_component_type | smallint | YES - bi_user_component_play_record_6 | component_type | smallint | YES - bi_user_component_play_record_6 | comp_type | character varying | YES - bi_user_component_play_record_6 | play_status | smallint | YES - bi_user_component_play_record_6 | pass_time | integer | YES - bi_user_component_play_record_6 | interval_time | integer | YES - bi_user_component_play_record_6 | read_word_count | integer | YES - bi_user_component_play_record_6 | speak_count | integer | YES - bi_user_component_play_record_6 | listen_sentence_count | integer | YES - bi_user_component_play_record_6 | write_word_count | integer | YES - bi_user_component_play_record_6 | created_at | timestamp with time zone | YES - bi_user_component_play_record_6 | updated_at | timestamp with time zone | YES - bi_user_component_play_record_6 | deleted_at | timestamp with time zone | YES - bi_user_component_play_record_6 | section_id | bigint | YES - bi_user_component_play_record_6 | component_unique_code | character varying | YES - bi_user_component_play_record_6 | play_result | character varying | YES - bi_user_component_play_record_6 | finish_question_count | integer | YES - bi_user_component_play_record_6 | skill_points | integer | YES - bi_user_component_play_record_6 | c_type | character varying | YES - bi_user_component_play_record_6 | c_id | character varying | YES - bi_user_component_play_record_6 | user_behavior_info | text | YES - bi_user_component_play_record_6 | behavior_analysis_info | text | YES - bi_user_component_play_record_6 | session_id | character varying | YES - bi_user_component_play_record_6 | listen_time | integer | YES - bi_user_component_play_record_6 | dialog_list | text | YES - bi_user_component_play_record_6 | level | character varying | YES - bi_user_component_play_record_7 | id | bigint | NO - bi_user_component_play_record_7 | user_id | bigint | YES - bi_user_component_play_record_7 | chapter_id | bigint | YES - bi_user_component_play_record_7 | chapter_unique_id | character varying | YES - bi_user_component_play_record_7 | component_id | bigint | YES - bi_user_component_play_record_7 | sub_component_type | smallint | YES - bi_user_component_play_record_7 | component_type | smallint | YES - bi_user_component_play_record_7 | comp_type | character varying | YES - bi_user_component_play_record_7 | play_status | smallint | YES - bi_user_component_play_record_7 | pass_time | integer | YES - bi_user_component_play_record_7 | interval_time | integer | YES - bi_user_component_play_record_7 | read_word_count | integer | YES - bi_user_component_play_record_7 | speak_count | integer | YES - bi_user_component_play_record_7 | listen_sentence_count | integer | YES - bi_user_component_play_record_7 | write_word_count | integer | YES - bi_user_component_play_record_7 | created_at | timestamp with time zone | YES - bi_user_component_play_record_7 | updated_at | timestamp with time zone | YES - bi_user_component_play_record_7 | deleted_at | timestamp with time zone | YES - bi_user_component_play_record_7 | section_id | bigint | YES - bi_user_component_play_record_7 | component_unique_code | character varying | YES - bi_user_component_play_record_7 | play_result | character varying | YES - bi_user_component_play_record_7 | finish_question_count | integer | YES - bi_user_component_play_record_7 | skill_points | integer | YES - bi_user_component_play_record_7 | c_type | character varying | YES - bi_user_component_play_record_7 | c_id | character varying | YES - bi_user_component_play_record_7 | user_behavior_info | text | YES - bi_user_component_play_record_7 | behavior_analysis_info | text | YES - bi_user_component_play_record_7 | session_id | character varying | YES - bi_user_component_play_record_7 | listen_time | integer | YES - bi_user_component_play_record_7 | dialog_list | text | YES - bi_user_component_play_record_7 | level | character varying | YES - bi_user_course_detail | id | integer | NO - bi_user_course_detail | created_at | timestamp with time zone | YES - bi_user_course_detail | updated_at | timestamp with time zone | YES - bi_user_course_detail | account_id | integer | YES - bi_user_course_detail | user_id | integer | YES - bi_user_course_detail | course_level | text | YES - bi_user_course_detail | active_time | timestamp with time zone | YES - bi_user_course_detail | expire_time | timestamp with time zone | YES - bi_user_course_detail | latest_unit_index | integer | YES - bi_user_course_detail | latest_lesson_index | integer | YES - bi_user_course_detail | learn_duration | integer | YES - bi_user_course_detail | last_learn_time | timestamp with time zone | YES - bi_user_course_detail | deleted_at | timestamp with time zone | YES - bi_user_task_log | id | integer | NO - bi_user_task_log | user_id | integer | NO - bi_user_task_log | task_type | integer | NO - bi_user_task_log | task_id | integer | NO - bi_user_task_log | status | integer | NO - bi_user_task_log | start_time | bigint | NO - bi_user_task_log | updated_at | timestamp with time zone | YES - bi_user_task_log | created_at | timestamp with time zone | YES - bi_user_unit_challenge_question_result | id | bigint | NO - bi_user_unit_challenge_question_result | user_id | bigint | YES - bi_user_unit_challenge_question_result | story_id | bigint | YES - bi_user_unit_challenge_question_result | score | integer | YES - bi_user_unit_challenge_question_result | score_text | character varying | YES - bi_user_unit_challenge_question_result | question_list | text | YES - bi_user_unit_challenge_question_result | sp_value | integer | YES - bi_user_unit_challenge_question_result | created_at | timestamp with time zone | YES - bi_user_unit_challenge_question_result | updated_at | timestamp with time zone | YES - bi_user_unit_challenge_question_result | deleted_at | timestamp with time zone | YES - bi_user_unit_challenge_question_result | category | character varying | YES - bi_user_unit_challenge_question_result | exp | integer | YES - bi_user_unit_challenge_question_result | play_time | bigint | YES - bi_user_unit_challenge_question_result | read_word_count | bigint | YES - bi_user_unit_challenge_question_result | listen_time | bigint | YES - bi_user_unit_challenge_question_result | speak_count | bigint | YES - bi_user_unit_challenge_question_result | level | character varying | YES - bi_user_unit_review_question_result | id | bigint | NO - bi_user_unit_review_question_result | user_id | bigint | YES - bi_user_unit_review_question_result | story_id | bigint | YES - bi_user_unit_review_question_result | chapter_id | bigint | YES - bi_user_unit_review_question_result | unique_id | character varying | YES - bi_user_unit_review_question_result | score | integer | YES - bi_user_unit_review_question_result | score_text | character varying | YES - bi_user_unit_review_question_result | question_list | text | YES - bi_user_unit_review_question_result | sp_value | integer | YES - bi_user_unit_review_question_result | exp | integer | YES - bi_user_unit_review_question_result | created_at | timestamp with time zone | YES - bi_user_unit_review_question_result | updated_at | timestamp with time zone | YES - bi_user_unit_review_question_result | deleted_at | timestamp with time zone | YES - bi_user_unit_review_question_result | play_time | bigint | YES - bi_user_unit_review_question_result | level | character varying | YES - bi_user_unit_summary_km_result | id | bigint | NO - bi_user_unit_summary_km_result | user_id | bigint | YES - bi_user_unit_summary_km_result | story_id | bigint | YES - bi_user_unit_summary_km_result | km_id | character varying | YES - bi_user_unit_summary_km_result | km_type | character varying | YES - bi_user_unit_summary_km_result | score_text | character varying | YES - bi_user_unit_summary_km_result | sp_value | integer | YES - bi_user_unit_summary_km_result | created_at | timestamp with time zone | YES - bi_user_unit_summary_km_result | updated_at | timestamp with time zone | YES - bi_user_unit_summary_km_result | deleted_at | timestamp with time zone | YES - bi_user_unit_summary_km_result | play_time | bigint | YES - bi_user_unit_summary_km_result | sum_explain | text | YES - bi_user_unit_summary_km_result | level | character varying | YES - bi_vala_app_account | id | bigint | NO - bi_vala_app_account | tel | character varying | YES - bi_vala_app_account | tel_encrypt | character varying | YES - bi_vala_app_account | name | character varying | YES - bi_vala_app_account | id_card | character varying | YES - bi_vala_app_account | status | smallint | NO - bi_vala_app_account | pay_status | integer | YES - bi_vala_app_account | login_times | integer | YES - bi_vala_app_account | remark | text | YES - bi_vala_app_account | key_from | character varying | NO - bi_vala_app_account | created_at | timestamp with time zone | YES - bi_vala_app_account | updated_at | timestamp with time zone | YES - bi_vala_app_account | deleted_at | timestamp with time zone | YES - bi_vala_app_account | download_channel | text | YES - bi_vala_app_character | id | bigint | NO - bi_vala_app_character | account_id | bigint | YES - bi_vala_app_character | nickname | character varying | YES - bi_vala_app_character | gender | smallint | YES - bi_vala_app_character | birthday | character varying | YES - bi_vala_app_character | avatar | text | YES - bi_vala_app_character | spine_name | character varying | YES - bi_vala_app_character | latest_login | timestamp with time zone | YES - bi_vala_app_character | reputation | character | YES - bi_vala_app_character | robots_cnt | integer | YES - bi_vala_app_character | head_image | character varying | YES - bi_vala_app_character | status | smallint | YES - bi_vala_app_character | purchase_season_package | text | YES - bi_vala_app_character | total_sp_point | integer | YES - bi_vala_app_character | balance_sp_point | integer | YES - bi_vala_app_character | point | integer | YES - bi_vala_app_character | pay_status | smallint | YES - bi_vala_app_character | created_at | timestamp with time zone | YES - bi_vala_app_character | updated_at | timestamp with time zone | YES - bi_vala_app_character | deleted_at | timestamp with time zone | YES - bi_vala_app_character | mood | integer | YES - bi_vala_app_character | mood_start_time | bigint | YES - bi_vala_order | id | bigint | NO - bi_vala_order | account_id | bigint | YES - bi_vala_order | out_trade_no | character varying | YES - bi_vala_order | trade_no | character varying | YES - bi_vala_order | pay_amount | character varying | YES - bi_vala_order | goods_id | bigint | YES - bi_vala_order | order_status | integer | YES - bi_vala_order | order_status_comment | text | YES - bi_vala_order | is_used | integer | YES - bi_vala_order | sale_channel | integer | YES - bi_vala_order | created_at | timestamp with time zone | YES - bi_vala_order | updated_at | timestamp with time zone | YES - bi_vala_order | deleted_at | timestamp with time zone | YES - bi_vala_order | pay_channel | integer | YES - bi_vala_order | pay_amount_int | integer | YES - bi_vala_order | pay_success_date | timestamp with time zone | YES - bi_vala_order | quantity | integer | NO - bi_vala_order | goods_name | text | NO - bi_vala_order | goods_apply_package_ids | text | NO - bi_vala_order | key_from | text | NO - bi_vala_order | expire_days | integer | YES - data_sync_config | id | bigint | NO - data_sync_config | job_name | text | NO - data_sync_config | database_name | text | YES - data_sync_config | origin_table | text | YES - data_sync_config | dest_table | text | YES - data_sync_config | deal_time | timestamp with time zone | YES - data_sync_config | full_id | bigint | NO - data_sync_config | created_at | timestamp with time zone | NO - data_sync_config | updated_at | timestamp with time zone | YES - feishu_table_record | id | bigint | NO - feishu_table_record | data_table | text | NO - feishu_table_record | record_id | text | YES - feishu_table_record | created_at | timestamp with time zone | NO - growth_activity_behavior | id | integer | NO - growth_activity_behavior | es_id | character varying | NO - growth_activity_behavior | account_id | integer | YES - growth_activity_behavior | account_name | character varying | YES - growth_activity_behavior | activity | text | YES - growth_activity_behavior | behavior | character varying | YES - growth_activity_behavior | created_at | timestamp with time zone | YES - growth_activity_behavior | info | text | YES - growth_activity_behavior | source | character varying | YES - growth_activity_behavior | sub_behavior | character varying | YES - growth_activity_behavior | user_id | integer | YES - growth_activity_behavior | user_name | character varying | YES - user_behavior_0 | id | bigint | NO - user_behavior_0 | user_id | integer | YES - user_behavior_0 | behavior | text | YES - user_behavior_0 | behavior_time_date | text | NO - user_behavior_0 | created_time | timestamp with time zone | NO - user_behavior_1 | id | bigint | NO - user_behavior_1 | user_id | integer | YES - user_behavior_1 | behavior | text | YES - user_behavior_1 | behavior_time_date | text | NO - user_behavior_1 | created_time | timestamp with time zone | NO - user_behavior_10 | id | bigint | NO - user_behavior_10 | user_id | integer | YES - user_behavior_10 | behavior | text | YES - user_behavior_10 | behavior_time_date | text | NO - user_behavior_10 | created_time | timestamp with time zone | NO - user_behavior_11 | id | bigint | NO - user_behavior_11 | user_id | integer | YES - user_behavior_11 | behavior | text | YES - user_behavior_11 | behavior_time_date | text | NO - user_behavior_11 | created_time | timestamp with time zone | NO - user_behavior_12 | id | bigint | NO - user_behavior_12 | user_id | integer | YES - user_behavior_12 | behavior | text | YES - user_behavior_12 | behavior_time_date | text | NO - user_behavior_12 | created_time | timestamp with time zone | NO - user_behavior_13 | id | bigint | NO - user_behavior_13 | user_id | integer | YES - user_behavior_13 | behavior | text | YES - user_behavior_13 | behavior_time_date | text | NO - user_behavior_13 | created_time | timestamp with time zone | NO - user_behavior_14 | id | bigint | NO - user_behavior_14 | user_id | integer | YES - user_behavior_14 | behavior | text | YES - user_behavior_14 | behavior_time_date | text | NO - user_behavior_14 | created_time | timestamp with time zone | NO - user_behavior_15 | id | bigint | NO - user_behavior_15 | user_id | integer | YES - user_behavior_15 | behavior | text | YES - user_behavior_15 | behavior_time_date | text | NO - user_behavior_15 | created_time | timestamp with time zone | NO - user_behavior_2 | id | bigint | NO - user_behavior_2 | user_id | integer | YES - user_behavior_2 | behavior | text | YES - user_behavior_2 | behavior_time_date | text | NO - user_behavior_2 | created_time | timestamp with time zone | NO - user_behavior_3 | id | bigint | NO - user_behavior_3 | user_id | integer | YES - user_behavior_3 | behavior | text | YES - user_behavior_3 | behavior_time_date | text | NO - user_behavior_3 | created_time | timestamp with time zone | NO - user_behavior_4 | id | bigint | NO - user_behavior_4 | user_id | integer | YES - user_behavior_4 | behavior | text | YES - user_behavior_4 | behavior_time_date | text | NO - user_behavior_4 | created_time | timestamp with time zone | NO - user_behavior_5 | id | bigint | NO - user_behavior_5 | user_id | integer | YES - user_behavior_5 | behavior | text | YES - user_behavior_5 | behavior_time_date | text | NO - user_behavior_5 | created_time | timestamp with time zone | NO - user_behavior_6 | id | bigint | NO - user_behavior_6 | user_id | integer | YES - user_behavior_6 | behavior | text | YES - user_behavior_6 | behavior_time_date | text | NO - user_behavior_6 | created_time | timestamp with time zone | NO - user_behavior_7 | id | bigint | NO - user_behavior_7 | user_id | integer | YES - user_behavior_7 | behavior | text | YES - user_behavior_7 | behavior_time_date | text | NO - user_behavior_7 | created_time | timestamp with time zone | NO - user_behavior_8 | id | bigint | NO - user_behavior_8 | user_id | integer | YES - user_behavior_8 | behavior | text | YES - user_behavior_8 | behavior_time_date | text | NO - user_behavior_8 | created_time | timestamp with time zone | NO - user_behavior_9 | id | bigint | NO - user_behavior_9 | user_id | integer | YES - user_behavior_9 | behavior | text | YES - user_behavior_9 | behavior_time_date | text | NO - user_behavior_9 | created_time | timestamp with time zone | NO - user_behavior_buried_points | id | integer | NO - user_behavior_buried_points | burying_point_id | integer | YES - user_behavior_buried_points | burying_point_name | character varying | YES - user_behavior_buried_points | burying_point_sub_id | integer | YES - user_behavior_buried_points | burying_point_sub_name | character varying | YES - user_behavior_buried_points | account_id | integer | YES - user_behavior_buried_points | account_name | character varying | YES - user_behavior_buried_points | character_id | integer | YES - user_behavior_buried_points | character_name | character varying | YES - user_behavior_buried_points | active_time | integer | YES - user_behavior_buried_points | version_id | character varying | YES - user_behavior_buried_points | step_duration | integer | YES - user_behavior_buried_points | finish_status | character varying | YES - user_behavior_buried_points | season_package_id | integer | YES - user_behavior_buried_points | season_package_name | character varying | YES - user_behavior_buried_points | unit_id | integer | YES - user_behavior_buried_points | unit_name | character varying | YES - user_behavior_buried_points | lesson_id | integer | YES - user_behavior_buried_points | lesson_name | character varying | YES - user_behavior_buried_points | component_id | integer | YES - user_behavior_buried_points | component_name | character varying | YES - user_behavior_buried_points | c_type | character varying | YES - user_behavior_buried_points | c_id | character varying | YES - user_behavior_buried_points | learning_module_id | integer | YES - user_behavior_buried_points | learning_module_name | character varying | YES - user_behavior_buried_points | learning_point_id | integer | YES - user_behavior_buried_points | learning_point_name | character varying | YES - user_behavior_buried_points | card_id | integer | YES - user_behavior_buried_points | card_name | character varying | YES - user_behavior_buried_points | data_version | character varying | YES - user_behavior_buried_points | ex2 | text | YES - user_behavior_buried_points | ex3 | text | YES - user_behavior_buried_points | es_id | character varying | YES - user_behavior_count_tmp | behavior | text | YES - user_behavior_count_tmp | register_time_date | text | NO - user_behavior_count_tmp | count | integer | YES - user_detail_info | id | bigint | NO - user_detail_info | user_id | integer | YES - user_detail_info | latest_login_time | bigint | YES - user_detail_info | latest_active_time | bigint | YES - user_detail_info | created_time | timestamp with time zone | NO - user_done_duration | id | bigint | NO - user_done_duration | user_id | integer | YES - user_done_duration | chapter_id | integer | YES - user_done_duration | duration | integer | YES - user_done_duration | created_time | timestamp with time zone | NO - user_learning | id | bigint | NO - user_learning | user_id | integer | YES - user_learning | stat_date | text | NO - user_learning | learning_time | integer | YES - user_learning | created_time | timestamp with time zone | NO -(674 rows) - diff --git a/database/pg_test_full.txt b/database/pg_test_full.txt deleted file mode 100644 index bee2858..0000000 --- a/database/pg_test_full.txt +++ /dev/null @@ -1,628 +0,0 @@ - table_name | column_name | data_type | is_nullable -----------------------------------------+-------------------------+--------------------------+------------- - account_activity_count | id | bigint | NO - account_activity_count | time_period | integer | YES - account_activity_count | counts | integer | YES - account_activity_count | stat_date | text | NO - account_activity_count | created_time | timestamp with time zone | NO - account_behavior_count | id | bigint | NO - account_behavior_count | count | integer | YES - account_behavior_count | behavior | text | YES - account_behavior_count | behavior_group | text | NO - account_behavior_count | stat_date | text | NO - account_behavior_count | dt | text | NO - account_behavior_count | created_time | timestamp with time zone | NO - account_detail_info | id | bigint | NO - account_detail_info | account_id | integer | YES - account_detail_info | login_time | text | YES - account_detail_info | device | text | YES - account_detail_info | device_os | text | YES - account_detail_info | login_address | text | YES - account_detail_info | login_times | integer | YES - account_detail_info | created_time | timestamp with time zone | NO - account_device | id | bigint | NO - account_device | account_id | integer | YES - account_device | device | text | YES - account_device | created_time | timestamp with time zone | NO - account_login | id | bigint | NO - account_login | account_id | integer | YES - account_login | login_date | text | NO - account_login | created_time | timestamp with time zone | NO - account_login | status | integer | NO - account_paid_activity_count | id | integer | NO - account_paid_activity_count | time_period | integer | YES - account_paid_activity_count | counts | integer | YES - account_paid_activity_count | stat_date | text | NO - account_paid_activity_count | created_time | timestamp with time zone | NO - bi_refund_order | id | bigint | NO - bi_refund_order | account_id | bigint | YES - bi_refund_order | out_trade_no | character varying | YES - bi_refund_order | trade_no | character varying | YES - bi_refund_order | refund_amount | character varying | YES - bi_refund_order | created_at | timestamp with time zone | YES - bi_refund_order | updated_at | timestamp with time zone | YES - bi_refund_order | deleted_at | timestamp with time zone | YES - bi_refund_order | refund_amount_int | integer | NO - bi_refund_order | reason | text | NO - bi_refund_order | sale_channel | integer | NO - bi_refund_order | status | integer | NO - bi_refund_order | is_admin | boolean | NO - bi_refund_order | channel_refund_id | bigint | NO - bi_refund_order | refund_ticket_ids | text | YES - bi_refund_order | refund_type | integer | NO - bi_refund_order | refund_method | integer | NO - bi_refund_order | after_sale_no | text | YES - bi_user_chapter_play_record_0 | id | bigint | NO - bi_user_chapter_play_record_0 | user_id | bigint | YES - bi_user_chapter_play_record_0 | chapter_id | bigint | YES - bi_user_chapter_play_record_0 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_0 | play_status | smallint | YES - bi_user_chapter_play_record_0 | story_id | bigint | YES - bi_user_chapter_play_record_0 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_0 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_0 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_0 | map_position | text | YES - bi_user_chapter_play_record_0 | dialog_info | text | YES - bi_user_chapter_play_record_0 | level | character varying | YES - bi_user_chapter_play_record_1 | id | bigint | NO - bi_user_chapter_play_record_1 | user_id | bigint | YES - bi_user_chapter_play_record_1 | chapter_id | bigint | YES - bi_user_chapter_play_record_1 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_1 | play_status | smallint | YES - bi_user_chapter_play_record_1 | story_id | bigint | YES - bi_user_chapter_play_record_1 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_1 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_1 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_1 | map_position | text | YES - bi_user_chapter_play_record_1 | dialog_info | text | YES - bi_user_chapter_play_record_1 | level | character varying | YES - bi_user_chapter_play_record_2 | id | bigint | NO - bi_user_chapter_play_record_2 | user_id | bigint | YES - bi_user_chapter_play_record_2 | chapter_id | bigint | YES - bi_user_chapter_play_record_2 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_2 | play_status | smallint | YES - bi_user_chapter_play_record_2 | story_id | bigint | YES - bi_user_chapter_play_record_2 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_2 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_2 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_2 | map_position | text | YES - bi_user_chapter_play_record_2 | dialog_info | text | YES - bi_user_chapter_play_record_2 | level | character varying | YES - bi_user_chapter_play_record_3 | id | bigint | NO - bi_user_chapter_play_record_3 | user_id | bigint | YES - bi_user_chapter_play_record_3 | chapter_id | bigint | YES - bi_user_chapter_play_record_3 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_3 | play_status | smallint | YES - bi_user_chapter_play_record_3 | story_id | bigint | YES - bi_user_chapter_play_record_3 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_3 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_3 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_3 | map_position | text | YES - bi_user_chapter_play_record_3 | dialog_info | text | YES - bi_user_chapter_play_record_3 | level | character varying | YES - bi_user_chapter_play_record_4 | id | bigint | NO - bi_user_chapter_play_record_4 | user_id | bigint | YES - bi_user_chapter_play_record_4 | chapter_id | bigint | YES - bi_user_chapter_play_record_4 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_4 | play_status | smallint | YES - bi_user_chapter_play_record_4 | story_id | bigint | YES - bi_user_chapter_play_record_4 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_4 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_4 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_4 | map_position | text | YES - bi_user_chapter_play_record_4 | dialog_info | text | YES - bi_user_chapter_play_record_4 | level | character varying | YES - bi_user_chapter_play_record_5 | id | bigint | NO - bi_user_chapter_play_record_5 | user_id | bigint | YES - bi_user_chapter_play_record_5 | chapter_id | bigint | YES - bi_user_chapter_play_record_5 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_5 | play_status | smallint | YES - bi_user_chapter_play_record_5 | story_id | bigint | YES - bi_user_chapter_play_record_5 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_5 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_5 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_5 | map_position | text | YES - bi_user_chapter_play_record_5 | dialog_info | text | YES - bi_user_chapter_play_record_5 | level | character varying | YES - bi_user_chapter_play_record_6 | id | bigint | NO - bi_user_chapter_play_record_6 | user_id | bigint | YES - bi_user_chapter_play_record_6 | chapter_id | bigint | YES - bi_user_chapter_play_record_6 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_6 | play_status | smallint | YES - bi_user_chapter_play_record_6 | story_id | bigint | YES - bi_user_chapter_play_record_6 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_6 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_6 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_6 | map_position | text | YES - bi_user_chapter_play_record_6 | dialog_info | text | YES - bi_user_chapter_play_record_6 | level | character varying | YES - bi_user_chapter_play_record_7 | id | bigint | NO - bi_user_chapter_play_record_7 | user_id | bigint | YES - bi_user_chapter_play_record_7 | chapter_id | bigint | YES - bi_user_chapter_play_record_7 | chapter_unique_id | character varying | YES - bi_user_chapter_play_record_7 | play_status | smallint | YES - bi_user_chapter_play_record_7 | story_id | bigint | YES - bi_user_chapter_play_record_7 | created_at | timestamp with time zone | YES - bi_user_chapter_play_record_7 | updated_at | timestamp with time zone | YES - bi_user_chapter_play_record_7 | deleted_at | timestamp with time zone | YES - bi_user_chapter_play_record_7 | map_position | text | YES - bi_user_chapter_play_record_7 | dialog_info | text | YES - bi_user_chapter_play_record_7 | level | character varying | YES - bi_user_component_play_record_0 | id | bigint | NO - bi_user_component_play_record_0 | user_id | bigint | YES - bi_user_component_play_record_0 | chapter_id | bigint | YES - bi_user_component_play_record_0 | chapter_unique_id | character varying | YES - bi_user_component_play_record_0 | component_id | bigint | YES - bi_user_component_play_record_0 | sub_component_type | smallint | YES - bi_user_component_play_record_0 | component_type | smallint | YES - bi_user_component_play_record_0 | comp_type | character varying | YES - bi_user_component_play_record_0 | play_status | smallint | YES - bi_user_component_play_record_0 | pass_time | integer | YES - bi_user_component_play_record_0 | interval_time | integer | YES - bi_user_component_play_record_0 | read_word_count | integer | YES - bi_user_component_play_record_0 | speak_count | integer | YES - bi_user_component_play_record_0 | listen_sentence_count | integer | YES - bi_user_component_play_record_0 | write_word_count | integer | YES - bi_user_component_play_record_0 | created_at | timestamp with time zone | YES - bi_user_component_play_record_0 | updated_at | timestamp with time zone | YES - bi_user_component_play_record_0 | deleted_at | timestamp with time zone | YES - bi_user_component_play_record_0 | section_id | bigint | YES - bi_user_component_play_record_0 | component_unique_code | character varying | YES - bi_user_component_play_record_0 | play_result | character varying | YES - bi_user_component_play_record_0 | finish_question_count | integer | YES - bi_user_component_play_record_0 | skill_points | integer | YES - bi_user_component_play_record_0 | c_type | character varying | YES - bi_user_component_play_record_0 | c_id | character varying | YES - bi_user_component_play_record_0 | user_behavior_info | text | YES - bi_user_component_play_record_0 | behavior_analysis_info | text | YES - bi_user_component_play_record_0 | session_id | character varying | YES - bi_user_component_play_record_0 | listen_time | integer | YES - bi_user_component_play_record_0 | dialog_list | text | YES - bi_user_component_play_record_0 | level | character varying | YES - bi_user_component_play_record_1 | id | bigint | NO - bi_user_component_play_record_1 | user_id | bigint | YES - bi_user_component_play_record_1 | chapter_id | bigint | YES - bi_user_component_play_record_1 | chapter_unique_id | character varying | YES - bi_user_component_play_record_1 | component_id | bigint | YES - bi_user_component_play_record_1 | sub_component_type | smallint | YES - bi_user_component_play_record_1 | component_type | smallint | YES - bi_user_component_play_record_1 | comp_type | character varying | YES - bi_user_component_play_record_1 | play_status | smallint | YES - bi_user_component_play_record_1 | pass_time | integer | YES - bi_user_component_play_record_1 | interval_time | integer | YES - bi_user_component_play_record_1 | read_word_count | integer | YES - bi_user_component_play_record_1 | speak_count | integer | YES - bi_user_component_play_record_1 | listen_sentence_count | integer | YES - bi_user_component_play_record_1 | write_word_count | integer | YES - bi_user_component_play_record_1 | created_at | timestamp with time zone | YES - bi_user_component_play_record_1 | updated_at | timestamp with time zone | YES - bi_user_component_play_record_1 | deleted_at | timestamp with time zone | YES - bi_user_component_play_record_1 | section_id | bigint | YES - bi_user_component_play_record_1 | component_unique_code | character varying | YES - bi_user_component_play_record_1 | play_result | character varying | YES - bi_user_component_play_record_1 | finish_question_count | integer | YES - bi_user_component_play_record_1 | skill_points | integer | YES - bi_user_component_play_record_1 | c_type | character varying | YES - bi_user_component_play_record_1 | c_id | character varying | YES - bi_user_component_play_record_1 | user_behavior_info | text | YES - bi_user_component_play_record_1 | behavior_analysis_info | text | YES - bi_user_component_play_record_1 | session_id | character varying | YES - bi_user_component_play_record_1 | listen_time | integer | YES - bi_user_component_play_record_1 | dialog_list | text | YES - bi_user_component_play_record_1 | level | character varying | YES - bi_user_component_play_record_2 | id | bigint | NO - bi_user_component_play_record_2 | user_id | bigint | YES - bi_user_component_play_record_2 | chapter_id | bigint | YES - bi_user_component_play_record_2 | chapter_unique_id | character varying | YES - bi_user_component_play_record_2 | component_id | bigint | YES - bi_user_component_play_record_2 | sub_component_type | smallint | YES - bi_user_component_play_record_2 | component_type | smallint | YES - bi_user_component_play_record_2 | comp_type | character varying | YES - bi_user_component_play_record_2 | play_status | smallint | YES - bi_user_component_play_record_2 | pass_time | integer | YES - bi_user_component_play_record_2 | interval_time | integer | YES - bi_user_component_play_record_2 | read_word_count | integer | YES - bi_user_component_play_record_2 | speak_count | integer | YES - bi_user_component_play_record_2 | listen_sentence_count | integer | YES - bi_user_component_play_record_2 | write_word_count | integer | YES - bi_user_component_play_record_2 | created_at | timestamp with time zone | YES - bi_user_component_play_record_2 | updated_at | timestamp with time zone | YES - bi_user_component_play_record_2 | deleted_at | timestamp with time zone | YES - bi_user_component_play_record_2 | section_id | bigint | YES - bi_user_component_play_record_2 | component_unique_code | character varying | YES - bi_user_component_play_record_2 | play_result | character varying | YES - bi_user_component_play_record_2 | finish_question_count | integer | YES - bi_user_component_play_record_2 | skill_points | integer | YES - bi_user_component_play_record_2 | c_type | character varying | YES - bi_user_component_play_record_2 | c_id | character varying | YES - bi_user_component_play_record_2 | user_behavior_info | text | YES - bi_user_component_play_record_2 | behavior_analysis_info | text | YES - bi_user_component_play_record_2 | session_id | character varying | YES - bi_user_component_play_record_2 | listen_time | integer | YES - bi_user_component_play_record_2 | dialog_list | text | YES - bi_user_component_play_record_2 | level | character varying | YES - bi_user_component_play_record_3 | id | bigint | NO - bi_user_component_play_record_3 | user_id | bigint | YES - bi_user_component_play_record_3 | chapter_id | bigint | YES - bi_user_component_play_record_3 | chapter_unique_id | character varying | YES - bi_user_component_play_record_3 | component_id | bigint | YES - bi_user_component_play_record_3 | sub_component_type | smallint | YES - bi_user_component_play_record_3 | component_type | smallint | YES - bi_user_component_play_record_3 | comp_type | character varying | YES - bi_user_component_play_record_3 | play_status | smallint | YES - bi_user_component_play_record_3 | pass_time | integer | YES - bi_user_component_play_record_3 | interval_time | integer | YES - bi_user_component_play_record_3 | read_word_count | integer | YES - bi_user_component_play_record_3 | speak_count | integer | YES - bi_user_component_play_record_3 | listen_sentence_count | integer | YES - bi_user_component_play_record_3 | write_word_count | integer | YES - bi_user_component_play_record_3 | created_at | timestamp with time zone | YES - bi_user_component_play_record_3 | updated_at | timestamp with time zone | YES - bi_user_component_play_record_3 | deleted_at | timestamp with time zone | YES - bi_user_component_play_record_3 | section_id | bigint | YES - bi_user_component_play_record_3 | component_unique_code | character varying | YES - bi_user_component_play_record_3 | play_result | character varying | YES - bi_user_component_play_record_3 | finish_question_count | integer | YES - bi_user_component_play_record_3 | skill_points | integer | YES - bi_user_component_play_record_3 | c_type | character varying | YES - bi_user_component_play_record_3 | c_id | character varying | YES - bi_user_component_play_record_3 | user_behavior_info | text | YES - bi_user_component_play_record_3 | behavior_analysis_info | text | YES - bi_user_component_play_record_3 | session_id | character varying | YES - bi_user_component_play_record_3 | listen_time | integer | YES - bi_user_component_play_record_3 | dialog_list | text | YES - bi_user_component_play_record_3 | level | character varying | YES - bi_user_component_play_record_4 | id | bigint | NO - bi_user_component_play_record_4 | user_id | bigint | YES - bi_user_component_play_record_4 | chapter_id | bigint | YES - bi_user_component_play_record_4 | chapter_unique_id | character varying | YES - bi_user_component_play_record_4 | component_id | bigint | YES - bi_user_component_play_record_4 | sub_component_type | smallint | YES - bi_user_component_play_record_4 | component_type | smallint | YES - bi_user_component_play_record_4 | comp_type | character varying | YES - bi_user_component_play_record_4 | play_status | smallint | YES - bi_user_component_play_record_4 | pass_time | integer | YES - bi_user_component_play_record_4 | interval_time | integer | YES - bi_user_component_play_record_4 | read_word_count | integer | YES - bi_user_component_play_record_4 | speak_count | integer | YES - bi_user_component_play_record_4 | listen_sentence_count | integer | YES - bi_user_component_play_record_4 | write_word_count | integer | YES - bi_user_component_play_record_4 | created_at | timestamp with time zone | YES - bi_user_component_play_record_4 | updated_at | timestamp with time zone | YES - bi_user_component_play_record_4 | deleted_at | timestamp with time zone | YES - bi_user_component_play_record_4 | section_id | bigint | YES - bi_user_component_play_record_4 | component_unique_code | character varying | YES - bi_user_component_play_record_4 | play_result | character varying | YES - bi_user_component_play_record_4 | finish_question_count | integer | YES - bi_user_component_play_record_4 | skill_points | integer | YES - bi_user_component_play_record_4 | c_type | character varying | YES - bi_user_component_play_record_4 | c_id | character varying | YES - bi_user_component_play_record_4 | user_behavior_info | text | YES - bi_user_component_play_record_4 | behavior_analysis_info | text | YES - bi_user_component_play_record_4 | session_id | character varying | YES - bi_user_component_play_record_4 | listen_time | integer | YES - bi_user_component_play_record_4 | dialog_list | text | YES - bi_user_component_play_record_4 | level | character varying | YES - bi_user_component_play_record_5 | id | bigint | NO - bi_user_component_play_record_5 | user_id | bigint | YES - bi_user_component_play_record_5 | chapter_id | bigint | YES - bi_user_component_play_record_5 | chapter_unique_id | character varying | YES - bi_user_component_play_record_5 | component_id | bigint | YES - bi_user_component_play_record_5 | sub_component_type | smallint | YES - bi_user_component_play_record_5 | component_type | smallint | YES - bi_user_component_play_record_5 | comp_type | character varying | YES - bi_user_component_play_record_5 | play_status | smallint | YES - bi_user_component_play_record_5 | pass_time | integer | YES - bi_user_component_play_record_5 | interval_time | integer | YES - bi_user_component_play_record_5 | read_word_count | integer | YES - bi_user_component_play_record_5 | speak_count | integer | YES - bi_user_component_play_record_5 | listen_sentence_count | integer | YES - bi_user_component_play_record_5 | write_word_count | integer | YES - bi_user_component_play_record_5 | created_at | timestamp with time zone | YES - bi_user_component_play_record_5 | updated_at | timestamp with time zone | YES - bi_user_component_play_record_5 | deleted_at | timestamp with time zone | YES - bi_user_component_play_record_5 | section_id | bigint | YES - bi_user_component_play_record_5 | component_unique_code | character varying | YES - bi_user_component_play_record_5 | play_result | character varying | YES - bi_user_component_play_record_5 | finish_question_count | integer | YES - bi_user_component_play_record_5 | skill_points | integer | YES - bi_user_component_play_record_5 | c_type | character varying | YES - bi_user_component_play_record_5 | c_id | character varying | YES - bi_user_component_play_record_5 | user_behavior_info | text | YES - bi_user_component_play_record_5 | behavior_analysis_info | text | YES - bi_user_component_play_record_5 | session_id | character varying | YES - bi_user_component_play_record_5 | listen_time | integer | YES - bi_user_component_play_record_5 | dialog_list | text | YES - bi_user_component_play_record_5 | level | character varying | YES - bi_user_component_play_record_6 | id | bigint | NO - bi_user_component_play_record_6 | user_id | bigint | YES - bi_user_component_play_record_6 | chapter_id | bigint | YES - bi_user_component_play_record_6 | chapter_unique_id | character varying | YES - bi_user_component_play_record_6 | component_id | bigint | YES - bi_user_component_play_record_6 | sub_component_type | smallint | YES - bi_user_component_play_record_6 | component_type | smallint | YES - bi_user_component_play_record_6 | comp_type | character varying | YES - bi_user_component_play_record_6 | play_status | smallint | YES - bi_user_component_play_record_6 | pass_time | integer | YES - bi_user_component_play_record_6 | interval_time | integer | YES - bi_user_component_play_record_6 | read_word_count | integer | YES - bi_user_component_play_record_6 | speak_count | integer | YES - bi_user_component_play_record_6 | listen_sentence_count | integer | YES - bi_user_component_play_record_6 | write_word_count | integer | YES - bi_user_component_play_record_6 | created_at | timestamp with time zone | YES - bi_user_component_play_record_6 | updated_at | timestamp with time zone | YES - bi_user_component_play_record_6 | deleted_at | timestamp with time zone | YES - bi_user_component_play_record_6 | section_id | bigint | YES - bi_user_component_play_record_6 | component_unique_code | character varying | YES - bi_user_component_play_record_6 | play_result | character varying | YES - bi_user_component_play_record_6 | finish_question_count | integer | YES - bi_user_component_play_record_6 | skill_points | integer | YES - bi_user_component_play_record_6 | c_type | character varying | YES - bi_user_component_play_record_6 | c_id | character varying | YES - bi_user_component_play_record_6 | user_behavior_info | text | YES - bi_user_component_play_record_6 | behavior_analysis_info | text | YES - bi_user_component_play_record_6 | session_id | character varying | YES - bi_user_component_play_record_6 | listen_time | integer | YES - bi_user_component_play_record_6 | dialog_list | text | YES - bi_user_component_play_record_6 | level | character varying | YES - bi_user_course_detail | id | integer | NO - bi_user_course_detail | created_at | timestamp with time zone | YES - bi_user_course_detail | updated_at | timestamp with time zone | YES - bi_user_course_detail | account_id | integer | YES - bi_user_course_detail | user_id | integer | YES - bi_user_course_detail | course_level | text | YES - bi_user_course_detail | active_time | timestamp with time zone | YES - bi_user_course_detail | expire_time | timestamp with time zone | YES - bi_user_course_detail | latest_unit_index | integer | YES - bi_user_course_detail | latest_lesson_index | integer | YES - bi_user_course_detail | learn_duration | integer | YES - bi_user_course_detail | last_learn_time | timestamp with time zone | YES - bi_user_course_detail | deleted_at | timestamp with time zone | YES - bi_user_unit_challenge_question_result | id | bigint | NO - bi_user_unit_challenge_question_result | user_id | bigint | YES - bi_user_unit_challenge_question_result | story_id | bigint | YES - bi_user_unit_challenge_question_result | score | integer | YES - bi_user_unit_challenge_question_result | score_text | character varying | YES - bi_user_unit_challenge_question_result | question_list | text | YES - bi_user_unit_challenge_question_result | sp_value | integer | YES - bi_user_unit_challenge_question_result | created_at | timestamp with time zone | YES - bi_user_unit_challenge_question_result | updated_at | timestamp with time zone | YES - bi_user_unit_challenge_question_result | deleted_at | timestamp with time zone | YES - bi_user_unit_challenge_question_result | category | character varying | YES - bi_user_unit_challenge_question_result | exp | integer | YES - bi_user_unit_challenge_question_result | play_time | bigint | YES - bi_user_unit_challenge_question_result | read_word_count | bigint | YES - bi_user_unit_challenge_question_result | listen_time | bigint | YES - bi_user_unit_challenge_question_result | speak_count | bigint | YES - bi_user_unit_challenge_question_result | level | character varying | YES - bi_user_unit_summary_km_result | id | bigint | NO - bi_user_unit_summary_km_result | user_id | bigint | YES - bi_user_unit_summary_km_result | story_id | bigint | YES - bi_user_unit_summary_km_result | km_id | character varying | YES - bi_user_unit_summary_km_result | km_type | character varying | YES - bi_user_unit_summary_km_result | score_text | character varying | YES - bi_user_unit_summary_km_result | sp_value | integer | YES - bi_user_unit_summary_km_result | created_at | timestamp with time zone | YES - bi_user_unit_summary_km_result | updated_at | timestamp with time zone | YES - bi_user_unit_summary_km_result | deleted_at | timestamp with time zone | YES - bi_user_unit_summary_km_result | play_time | bigint | YES - bi_user_unit_summary_km_result | sum_explain | text | YES - bi_user_unit_summary_km_result | level | character varying | YES - bi_vala_app_account | id | bigint | NO - bi_vala_app_account | tel | character varying | YES - bi_vala_app_account | tel_encrypt | character varying | YES - bi_vala_app_account | name | character varying | YES - bi_vala_app_account | id_card | character varying | YES - bi_vala_app_account | status | smallint | NO - bi_vala_app_account | pay_status | integer | YES - bi_vala_app_account | login_times | integer | YES - bi_vala_app_account | remark | text | YES - bi_vala_app_account | key_from | character varying | NO - bi_vala_app_account | created_at | timestamp with time zone | YES - bi_vala_app_account | updated_at | timestamp with time zone | YES - bi_vala_app_account | deleted_at | timestamp with time zone | YES - bi_vala_app_account | download_channel | text | YES - bi_vala_app_character | id | bigint | NO - bi_vala_app_character | account_id | bigint | YES - bi_vala_app_character | nickname | character varying | YES - bi_vala_app_character | gender | smallint | YES - bi_vala_app_character | birthday | character varying | YES - bi_vala_app_character | avatar | text | YES - bi_vala_app_character | spine_name | character varying | YES - bi_vala_app_character | latest_login | timestamp with time zone | YES - bi_vala_app_character | reputation | character | YES - bi_vala_app_character | robots_cnt | integer | YES - bi_vala_app_character | head_image | character varying | YES - bi_vala_app_character | status | smallint | YES - bi_vala_app_character | purchase_season_package | text | YES - bi_vala_app_character | total_sp_point | integer | YES - bi_vala_app_character | balance_sp_point | integer | YES - bi_vala_app_character | point | integer | YES - bi_vala_app_character | pay_status | smallint | YES - bi_vala_app_character | created_at | timestamp with time zone | YES - bi_vala_app_character | updated_at | timestamp with time zone | YES - bi_vala_app_character | deleted_at | timestamp with time zone | YES - bi_vala_app_character | mood | integer | YES - bi_vala_app_character | mood_start_time | bigint | YES - bi_vala_order | id | bigint | NO - bi_vala_order | account_id | bigint | YES - bi_vala_order | out_trade_no | character varying | YES - bi_vala_order | trade_no | character varying | YES - bi_vala_order | pay_amount | character varying | YES - bi_vala_order | goods_id | bigint | YES - bi_vala_order | order_status | integer | YES - bi_vala_order | order_status_comment | text | YES - bi_vala_order | is_used | integer | YES - bi_vala_order | sale_channel | integer | YES - bi_vala_order | created_at | timestamp with time zone | YES - bi_vala_order | updated_at | timestamp with time zone | YES - bi_vala_order | deleted_at | timestamp with time zone | YES - bi_vala_order | pay_channel | integer | YES - bi_vala_order | pay_amount_int | integer | YES - bi_vala_order | pay_success_date | timestamp with time zone | YES - bi_vala_order | quantity | integer | NO - bi_vala_order | goods_name | text | NO - bi_vala_order | goods_apply_package_ids | text | NO - bi_vala_order | key_from | text | NO - bi_vala_order | expire_days | integer | YES - data_sync_config | id | bigint | NO - data_sync_config | job_name | text | NO - data_sync_config | database_name | text | YES - data_sync_config | origin_table | text | YES - data_sync_config | dest_table | text | YES - data_sync_config | deal_time | timestamp with time zone | YES - data_sync_config | full_id | bigint | NO - data_sync_config | created_at | timestamp with time zone | NO - data_sync_config | updated_at | timestamp with time zone | YES - feishu_table_record | id | bigint | NO - feishu_table_record | data_table | text | NO - feishu_table_record | record_id | text | YES - feishu_table_record | created_at | timestamp with time zone | NO - growth_activity_behavior | id | integer | NO - growth_activity_behavior | es_id | character varying | NO - growth_activity_behavior | account_id | integer | YES - growth_activity_behavior | account_name | character varying | YES - growth_activity_behavior | activity | text | YES - growth_activity_behavior | behavior | character varying | YES - growth_activity_behavior | created_at | timestamp with time zone | YES - growth_activity_behavior | info | text | YES - growth_activity_behavior | source | character varying | YES - growth_activity_behavior | sub_behavior | character varying | YES - growth_activity_behavior | user_id | integer | YES - growth_activity_behavior | user_name | character varying | YES - user_behavior_0 | id | bigint | NO - user_behavior_0 | user_id | integer | YES - user_behavior_0 | behavior | text | YES - user_behavior_0 | behavior_time_date | text | NO - user_behavior_0 | created_time | timestamp with time zone | NO - user_behavior_1 | id | bigint | NO - user_behavior_1 | user_id | integer | YES - user_behavior_1 | behavior | text | YES - user_behavior_1 | behavior_time_date | text | NO - user_behavior_1 | created_time | timestamp with time zone | NO - user_behavior_10 | id | bigint | NO - user_behavior_10 | user_id | integer | YES - user_behavior_10 | behavior | text | YES - user_behavior_10 | behavior_time_date | text | NO - user_behavior_10 | created_time | timestamp with time zone | NO - user_behavior_11 | id | bigint | NO - user_behavior_11 | user_id | integer | YES - user_behavior_11 | behavior | text | YES - user_behavior_11 | behavior_time_date | text | NO - user_behavior_11 | created_time | timestamp with time zone | NO - user_behavior_12 | id | bigint | NO - user_behavior_12 | user_id | integer | YES - user_behavior_12 | behavior | text | YES - user_behavior_12 | behavior_time_date | text | NO - user_behavior_12 | created_time | timestamp with time zone | NO - user_behavior_13 | id | bigint | NO - user_behavior_13 | user_id | integer | YES - user_behavior_13 | behavior | text | YES - user_behavior_13 | behavior_time_date | text | NO - user_behavior_13 | created_time | timestamp with time zone | NO - user_behavior_14 | id | bigint | NO - user_behavior_14 | user_id | integer | YES - user_behavior_14 | behavior | text | YES - user_behavior_14 | behavior_time_date | text | NO - user_behavior_14 | created_time | timestamp with time zone | NO - user_behavior_15 | id | bigint | NO - user_behavior_15 | user_id | integer | YES - user_behavior_15 | behavior | text | YES - user_behavior_15 | behavior_time_date | text | NO - user_behavior_15 | created_time | timestamp with time zone | NO - user_behavior_2 | id | bigint | NO - user_behavior_2 | user_id | integer | YES - user_behavior_2 | behavior | text | YES - user_behavior_2 | behavior_time_date | text | NO - user_behavior_2 | created_time | timestamp with time zone | NO - user_behavior_3 | id | bigint | NO - user_behavior_3 | user_id | integer | YES - user_behavior_3 | behavior | text | YES - user_behavior_3 | behavior_time_date | text | NO - user_behavior_3 | created_time | timestamp with time zone | NO - user_behavior_4 | id | bigint | NO - user_behavior_4 | user_id | integer | YES - user_behavior_4 | behavior | text | YES - user_behavior_4 | behavior_time_date | text | NO - user_behavior_4 | created_time | timestamp with time zone | NO - user_behavior_5 | id | bigint | NO - user_behavior_5 | user_id | integer | YES - user_behavior_5 | behavior | text | YES - user_behavior_5 | behavior_time_date | text | NO - user_behavior_5 | created_time | timestamp with time zone | NO - user_behavior_6 | id | bigint | NO - user_behavior_6 | user_id | integer | YES - user_behavior_6 | behavior | text | YES - user_behavior_6 | behavior_time_date | text | NO - user_behavior_6 | created_time | timestamp with time zone | NO - user_behavior_7 | id | bigint | NO - user_behavior_7 | user_id | integer | YES - user_behavior_7 | behavior | text | YES - user_behavior_7 | behavior_time_date | text | NO - user_behavior_7 | created_time | timestamp with time zone | NO - user_behavior_8 | id | bigint | NO - user_behavior_8 | user_id | integer | YES - user_behavior_8 | behavior | text | YES - user_behavior_8 | behavior_time_date | text | NO - user_behavior_8 | created_time | timestamp with time zone | NO - user_behavior_9 | id | bigint | NO - user_behavior_9 | user_id | integer | YES - user_behavior_9 | behavior | text | YES - user_behavior_9 | behavior_time_date | text | NO - user_behavior_9 | created_time | timestamp with time zone | NO - user_behavior_buried_points | id | integer | NO - user_behavior_buried_points | burying_point_id | integer | YES - user_behavior_buried_points | burying_point_name | character varying | YES - user_behavior_buried_points | burying_point_sub_id | integer | YES - user_behavior_buried_points | burying_point_sub_name | character varying | YES - user_behavior_buried_points | account_id | integer | YES - user_behavior_buried_points | account_name | character varying | YES - user_behavior_buried_points | character_id | integer | YES - user_behavior_buried_points | character_name | character varying | YES - user_behavior_buried_points | active_time | integer | YES - user_behavior_buried_points | version_id | character varying | YES - user_behavior_buried_points | step_duration | integer | YES - user_behavior_buried_points | finish_status | character varying | YES - user_behavior_buried_points | season_package_id | integer | YES - user_behavior_buried_points | season_package_name | character varying | YES - user_behavior_buried_points | unit_id | integer | YES - user_behavior_buried_points | unit_name | character varying | YES - user_behavior_buried_points | lesson_id | integer | YES - user_behavior_buried_points | lesson_name | character varying | YES - user_behavior_buried_points | component_id | integer | YES - user_behavior_buried_points | component_name | character varying | YES - user_behavior_buried_points | c_type | character varying | YES - user_behavior_buried_points | c_id | character varying | YES - user_behavior_buried_points | learning_module_id | integer | YES - user_behavior_buried_points | learning_module_name | character varying | YES - user_behavior_buried_points | learning_point_id | integer | YES - user_behavior_buried_points | learning_point_name | character varying | YES - user_behavior_buried_points | card_id | integer | YES - user_behavior_buried_points | card_name | character varying | YES - user_behavior_buried_points | data_version | character varying | YES - user_behavior_buried_points | ex2 | text | YES - user_behavior_buried_points | ex3 | text | YES - user_behavior_buried_points | es_id | character varying | YES - user_behavior_count_tmp | behavior | text | YES - user_behavior_count_tmp | register_time_date | text | NO - user_behavior_count_tmp | count | integer | YES - user_detail_info | id | bigint | NO - user_detail_info | user_id | integer | YES - user_detail_info | latest_login_time | bigint | YES - user_detail_info | latest_active_time | bigint | YES - user_detail_info | created_time | timestamp with time zone | NO - user_info | id | bigint | NO - user_info | account_id | integer | YES - user_info | user_id | integer | YES - user_info | gender | integer | YES - user_info | birthday | text | YES - user_info | register_time_date | text | NO - user_info | created_time | timestamp with time zone | NO - user_info | register_time | text | YES - user_learning | id | bigint | NO - user_learning | user_id | integer | YES - user_learning | stat_date | text | NO - user_learning | learning_time | integer | YES - user_learning | created_time | timestamp with time zone | NO -(624 rows) - diff --git a/database/数据库表结构总览_完整版.md b/database/数据库表结构总览_完整版.md deleted file mode 100644 index 5c1c553..0000000 --- a/database/数据库表结构总览_完整版.md +++ /dev/null @@ -1,643 +0,0 @@ -# 瓦拉英语业务数据库表结构总览(完整版) -## 使用说明 -- 每个表开头的【表用途注释】行可填写该表的业务用途说明 -- 每个字段的【注释】列可填写该字段的业务含义说明 -- 所有空注释位置均可直接编辑补充 ---- -## 一、线上PostgreSQL库(正式环境用户行为数据) -- **地址:** bj-postgres-16pob4sg.sql.tencentcdb.com:28591 -- **库名:** vala_bi -- **权限:** 只读 ---- -### account_activity_count -【表用途注释:】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| time_period | integer | | -| counts | integer | | -| stat_date | text | | -| created_time | timestamp with time zone | | - -### account_detail_info -【表用途注释:】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| account_id | integer | 关联用户账户表id | -| login_time | text | 登录时间 | -| device | text | 登录设备 | -| device_os | text | 设备操作系统 | -| login_address | text | 登录地点 | -| login_times | integer | 登录次数 | -| created_time | timestamp with time zone | | -| phone_login_times | integer | 手机号登录次数 | - -### account_device -【表用途注释:存储用户使用过的设备信息】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| account_id | integer | 关联用户账户表id | -| device | text | 设备标识 | -| created_time | timestamp with time zone | | - -### account_login -【表用途注释:用户登录记录表】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| account_id | integer | 关联用户账户表id | -| login_date | text | 登录日期 | -| created_time | timestamp with time zone | | -| status | integer | 登录状态 | - -### account_paid_activity_count -【表用途注释:】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | integer | | -| time_period | integer | | -| counts | integer | | -| stat_date | text | | -| created_time | timestamp with time zone | | - -### bi_level_unit_lesson -【表用途注释:课程层级映射表,存储等级、季、单元、课时的对应关系】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| course_level | character varying | 课程等级(A1/L1、A2/L2等) | -| course_season | character varying | 课程季 | -| course_unit | character varying | 课程单元 | -| course_lesson | character varying | 课程课时 | -| id | integer | | - -### bi_refund_order -【表用途注释:退款订单表,存储所有退款订单的信息】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| account_id | bigint | 关联下单用户账户id | -| out_trade_no | character varying | 商户订单号,关联订单表out_trade_no | -| trade_no | character varying | 支付平台交易号 | -| refund_amount | character varying | 退款金额(字符串格式,单位元) | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| refund_amount_int | integer | 退款金额(整数格式,单位分) | -| reason | text | 退款原因 | -| sale_channel | integer | 销售渠道 | -| status | integer | 退款状态:3=退款成功 | -| is_admin | boolean | 是否为后台操作退款 | -| channel_refund_id | bigint | 渠道退款id | -| refund_ticket_ids | text | 关联优惠券id | -| refund_type | integer | 退款类型 | -| refund_method | integer | 退款方式 | -| after_sale_no | text | 售后单号 | - -### bi_user_chapter_play_record_0 -【表用途注释:用户课时完成记录表(分表0),存储用户每个课时的学习完成情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识 | -| play_status | smallint | 课时完成状态:1=正常完成 | -| story_id | bigint | 关联故事id | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| map_position | text | 地图位置 | -| dialog_info | text | 对话信息 | -| level | character varying | 课程等级 | - -### bi_user_chapter_play_record_1 -【表用途注释:用户课时完成记录表(分表1),存储用户每个课时的学习完成情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识 | -| play_status | smallint | 课时完成状态:1=正常完成 | -| story_id | bigint | 关联故事id | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| map_position | text | 地图位置 | -| dialog_info | text | 对话信息 | -| level | character varying | 课程等级 | - -### bi_user_chapter_play_record_2 -【表用途注释:用户课时完成记录表(分表2),存储用户每个课时的学习完成情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识 | -| play_status | smallint | 课时完成状态:1=正常完成 | -| story_id | bigint | 关联故事id | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| map_position | text | 地图位置 | -| dialog_info | text | 对话信息 | -| level | character varying | 课程等级 | - -### bi_user_chapter_play_record_3 -【表用途注释:用户课时完成记录表(分表3),存储用户每个课时的学习完成情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识 | -| play_status | smallint | 课时完成状态:1=正常完成 | -| story_id | bigint | 关联故事id | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| map_position | text | 地图位置 | -| dialog_info | text | 对话信息 | -| level | character varying | 课程等级 | - -### bi_user_chapter_play_record_4 -【表用途注释:用户课时完成记录表(分表4),存储用户每个课时的学习完成情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识 | -| play_status | smallint | 课时完成状态:1=正常完成 | -| story_id | bigint | 关联故事id | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| map_position | text | 地图位置 | -| dialog_info | text | 对话信息 | -| level | character varying | 课程等级 | - -### bi_user_chapter_play_record_5 -【表用途注释:用户课时完成记录表(分表5),存储用户每个课时的学习完成情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识 | -| play_status | smallint | 课时完成状态:1=正常完成 | -| story_id | bigint | 关联故事id | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| map_position | text | 地图位置 | -| dialog_info | text | 对话信息 | -| level | character varying | 课程等级 | - -### bi_user_chapter_play_record_6 -【表用途注释:用户课时完成记录表(分表6),存储用户每个课时的学习完成情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识 | -| play_status | smallint | 课时完成状态:1=正常完成 | -| story_id | bigint | 关联故事id | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| map_position | text | 地图位置 | -| dialog_info | text | 对话信息 | -| level | character varying | 课程等级 | - -### bi_user_chapter_play_record_7 -【表用途注释:用户课时完成记录表(分表7),存储用户每个课时的学习完成情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识 | -| play_status | smallint | 课时完成状态:1=正常完成 | -| story_id | bigint | 关联故事id | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| map_position | text | 地图位置 | -| dialog_info | text | 对话信息 | -| level | character varying | 课程等级 | - -### bi_user_component_play_record_0 -【表用途注释:用户组件完成记录表(分表0),存储用户每个课时下每个组件的学习情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识,关联课时记录表chapter_unique_id | -| component_id | bigint | 组件id | -| sub_component_type | smallint | 子组件类型 | -| component_type | smallint | 组件类型 | -| comp_type | character varying | 组件类型标识 | -| play_status | smallint | 组件完成状态 | -| pass_time | integer | 通过时间 | -| interval_time | integer | 组件学习耗时(单位:毫秒) | -| read_word_count | integer | 读单词数量 | -| speak_count | integer | 开口次数 | -| listen_sentence_count | integer | 听句子数量 | -| write_word_count | integer | 写单词数量 | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| section_id | bigint | 关联章节id | -| component_unique_code | character varying | 组件唯一编码 | -| play_result | character varying | 组件完成结果(Perfect/Good/Oops等) | -| finish_question_count | integer | 完成题目数量 | -| skill_points | integer | 获得技能点数 | -| c_type | character varying | 组件类型 | -| c_id | character varying | 组件配置id | -| user_behavior_info | text | 用户行为信息 | -| behavior_analysis_info | text | 行为分析信息 | -| session_id | character varying | 会话id | -| listen_time | integer | 听力时长 | -| dialog_list | text | 对话列表 | -| level | character varying | 课程等级 | - -### bi_user_component_play_record_1 -【表用途注释:用户组件完成记录表(分表1),存储用户每个课时下每个组件的学习情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识,关联课时记录表chapter_unique_id | -| component_id | bigint | 组件id | -| sub_component_type | smallint | 子组件类型 | -| component_type | smallint | 组件类型 | -| comp_type | character varying | 组件类型标识 | -| play_status | smallint | 组件完成状态 | -| pass_time | integer | 通过时间 | -| interval_time | integer | 组件学习耗时(单位:毫秒) | -| read_word_count | integer | 读单词数量 | -| speak_count | integer | 开口次数 | -| listen_sentence_count | integer | 听句子数量 | -| write_word_count | integer | 写单词数量 | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| section_id | bigint | 关联章节id | -| component_unique_code | character varying | 组件唯一编码 | -| play_result | character varying | 组件完成结果(Perfect/Good/Oops等) | -| finish_question_count | integer | 完成题目数量 | -| skill_points | integer | 获得技能点数 | -| c_type | character varying | 组件类型 | -| c_id | character varying | 组件配置id | -| user_behavior_info | text | 用户行为信息 | -| behavior_analysis_info | text | 行为分析信息 | -| session_id | character varying | 会话id | -| listen_time | integer | 听力时长 | -| dialog_list | text | 对话列表 | -| level | character varying | 课程等级 | - -### bi_user_component_play_record_2 -【表用途注释:用户组件完成记录表(分表2),存储用户每个课时下每个组件的学习情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识,关联课时记录表chapter_unique_id | -| component_id | bigint | 组件id | -| sub_component_type | smallint | 子组件类型 | -| component_type | smallint | 组件类型 | -| comp_type | character varying | 组件类型标识 | -| play_status | smallint | 组件完成状态 | -| pass_time | integer | 通过时间 | -| interval_time | integer | 组件学习耗时(单位:毫秒) | -| read_word_count | integer | 读单词数量 | -| speak_count | integer | 开口次数 | -| listen_sentence_count | integer | 听句子数量 | -| write_word_count | integer | 写单词数量 | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| section_id | bigint | 关联章节id | -| component_unique_code | character varying | 组件唯一编码 | -| play_result | character varying | 组件完成结果(Perfect/Good/Oops等) | -| finish_question_count | integer | 完成题目数量 | -| skill_points | integer | 获得技能点数 | -| c_type | character varying | 组件类型 | -| c_id | character varying | 组件配置id | -| user_behavior_info | text | 用户行为信息 | -| behavior_analysis_info | text | 行为分析信息 | -| session_id | character varying | 会话id | -| listen_time | integer | 听力时长 | -| dialog_list | text | 对话列表 | -| level | character varying | 课程等级 | - -### bi_user_component_play_record_3 -【表用途注释:用户组件完成记录表(分表3),存储用户每个课时下每个组件的学习情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识,关联课时记录表chapter_unique_id | -| component_id | bigint | 组件id | -| sub_component_type | smallint | 子组件类型 | -| component_type | smallint | 组件类型 | -| comp_type | character varying | 组件类型标识 | -| play_status | smallint | 组件完成状态 | -| pass_time | integer | 通过时间 | -| interval_time | integer | 组件学习耗时(单位:毫秒) | -| read_word_count | integer | 读单词数量 | -| speak_count | integer | 开口次数 | -| listen_sentence_count | integer | 听句子数量 | -| write_word_count | integer | 写单词数量 | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| section_id | bigint | 关联章节id | -| component_unique_code | character varying | 组件唯一编码 | -| play_result | character varying | 组件完成结果(Perfect/Good/Oops等) | -| finish_question_count | integer | 完成题目数量 | -| skill_points | integer | 获得技能点数 | -| c_type | character varying | 组件类型 | -| c_id | character varying | 组件配置id | -| user_behavior_info | text | 用户行为信息 | -| behavior_analysis_info | text | 行为分析信息 | -| session_id | character varying | 会话id | -| listen_time | integer | 听力时长 | -| dialog_list | text | 对话列表 | -| level | character varying | 课程等级 | - -### bi_user_component_play_record_4 -【表用途注释:用户组件完成记录表(分表4),存储用户每个课时下每个组件的学习情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识,关联课时记录表chapter_unique_id | -| component_id | bigint | 组件id | -| sub_component_type | smallint | 子组件类型 | -| component_type | smallint | 组件类型 | -| comp_type | character varying | 组件类型标识 | -| play_status | smallint | 组件完成状态 | -| pass_time | integer | 通过时间 | -| interval_time | integer | 组件学习耗时(单位:毫秒) | -| read_word_count | integer | 读单词数量 | -| speak_count | integer | 开口次数 | -| listen_sentence_count | integer | 听句子数量 | -| write_word_count | integer | 写单词数量 | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| section_id | bigint | 关联章节id | -| component_unique_code | character varying | 组件唯一编码 | -| play_result | character varying | 组件完成结果(Perfect/Good/Oops等) | -| finish_question_count | integer | 完成题目数量 | -| skill_points | integer | 获得技能点数 | -| c_type | character varying | 组件类型 | -| c_id | character varying | 组件配置id | -| user_behavior_info | text | 用户行为信息 | -| behavior_analysis_info | text | 行为分析信息 | -| session_id | character varying | 会话id | -| listen_time | integer | 听力时长 | -| dialog_list | text | 对话列表 | -| level | character varying | 课程等级 | - -### bi_user_component_play_record_5 -【表用途注释:用户组件完成记录表(分表5),存储用户每个课时下每个组件的学习情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识,关联课时记录表chapter_unique_id | -| component_id | bigint | 组件id | -| sub_component_type | smallint | 子组件类型 | -| component_type | smallint | 组件类型 | -| comp_type | character varying | 组件类型标识 | -| play_status | smallint | 组件完成状态 | -| pass_time | integer | 通过时间 | -| interval_time | integer | 组件学习耗时(单位:毫秒) | -| read_word_count | integer | 读单词数量 | -| speak_count | integer | 开口次数 | -| listen_sentence_count | integer | 听句子数量 | -| write_word_count | integer | 写单词数量 | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| section_id | bigint | 关联章节id | -| component_unique_code | character varying | 组件唯一编码 | -| play_result | character varying | 组件完成结果(Perfect/Good/Oops等) | -| finish_question_count | integer | 完成题目数量 | -| skill_points | integer | 获得技能点数 | -| c_type | character varying | 组件类型 | -| c_id | character varying | 组件配置id | -| user_behavior_info | text | 用户行为信息 | -| behavior_analysis_info | text | 行为分析信息 | -| session_id | character varying | 会话id | -| listen_time | integer | 听力时长 | -| dialog_list | text | 对话列表 | -| level | character varying | 课程等级 | - -### bi_user_component_play_record_6 -【表用途注释:用户组件完成记录表(分表6),存储用户每个课时下每个组件的学习情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识,关联课时记录表chapter_unique_id | -| component_id | bigint | 组件id | -| sub_component_type | smallint | 子组件类型 | -| component_type | smallint | 组件类型 | -| comp_type | character varying | 组件类型标识 | -| play_status | smallint | 组件完成状态 | -| pass_time | integer | 通过时间 | -| interval_time | integer | 组件学习耗时(单位:毫秒) | -| read_word_count | integer | 读单词数量 | -| speak_count | integer | 开口次数 | -| listen_sentence_count | integer | 听句子数量 | -| write_word_count | integer | 写单词数量 | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| section_id | bigint | 关联章节id | -| component_unique_code | character varying | 组件唯一编码 | -| play_result | character varying | 组件完成结果(Perfect/Good/Oops等) | -| finish_question_count | integer | 完成题目数量 | -| skill_points | integer | 获得技能点数 | -| c_type | character varying | 组件类型 | -| c_id | character varying | 组件配置id | -| user_behavior_info | text | 用户行为信息 | -| behavior_analysis_info | text | 行为分析信息 | -| session_id | character varying | 会话id | -| listen_time | integer | 听力时长 | -| dialog_list | text | 对话列表 | -| level | character varying | 课程等级 | - -### bi_user_component_play_record_7 -【表用途注释:用户组件完成记录表(分表7),存储用户每个课时下每个组件的学习情况】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 角色id,关联用户角色表id | -| chapter_id | bigint | 课时id | -| chapter_unique_id | character varying | 课时唯一标识,关联课时记录表chapter_unique_id | -| component_id | bigint | 组件id | -| sub_component_type | smallint | 子组件类型 | -| component_type | smallint | 组件类型 | -| comp_type | character varying | 组件类型标识 | -| play_status | smallint | 组件完成状态 | -| pass_time | integer | 通过时间 | -| interval_time | integer | 组件学习耗时(单位:毫秒) | -| read_word_count | integer | 读单词数量 | -| speak_count | integer | 开口次数 | -| listen_sentence_count | integer | 听句子数量 | -| write_word_count | integer | 写单词数量 | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| section_id | bigint | 关联章节id | -| component_unique_code | character varying | 组件唯一编码 | -| play_result | character varying | 组件完成结果(Perfect/Good/Oops等) | -| finish_question_count | integer | 完成题目数量 | -| skill_points | integer | 获得技能点数 | -| c_type | character varying | 组件类型 | -| c_id | character varying | 组件配置id | -| user_behavior_info | text | 用户行为信息 | -| behavior_analysis_info | text | 行为分析信息 | -| session_id | character varying | 会话id | -| listen_time | integer | 听力时长 | -| dialog_list | text | 对话列表 | -| level | character varying | 课程等级 | - -### bi_user_course_detail -【表用途注释:用户课程明细表,存储用户购买的课程信息】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | integer | | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| account_id | integer | 关联用户账户表id | -| user_id | integer | 关联用户角色表id | -| course_level | text | 课程等级:A1=L1,A2=L2 | -| active_time | timestamp with time zone | 课程激活时间 | -| expire_time | timestamp with time zone | 课程过期时间:不为空=正式课,为空=体验课 | -| latest_unit_index | integer | 当前学习到的单元序号 | -| latest_lesson_index | integer | 当前学习到的课时序号 | -| learn_duration | integer | 累计学习时长(单位:秒) | -| last_learn_time | timestamp with time zone | 最近一次学习时间 | -| deleted_at | timestamp with time zone | 删除时间:为空=未删除,有值=已删除 | - -### bi_user_task_log -【表用途注释:用户任务完成日志表】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | integer | | -| user_id | integer | 关联用户角色表id | -| task_type | integer | 任务类型 | -| task_id | integer | 任务id | -| status | integer | 任务完成状态 | -| start_time | bigint | 任务开始时间(时间戳) | -| updated_at | timestamp with time zone | 更新时间 | -| created_at | timestamp with time zone | 创建时间 | - -### bi_user_unit_challenge_question_result -【表用途注释:用户单元挑战题结果表】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 关联用户角色表id | -| story_id | bigint | 关联故事id | -| score | integer | 挑战得分 | -| score_text | character varying | 得分等级(Perfect/Good/Oops等) | -| question_list | text | 题目列表 | -| sp_value | integer | 获得SP点数 | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间 | -| category | character varying | 挑战类型 | -| exp | integer | 获得经验值 | -| play_time | bigint | 挑战耗时 | -| read_word_count | bigint | 读单词数量 | -| listen_time | bigint | 听力时长 | -| speak_count | bigint | 开口次数 | -| level | character varying | 课程等级 | - -### bi_user_unit_review_question_result -【表用途注释:用户单元复习题结果表】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 关联用户角色表id | -| story_id | bigint | 关联故事id | -| chapter_id | bigint | 关联课时id | -| unique_id | character varying | 唯一标识 | -| score | integer | 复习题得分 | -| score_text | character varying | 得分等级 | -| question_list | text | 题目列表 | -| sp_value | integer | 获得SP点数 | -| exp | integer | 获得经验值 | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间 | -| play_time | bigint | 答题耗时 | -| level | character varying | 课程等级 | - -### bi_user_unit_summary_km_result -【表用途注释:用户单元知识点总结结果表】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | | -| user_id | bigint | 关联用户角色表id | -| story_id | bigint | 关联故事id | -| km_id | character varying | 知识点id | -| km_type | character varying | 知识点类型 | -| score_text | character varying | 知识点掌握等级 | -| sp_value | integer | 获得SP点数 | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间 | -| play_time | bigint | 学习耗时 | -| sum_explain | text | 知识点总结说明 | -| level | character varying | 课程等级 | - -### bi_vala_app_account -【表用途注释:用户账户表,存储用户的账号基本信息】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | 账户id,主键 | -| tel | character varying | 手机号(明文) | -| tel_encrypt | character varying | 手机号加密存储 | -| name | character varying | 用户姓名 | -| id_card | character varying | 身份证号 | -| status | smallint | 账户状态 | -| pay_status | integer | 付费状态 | -| login_times | integer | 累计登录次数 | -| remark | text | 备注 | -| key_from | character varying | 注册来源渠道 | -| created_at | timestamp with time zone | 创建时间 | -| updated_at | timestamp with time zone | 更新时间 | -| deleted_at | timestamp with time zone | 删除时间(为空表示未删除) | -| download_channel | text | 用户下载渠道(汉字格式,用于统计新增用户来源) | - -### bi_vala_app_character -【表用途注释:用户角色表,一个账户可以创建多个角色】 -| 字段名 | 数据类型 | 注释 | -|--------|----------|------| -| id | bigint | 角色id,主键 | -| account_id | bigint | 关联账户表id | -| nickname | character varying | 角色昵称 | -| gender | smallint | 角色性别:1=男,2=女 | -| birthday | character varying | 角色生日 | -| avatar | text | 角色头像 | -| spine_name | character varying | 角色 spine 资源名 | -| latest_login | timestamp with time zone | 最近登录 \ No newline at end of file diff --git a/generate_report.py b/generate_report.py deleted file mode 100644 index 0b41638..0000000 --- a/generate_report.py +++ /dev/null @@ -1,152 +0,0 @@ -import pandas as pd -from datetime import datetime - -# 1. 整体统计数据 -overall_data = [ - {"渠道": "学而思", "新增注册总人数": 615, "购课总人数":7, "购课总金额(元)":7794}, - {"渠道": "科大讯飞", "新增注册总人数": 377, "购课总人数":4, "购课总金额(元)":3796}, - {"渠道": "希沃", "新增注册总人数": 122, "购课总人数":1, "购课总金额(元)":599}, - {"渠道": "京东方", "新增注册总人数": 61, "购课总人数":1, "购课总金额(元)":599}, - {"渠道": "合计", "新增注册总人数": 1175, "购课总人数":13, "购课总金额(元)":12788}, -] -df_overall = pd.DataFrame(overall_data) - -# 2. 每日购课明细数据 -purchase_data = [ - {"日期": "2026-03-02", "渠道": "学而思", "购课人数":1, "购课金额(元)":599, "订单号": "zfb202603022031481772454708683943"}, - {"日期": "2026-03-07", "渠道": "学而思", "购课人数":1, "购课金额(元)":599, "订单号": "wx202603071022051772850125753228"}, - {"日期": "2026-03-07", "渠道": "科大讯飞", "购课人数":1, "购课金额(元)":599, "订单号": "wx202603072123501772889830225976"}, - {"日期": "2026-03-10", "渠道": "学而思", "购课人数":1, "购课金额(元)":1999, "订单号": "wx202603101820431773138043948181"}, - {"日期": "2026-03-15", "渠道": "科大讯飞", "购课人数":2, "购课金额(元)":2598, "订单号": "wx202603150854031773536043478685、wx20260315122747177354886748896"}, - {"日期": "2026-03-18", "渠道": "学而思", "购课人数":2, "购课金额(元)":2598, "订单号": "wx202603182055481773838548372991、zfb202603182118201773839900411837"}, - {"日期": "2026-03-23", "渠道": "科大讯飞", "购课人数":1, "购课金额(元)":599, "订单号": "wx202603232015081774268108032833"}, - {"日期": "2026-03-24", "渠道": "京东方", "购课人数":1, "购课金额(元)":599, "订单号": "zfb202603242026431774355203538499"}, - {"日期": "2026-03-27", "渠道": "学而思", "购课人数":1, "购课金额(元)":1999, "订单号": "wx202603271258341774587514141956"}, - {"日期": "2026-03-28", "渠道": "希沃", "购课人数":1, "购课金额(元)":599, "订单号": "wx20260328145038177468063894734"}, -] -df_purchase = pd.DataFrame(purchase_data) - -# 3. 每日新增注册数据 -register_data = [ - {"日期": "2026-03-01", "渠道": "京东方", "新增注册人数": 1}, - {"日期": "2026-03-01", "渠道": "学而思", "新增注册人数": 48}, - {"日期": "2026-03-01", "渠道": "希沃", "新增注册人数": 2}, - {"日期": "2026-03-02", "渠道": "京东方", "新增注册人数": 3}, - {"日期": "2026-03-02", "渠道": "学而思", "新增注册人数": 38}, - {"日期": "2026-03-02", "渠道": "希沃", "新增注册人数": 1}, - {"日期": "2026-03-03", "渠道": "学而思", "新增注册人数": 24}, - {"日期": "2026-03-03", "渠道": "希沃", "新增注册人数": 4}, - {"日期": "2026-03-04", "渠道": "京东方", "新增注册人数": 4}, - {"日期": "2026-03-04", "渠道": "学而思", "新增注册人数": 20}, - {"日期": "2026-03-04", "渠道": "希沃", "新增注册人数": 10}, - {"日期": "2026-03-04", "渠道": "科大讯飞", "新增注册人数": 3}, - {"日期": "2026-03-05", "渠道": "京东方", "新增注册人数": 7}, - {"日期": "2026-03-05", "渠道": "学而思", "新增注册人数": 37}, - {"日期": "2026-03-05", "渠道": "希沃", "新增注册人数": 15}, - {"日期": "2026-03-05", "渠道": "科大讯飞", "新增注册人数": 17}, - {"日期": "2026-03-06", "渠道": "京东方", "新增注册人数": 6}, - {"日期": "2026-03-06", "渠道": "学而思", "新增注册人数": 26}, - {"日期": "2026-03-06", "渠道": "希沃", "新增注册人数": 9}, - {"日期": "2026-03-06", "渠道": "科大讯飞", "新增注册人数": 12}, - {"日期": "2026-03-07", "渠道": "京东方", "新增注册人数": 5}, - {"日期": "2026-03-07", "渠道": "学而思", "新增注册人数": 35}, - {"日期": "2026-03-07", "渠道": "希沃", "新增注册人数": 5}, - {"日期": "2026-03-07", "渠道": "科大讯飞", "新增注册人数": 34}, - {"日期": "2026-03-08", "渠道": "京东方", "新增注册人数": 3}, - {"日期": "2026-03-08", "渠道": "学而思", "新增注册人数": 33}, - {"日期": "2026-03-08", "渠道": "希沃", "新增注册人数": 12}, - {"日期": "2026-03-08", "渠道": "科大讯飞", "新增注册人数": 34}, - {"日期": "2026-03-09", "渠道": "京东方", "新增注册人数": 2}, - {"日期": "2026-03-09", "渠道": "学而思", "新增注册人数": 27}, - {"日期": "2026-03-09", "渠道": "希沃", "新增注册人数": 5}, - {"日期": "2026-03-09", "渠道": "科大讯飞", "新增注册人数": 15}, - {"日期": "2026-03-10", "渠道": "学而思", "新增注册人数": 15}, - {"日期": "2026-03-10", "渠道": "希沃", "新增注册人数": 3}, - {"日期": "2026-03-10", "渠道": "科大讯飞", "新增注册人数": 9}, - {"日期": "2026-03-11", "渠道": "京东方", "新增注册人数": 2}, - {"日期": "2026-03-11", "渠道": "学而思", "新增注册人数": 25}, - {"日期": "2026-03-11", "渠道": "希沃", "新增注册人数": 2}, - {"日期": "2026-03-11", "渠道": "科大讯飞", "新增注册人数": 13}, - {"日期": "2026-03-12", "渠道": "京东方", "新增注册人数": 5}, - {"日期": "2026-03-12", "渠道": "学而思", "新增注册人数": 24}, - {"日期": "2026-03-12", "渠道": "希沃", "新增注册人数": 5}, - {"日期": "2026-03-12", "渠道": "科大讯飞", "新增注册人数": 15}, - {"日期": "2026-03-13", "渠道": "京东方", "新增注册人数": 2}, - {"日期": "2026-03-13", "渠道": "学而思", "新增注册人数": 31}, - {"日期": "2026-03-13", "渠道": "希沃", "新增注册人数": 7}, - {"日期": "2026-03-13", "渠道": "科大讯飞", "新增注册人数": 8}, - {"日期": "2026-03-14", "渠道": "学而思", "新增注册人数": 30}, - {"日期": "2026-03-14", "渠道": "希沃", "新增注册人数": 3}, - {"日期": "2026-03-14", "渠道": "科大讯飞", "新增注册人数": 22}, - {"日期": "2026-03-15", "渠道": "京东方", "新增注册人数": 1}, - {"日期": "2026-03-15", "渠道": "学而思", "新增注册人数": 22}, - {"日期": "2026-03-15", "渠道": "希沃", "新增注册人数": 3}, - {"日期": "2026-03-15", "渠道": "科大讯飞", "新增注册人数": 22}, - {"日期": "2026-03-16", "渠道": "京东方", "新增注册人数": 2}, - {"日期": "2026-03-16", "渠道": "学而思", "新增注册人数": 6}, - {"日期": "2026-03-16", "渠道": "希沃", "新增注册人数": 2}, - {"日期": "2026-03-16", "渠道": "科大讯飞", "新增注册人数": 10}, - {"日期": "2026-03-17", "渠道": "京东方", "新增注册人数": 3}, - {"日期": "2026-03-17", "渠道": "学而思", "新增注册人数": 12}, - {"日期": "2026-03-17", "渠道": "希沃", "新增注册人数": 3}, - {"日期": "2026-03-17", "渠道": "科大讯飞", "新增注册人数": 6}, - {"日期": "2026-03-18", "渠道": "京东方", "新增注册人数": 1}, - {"日期": "2026-03-18", "渠道": "学而思", "新增注册人数": 9}, - {"日期": "2026-03-18", "渠道": "科大讯飞", "新增注册人数": 11}, - {"日期": "2026-03-19", "渠道": "京东方", "新增注册人数": 1}, - {"日期": "2026-03-19", "渠道": "学而思", "新增注册人数": 6}, - {"日期": "2026-03-19", "渠道": "希沃", "新增注册人数": 2}, - {"日期": "2026-03-19", "渠道": "科大讯飞", "新增注册人数": 9}, - {"日期": "2026-03-20", "渠道": "京东方", "新增注册人数": 1}, - {"日期": "2026-03-20", "渠道": "学而思", "新增注册人数": 13}, - {"日期": "2026-03-20", "渠道": "希沃", "新增注册人数": 2}, - {"日期": "2026-03-20", "渠道": "科大讯飞", "新增注册人数": 12}, - {"日期": "2026-03-21", "渠道": "京东方", "新增注册人数": 1}, - {"日期": "2026-03-21", "渠道": "学而思", "新增注册人数": 27}, - {"日期": "2026-03-21", "渠道": "科大讯飞", "新增注册人数": 26}, - {"日期": "2026-03-22", "渠道": "学而思", "新增注册人数": 12}, - {"日期": "2026-03-22", "渠道": "希沃", "新增注册人数": 4}, - {"日期": "2026-03-22", "渠道": "科大讯飞", "新增注册人数": 22}, - {"日期": "2026-03-23", "渠道": "京东方", "新增注册人数": 1}, - {"日期": "2026-03-23", "渠道": "学而思", "新增注册人数": 9}, - {"日期": "2026-03-23", "渠道": "希沃", "新增注册人数": 2}, - {"日期": "2026-03-23", "渠道": "科大讯飞", "新增注册人数": 5}, - {"日期": "2026-03-24", "渠道": "学而思", "新增注册人数": 4}, - {"日期": "2026-03-24", "渠道": "希沃", "新增注册人数": 2}, - {"日期": "2026-03-24", "渠道": "科大讯飞", "新增注册人数": 8}, - {"日期": "2026-03-25", "渠道": "京东方", "新增注册人数": 1}, - {"日期": "2026-03-25", "渠道": "学而思", "新增注册人数": 12}, - {"日期": "2026-03-25", "渠道": "希沃", "新增注册人数": 5}, - {"日期": "2026-03-25", "渠道": "科大讯飞", "新增注册人数": 13}, - {"日期": "2026-03-26", "渠道": "京东方", "新增注册人数": 1}, - {"日期": "2026-03-26", "渠道": "学而思", "新增注册人数": 8}, - {"日期": "2026-03-26", "渠道": "希沃", "新增注册人数": 2}, - {"日期": "2026-03-26", "渠道": "科大讯飞", "新增注册人数": 8}, - {"日期": "2026-03-27", "渠道": "学而思", "新增注册人数": 9}, - {"日期": "2026-03-27", "渠道": "希沃", "新增注册人数": 6}, - {"日期": "2026-03-27", "渠道": "科大讯飞", "新增注册人数": 6}, - {"日期": "2026-03-28", "渠道": "京东方", "新增注册人数": 2}, - {"日期": "2026-03-28", "渠道": "学而思", "新增注册人数": 20}, - {"日期": "2026-03-28", "渠道": "希沃", "新增注册人数": 4}, - {"日期": "2026-03-28", "渠道": "科大讯飞", "新增注册人数": 12}, - {"日期": "2026-03-29", "渠道": "京东方", "新增注册人数": 2}, - {"日期": "2026-03-29", "渠道": "学而思", "新增注册人数": 16}, - {"日期": "2026-03-29", "渠道": "科大讯飞", "新增注册人数": 9}, - {"日期": "2026-03-30", "渠道": "京东方", "新增注册人数": 2}, - {"日期": "2026-03-30", "渠道": "学而思", "新增注册人数": 7}, - {"日期": "2026-03-30", "渠道": "希沃", "新增注册人数": 2}, - {"日期": "2026-03-30", "渠道": "科大讯飞", "新增注册人数": 6}, - {"日期": "2026-03-31", "渠道": "京东方", "新增注册人数": 3}, - {"日期": "2026-03-31", "渠道": "学而思", "新增注册人数": 10}, - {"日期": "2026-03-31", "渠道": "科大讯飞", "新增注册人数": 10}, -] -df_register = pd.DataFrame(register_data) - -# 生成Excel文件 -output_path = "/root/.openclaw/workspace/2026年3月硬件渠道数据汇总.xlsx" -with pd.ExcelWriter(output_path, engine='openpyxl') as writer: - df_overall.to_excel(writer, sheet_name='整体统计', index=False) - df_purchase.to_excel(writer, sheet_name='每日购课明细', index=False) - df_register.to_excel(writer, sheet_name='每日新增注册明细', index=False) - -print(f"文件已生成:{output_path}") diff --git a/git_repos/llm_offline_production b/git_repos/llm_offline_production deleted file mode 160000 index 75ab13e..0000000 --- a/git_repos/llm_offline_production +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 75ab13e87dd0e856cb05c9515efcd507888b6486 diff --git a/makee_vala/README.md b/makee_vala/README.md deleted file mode 100644 index 2bee9b4..0000000 --- a/makee_vala/README.md +++ /dev/null @@ -1,30 +0,0 @@ -# 业务知识库 - -作为数据分析师,持续积累对公司业务和数据表的理解。 - -## 目录结构 - -- `sql_queries/` - 常用 SQL 查询语句和业务分析模板 -- `tables/` - 数据表结构和字段说明 -- `business_terms/` - 业务术语和指标定义 - -## 资料来源 - -1. 飞书 Wiki - 增长组常用查询SQL: https://makee-interactive.feishu.cn/wiki/XJuCwNol1iL3sYkXkXWc2QnJnMd -2. Git 仓库 - 数据抽取脚本: https://git.valavala.com/vala/llm_offline_production/src/branch/master/config_user_data_extract_and_analyze - -## 收集的 SQL 查询文档 - -- [ ] 全字段大表 -- [ ] 平均通关时长 -- [ ] 新增注册用户数by渠道 -- [ ] 课程进入完成率 -- [ ] 账号角色年龄地址 -- [ ] 退费率 -- [ ] 销转学习进度 -- [ ] 班主任关注数据 -- [ ] 端内GMV -- [ ] 端内用户课程进入完成率 -- [ ] 端内购课用户学习行为 -- [ ] 转化率 -- [ ] 课程ID映射 diff --git a/makee_vala/business_terms.md b/makee_vala/business_terms.md deleted file mode 100644 index e86f0ce..0000000 --- a/makee_vala/business_terms.md +++ /dev/null @@ -1,49 +0,0 @@ -# 业务术语表 - -## 核心业务指标 - -### 用户相关 -- **注册用户**: 在 `bi_vala_app_account` 表中 `status = 1` 且 `deleted_at is NULL` 的用户 -- **测试用户**: 需要排除的特定用户 ID,如 `id not in (51,2121)` -- **下载渠道 (download_channel)**: 用户下载 App 的渠道 -- **key_from**: 注册或购课的来源标识 - -### 购课相关 -- **购课渠道 (sale_channel)**: 用户购买课程的渠道,有数字编码映射到具体渠道名称 -- **有效订单**: `order_status = 3` 且 `pay_amount_int > 49800` 的订单(金额大于498元) -- **购课标签**: 分为"未购课"、"站外购课"、"站内购课" -- **站内购课**: 购课渠道不是"站外"的购课 - -### 角色相关 -- **角色付费状态 (characer_pay_status)**: 0表示未付费,1表示已付费 -- **性别 (gender)**: 0=girl, 1=boy, 其他=unknow -- **赛季包 (purchase_season_package)**: `'[1]'` 表示未购买赛季包 - -### 课程相关 -- **完课标识 (chapter_unique_id)**: 唯一标识一次完课记录 -- **完课耗时 (finish_time)**: 完成课程所花费的时间,格式为 mm:ss -- **课程ID (course_id)**: 由 course_level-course_season-course_unit-course_lesson 组成 -- **play_status = 1**: 表示播放完成状态 - -## 购课渠道映射表 - -| 编码 | 渠道名称 | -|------|----------| -| 11 | 苹果 | -| 12 | 华为 | -| 13 | 小米 | -| 14 | 荣耀 | -| 15 | 应用宝 | -| 17 | 魅族 | -| 18 | VIVO | -| 19 | OPPO | -| 21 | 学而思 | -| 22 | 讯飞 | -| 23 | 步步高 | -| 24 | 作业帮 | -| 25 | 小度 | -| 26 | 希沃 | -| 27 | 京东方 | -| 41 | 官网 | -| 71 | 小程序 | -| 其他 | 站外 | diff --git a/makee_vala/data_tables.md b/makee_vala/data_tables.md deleted file mode 100644 index ee28241..0000000 --- a/makee_vala/data_tables.md +++ /dev/null @@ -1,168 +0,0 @@ -# 数据表说明 - -## 核心业务表 - -### 用户账号表 -**表名**: `bi_vala_app_account` - -**关键字段**: -- `id`: 用户ID -- `key_from`: 注册来源 -- `created_at`: 注册时间 -- `download_channel`: 下载渠道 -- `status`: 账号状态(1表示有效) -- `deleted_at`: 删除时间(NULL表示未删除) - -**常用筛选条件**: -```sql -where status = 1 - and id not in (51,2121) -- 排除测试用户 - and deleted_at is NULL -``` - ---- - -### 账号详情表 -**表名**: `account_detail_info` - -**关键字段**: -- `account_id`: 账号ID(关联 bi_vala_app_account.id) -- `login_address`: 登录地址(格式如"省份-城市") -- `phone_login_times`: 手机登录次数 - -**业务逻辑**: -```sql --- 提取城市 -split_part(login_address,'-',2) as login_address - --- 判断是否手机登录 -case when phone_login_times = 0 then 0 else 1 end as phone_login -``` - ---- - -### 订单表 -**表名**: `bi_vala_order` - -**关键字段**: -- `account_id`: 账号ID -- `sale_channel`: 购课渠道(数字编码) -- `key_from`: 购课来源 -- `pay_success_date`: 支付成功时间 -- `pay_amount`: 支付金额 -- `pay_amount_int`: 支付金额(整数分) -- `order_status`: 订单状态(3表示有效订单) - -**常用筛选条件**: -```sql -where order_status = 3 - and pay_amount_int > 49800 -- 金额大于498元 -``` - ---- - -### 角色表 -**表名**: `bi_vala_app_character` - -**关键字段**: -- `id`: 角色ID -- `account_id`: 账号ID -- `gender`: 性别(0=girl, 1=boy) -- `birthday`: 生日(格式如"YYYY-MM-DD") -- `purchase_season_package`: 赛季包购买状态 -- `deleted_at`: 删除时间 - -**业务逻辑**: -```sql --- 角色付费状态 -case when purchase_season_package = '[1]' then 0 else 1 end as characer_pay_status - --- 性别映射 -case when gender = 0 then 'girl' - when gender = 1 then 'boy' - else 'unknow' -end as gender - --- 提取出生年份 -case when split_part(birthday,'-',1) = '' then '0000' - else split_part(birthday,'-',1) -end as birthday -``` - ---- - -## 课程播放记录表(分表) - -### 用户章节播放记录 -**表名**: `bi_user_chapter_play_record_0` ~ `bi_user_chapter_play_record_7` - -**说明**: 按分表存储,共8张表,需要使用 UNION ALL 合并 - -**关键字段**: -- `user_id`: 用户ID -- `chapter_id`: 章节ID -- `chapter_unique_id`: 完课唯一标识 -- `updated_at`: 更新时间 -- `play_status`: 播放状态(1表示完成) - -**常用筛选条件**: -```sql -where chapter_id in (55,56,57,58,59) -- 指定章节 - and play_status = 1 -- 播放完成 -``` - ---- - -### 用户组件播放记录 -**表名**: `bi_user_component_play_record_0` ~ `bi_user_component_play_record_7` - -**说明**: 按分表存储,共8张表,需要使用 UNION ALL 合并 - -**关键字段**: -- `chapter_unique_id`: 完课唯一标识 -- `interval_time`: 播放时长(毫秒) - -**业务逻辑**: -```sql --- 计算完课耗时(mm:ss格式) -format('%s:%s', - floor(sum(interval_time)/1000/60), - mod((sum(interval_time)/1000),60) -) as finish_time -``` - ---- - -## 课程信息表 - -### 课程单元表 -**表名**: `bi_level_unit_lesson` - -**关键字段**: -- `id`: ID(关联 chapter_id) -- `course_level`: 课程级别 -- `course_season`: 课程赛季 -- `course_unit`: 课程单元 -- `course_lesson`: 课程课时 - -**业务逻辑**: -```sql --- 生成课程ID -format('%s-%s-%s-%s', - course_level, - course_season, - course_unit, - course_lesson -) as course_id -``` - ---- - -## 其他表 - -### 账号登录表 -**表名**: `account_login` - -**关键字段**: -- `account_id`: 账号ID -- `login_date`: 登录日期 diff --git a/makee_vala/feishu_format_rules.md b/makee_vala/feishu_format_rules.md deleted file mode 100644 index fb1a2b9..0000000 --- a/makee_vala/feishu_format_rules.md +++ /dev/null @@ -1,53 +0,0 @@ -# 飞书文档排版规则 - -## 飞书文档块类型 - -根据观察,飞书文档的块类型: - -| block_type | 说明 | -|-----------|------| -| 1 | Page(页面)| -| 2 | Text(文本块)| -| 3 | Heading1(一级标题)| -| 4 | Heading2(二级标题)| -| 5 | Heading3(三级标题)| -| 6 | Bulleted List(无序列表)| -| 7 | Numbered List(有序列表)| -| 8 | To-do(待办事项)| -| 9 | Quote(引用)| -| 10 | Code(代码块)| -| 11 | Divider(分隔线)| -| 34 | Quote Container(引用容器)| - -## 排版最佳实践 - -### 1. 标题层级 -- 使用 Heading2/Heading3 来组织内容结构 -- 避免太多层级,保持清晰 - -### 2. 列表使用 -- 无序列表(type 6)用于列举项目 -- 有序列表(type 7)用于步骤说明 - -### 3. 分隔线 -- 使用 Divider(type 11)来分隔大的内容区块 - -### 4. 引用 -- 使用 Quote(type 9)或 Quote Container(type 34)来强调重要内容 - -### 5. 文本格式 -- 善用加粗、斜体等文本样式 -- 保持整体排版简洁美观 - -## 更新飞书文档的注意事项 - -⚠️ **重要:不要直接用 write 覆盖整个文档!** - -**推荐做法:** -1. 先用 list_blocks 查看当前文档结构 -2. 用 update_block 逐个更新需要修改的块 -3. 或者如果必须重写,要确保保持原来的块结构和格式 - -**避免:** -- ❌ 直接用 write 方法覆盖整个文档(会丢失所有格式) -- ❌ 把所有内容都放在一个 Text 块里 diff --git a/makee_vala/fetch_wiki_docs.py b/makee_vala/fetch_wiki_docs.py deleted file mode 100644 index ea7f70f..0000000 --- a/makee_vala/fetch_wiki_docs.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python3 -""" -批量读取飞书 Wiki 文档并保存到本地知识库 -""" - -import json -import os -from datetime import datetime - -# Wiki 子页面列表 -wiki_pages = [ - {"node_token": "O7QvwdY8piO8aUkhxYecA1qZnBe", "title": "全字段大表", "obj_token": "VVyWd5491o6tuqxceCVci6dVnFd"}, - {"node_token": "Y6Iywqf75iepbUkvJzLcfiUYnkg", "title": "平均通关时长", "obj_token": "EpP7d6h2SoaTyJx1lZRcXXdLnVe"}, - {"node_token": "KQihwMjO9i1zjFkqTgBcq67Snzc", "title": "新增注册用户数by渠道", "obj_token": "AzRPddp97o7To8x8VkxcFGr8nBh"}, - {"node_token": "Zt7RwfGLWiacslkO2glcheWjnwf", "title": "课程进入完成率", "obj_token": "PwIydfZcHo5eZgxi8XLcOtjOnSb"}, - {"node_token": "LTaiw3OmUi2pcckDWuNcyBIVnAd", "title": "账号角色年龄地址", "obj_token": "CUa2du2sSoNFSRxl3vFc8ucInEm"}, - {"node_token": "ZAPJwIODRiNYE5kTuNtcpSlvnIX", "title": "退费率", "obj_token": "DC1Qdhpitowt9lxxo1acEzOwnFc"}, - {"node_token": "Cb3KwPWLriG7GgkN73pcM0Idnch", "title": "销转学习进度", "obj_token": "G1p9dhK63oLWMzxyGQ8csZGMnDh"}, - {"node_token": "EBEiwQsw2iOtgekDldHcQxgwnOh", "title": "班主任关注数据", "obj_token": "NcVqdRKtrowglNxs9CocDekunje"}, - {"node_token": "BZPkwARxiixUZRk4BW9cij50nDe", "title": "端内GMV", "obj_token": "FkVCd1AruoD9xWxxVpzc16hinVh"}, - {"node_token": "AQpnwpsfOixYGtk4jf0c6t9XncG", "title": "端内用户课程进入完成率", "obj_token": "Ueu7dtgSHoNYfsxCDHmcY6E4nid"}, - {"node_token": "PyqEwXXqsiQybPkpGbscUjUFnOg", "title": "端内购课用户学习行为", "obj_token": "ZTxod4IUWo5yMexf8AHcBbpFnMg"}, - {"node_token": "OyXlwY2vyisvV1kc3HhcMyMVnTd", "title": "转化率", "obj_token": "ATJ0dfajQo5CSexQd8hc9i3pnWe"}, - {"node_token": "MWpZwV01fitaKjkCRSxckMUunRb", "title": "课程ID映射", "obj_token": "GenUdsXCloUdYhxMvxqcWBMdnhb"} -] - -def safe_filename(title): - """生成安全的文件名""" - return "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).rstrip().replace(' ', '_') - -def main(): - print("="*60) - print("飞书 Wiki 文档批量获取") - print("="*60) - - output_dir = "sql_queries" - os.makedirs(output_dir, exist_ok=True) - - print(f"\n共 {len(wiki_pages)} 个文档需要获取") - print(f"输出目录: {output_dir}") - - # 创建索引文件 - index_content = "# SQL 查询文档索引\n\n" - index_content += f"创建时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n" - index_content += "## 文档列表\n\n" - - for i, page in enumerate(wiki_pages, 1): - filename = safe_filename(page['title']) + ".md" - filepath = os.path.join(output_dir, filename) - - print(f"\n[{i}/{len(wiki_pages)}] 处理: {page['title']}") - print(f" 文件: {filepath}") - - # 创建占位文件 - with open(filepath, 'w', encoding='utf-8') as f: - f.write(f"# {page['title']}\n\n") - f.write(f"**获取时间:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") - f.write(f"**飞书文档 Token:** {page['obj_token']}\n\n") - f.write(f"**注意:** 此文档需要通过 feishu_doc 工具读取完整内容\n\n") - f.write("---\n\n") - f.write("## 使用说明\n\n") - f.write("使用以下命令读取完整文档内容:\n\n") - f.write("```bash\n") - f.write(f"feishu_doc read {page['obj_token']}\n") - f.write("```\n") - - # 更新索引 - index_content += f"- [{page['title']}]({filename})\n" - - print(f" ✅ 已创建占位文件") - - # 写入索引文件 - with open(os.path.join(output_dir, "README.md"), 'w', encoding='utf-8') as f: - f.write(index_content) - - print("\n" + "="*60) - print("✅ 初始化完成") - print("="*60) - print("\n下一步: 使用 feishu_doc 工具逐个读取文档内容") - print("或者让我继续为你读取这些文档的完整内容") - -if __name__ == "__main__": - main() diff --git a/makee_vala/git_scripts/CLAUDE.md b/makee_vala/git_scripts/CLAUDE.md deleted file mode 100644 index 7fbbbf5..0000000 --- a/makee_vala/git_scripts/CLAUDE.md +++ /dev/null @@ -1,70 +0,0 @@ -# 项目说明 - -## 项目概述 -用户数据提取和分析工具集,用于从各种数据源(ES、数据库等)导出和分析用户数据。 - -## 脚本列表 - -### export_realtime_asr.py -**功能**: 导出流式语音 ASR 数据 - -**版本**: v1.0 - -**数据源**: -- Elasticsearch 索引: `llm_realtime_asr_log` - -**配置说明**: -- 在脚本开头配置开始和结束日期(8位数字格式,如 20260101) -- ES 连接信息通过环境变量配置(需要创建 .env 文件) - -**依赖包**: -``` -elasticsearch -pandas -openpyxl -python-dotenv -``` - -**运行方式**: -```bash -python export_realtime_asr.py -``` - -**输出**: -- 输出目录: `output/` -- 文件命名: `realtime_asr_export_{开始日期}_{结束日期}.xlsx` -- Excel 列: voice_id, asr_prompt, result_str, timestamp, audio_url, source - -**数据处理逻辑**: -- 从 ES 使用 scroll API 分批读取数据(每批1000条) -- 按 voice_id 聚合,仅保留恰好有2条记录的 voice_id -- 取两条记录中最新的 timestamp -- 自动拼接 audio_url - -**特点**: -- 支持大数据量处理(几十万级别) -- 实时进度显示 -- 自动过滤异常数据(非2条记录的 voice_id) - ---- - -### 其他脚本 -- `export_user_id_data.py`: 用户ID数据导出 -- `batch_add_shengtong_result.py`: 批量添加声通评测结果 -- `shengtong_eval.py`: 声通评测 -- `calc_score_diff_stats.py`: 分数差异统计 -- `export_unit_summary.py`: 单元总结统计导出 - -## 环境配置 - -需要创建 `.env` 文件,包含以下配置: -``` -ES_HOST=xxx -ES_PORT=9200 -ES_SCHEME=https -ES_USER=elastic -ES_PASSWORD=xxx -``` - -## 最近更新 -- 2026-01-27: 新增 export_realtime_asr.py 脚本,支持流式语音 ASR 数据导出 diff --git a/makee_vala/git_scripts/batch_add_shengtong_result.py b/makee_vala/git_scripts/batch_add_shengtong_result.py deleted file mode 100644 index 8db5962..0000000 --- a/makee_vala/git_scripts/batch_add_shengtong_result.py +++ /dev/null @@ -1,853 +0,0 @@ -""" -声通语音评测批量处理工具 - -功能说明: -- 读取 Excel 文件,其中包含音频链接(userAudio 字段)和参考文本(refText 字段) -- 调用声通 API 对音频进行评测,获取总分、明细和recordId -- 在原 Excel 中添加"测试总分"、"测试明细"和"测试recordId"三个字段 -- 输出文件命名为: {原文件名}_add_shengtong_result.xlsx -- 支持串行和并发两种处理模式 - -环境变量配置: -- ST_APP_KEY: 声通应用 Key -- ST_SECRET_KEY: 声通 Secret Key - -声通API文档: http://api.stkouyu.com -""" - -import pandas as pd -import os -import requests -import tempfile -from pathlib import Path -import json -import time -import hashlib -import uuid -from concurrent.futures import ThreadPoolExecutor, as_completed -import threading -from queue import Queue -import logging - -# 配置日志 -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s', - handlers=[ - logging.FileHandler('shengtong_batch_processing.log'), - logging.StreamHandler() - ] -) - -# 从 .env 文件加载环境变量 -from dotenv import load_dotenv -load_dotenv() - -# ==================== 全局配置 ==================== -# DEBUG 模式开关(控制详细日志输出) -DEBUG_MODE = False - - -def debug_print(message): - """ - DEBUG 信息输出函数 - - Args: - message (str): 要输出的调试信息 - """ - if DEBUG_MODE: - print(f"[DEBUG] {message}") - - -# ==================== 声通 API 相关代码 ==================== - -class ShengtongEvaluator: - """声通口语评测 API 封装类""" - - def __init__(self): - """从环境变量读取 API 配置""" - self.app_key = os.environ.get('ST_APP_KEY', '') - self.secret_key = os.environ.get('ST_SECRET_KEY', '') - self.api_url = "http://api.stkouyu.com:8080/sent.eval" - - # 检查环境变量是否配置 - if not all([self.app_key, self.secret_key]): - raise ValueError( - "请配置声通 API 环境变量: ST_APP_KEY, ST_SECRET_KEY" - ) - - def _generate_signature(self, data: str) -> str: - """生成SHA1签名""" - return hashlib.sha1(data.encode('utf-8')).hexdigest() - - def _build_request_params(self, ref_text: str, audio_ext: str) -> dict: - """构建请求参数""" - timestamp = str(int(time.time())) - user_id = str(uuid.uuid4()) - - # 生成签名 - connect_data = self.app_key + timestamp + self.secret_key - start_data = self.app_key + timestamp + user_id + self.secret_key - connect_sig = self._generate_signature(connect_data) - start_sig = self._generate_signature(start_data) - - # 构建请求参数 - params = { - "connect": { - "cmd": "connect", - "param": { - "sdk": { - "version": 16777472, - "source": 9, - "protocol": 2 - }, - "app": { - "applicationId": self.app_key, - "sig": connect_sig, - "timestamp": timestamp - } - } - }, - "start": { - "cmd": "start", - "param": { - "app": { - "applicationId": self.app_key, - "sig": start_sig, - "timestamp": timestamp, - "userId": user_id - }, - "audio": { - "audioType": audio_ext, - "channel": 1, - "sampleBytes": 2, - "sampleRate": 16000 - }, - "request": { - "coreType": "sent.eval", - "refText": ref_text, - "tokenId": "makee", - } - } - } - } - - return params - - def evaluate(self, audio_file_path: str, ref_text: str) -> dict: - """ - 调用声通API进行口语评测 - - Args: - audio_file_path (str): 音频文件路径 - ref_text (str): 参考文本 - - Returns: - dict: 评测结果 - """ - debug_print(f"开始评测音频文件: {audio_file_path}") - debug_print(f"评测文本: {ref_text}") - - # 检查音频文件是否存在 - if not os.path.exists(audio_file_path): - error_msg = f"音频文件不存在: {audio_file_path}" - logging.error(error_msg) - return {"error": error_msg} - - # 获取音频文件扩展名 - audio_ext = os.path.splitext(audio_file_path)[1][1:] # 去掉点号 - if not audio_ext: - audio_ext = "wav" # 默认为wav - - # 构建请求参数 - params = self._build_request_params(ref_text, audio_ext) - - # 读取音频文件 - try: - with open(audio_file_path, 'rb') as f: - audio_data = f.read() - - # 构建multipart/form-data请求 - files = { - 'text': (None, json.dumps(params)), - 'audio': (f"{int(time.time() * 1000000)}.{audio_ext}", audio_data) - } - - headers = { - 'Request-Index': '0' - } - - debug_print("开始发送请求到声通API...") - response = requests.post( - self.api_url, - files=files, - headers=headers, - timeout=30 - ) - - if response.status_code == 200: - result = response.json() - debug_print("声通API返回成功") - return result - else: - error_msg = f"请求失败,状态码: {response.status_code}" - logging.error(f"{error_msg}, 响应: {response.text}") - return { - "error": error_msg, - "response": response.text - } - - except requests.exceptions.RequestException as e: - error_msg = f"请求异常: {str(e)}" - logging.error(error_msg) - return {"error": error_msg} - except Exception as e: - error_msg = f"评测过程出错: {str(e)}" - logging.error(error_msg) - return {"error": error_msg} - - -def evaluate_audio_file(audio_file_path, text="nice to meet you."): - """ - 简化的音频评测函数 - - Args: - audio_file_path (str): 音频文件路径 - text (str): 评测文本内容 - - Returns: - dict: 评测结果JSON - """ - api = ShengtongEvaluator() - return api.evaluate(audio_file_path, text) - - -# ==================== 批量处理相关代码 ==================== - -def download_audio_file(audio_url, temp_dir, max_retries=3, timeout=30): - """ - 下载音频文件到临时目录(增强版本) - - Args: - audio_url (str): 音频文件URL - temp_dir (str): 临时目录路径 - max_retries (int): 最大重试次数 - timeout (int): 请求超时时间(秒) - - Returns: - str: 下载的音频文件路径,失败返回None - """ - if not audio_url or pd.isna(audio_url): - logging.warning("音频URL为空或无效") - return None - - # 从URL中提取文件名 - try: - file_name = os.path.basename(audio_url.split('?')[0]) # 去除URL参数 - if not file_name or '.' not in file_name: - file_name = f"audio_{hash(audio_url) % 100000}.wav" # 生成默认文件名 - - file_path = os.path.join(temp_dir, file_name) - - # 重试机制 - for attempt in range(max_retries): - try: - logging.info(f"正在下载音频文件 (尝试 {attempt + 1}/{max_retries}): {audio_url}") - - # 设置请求头,模拟浏览器 - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - } - - response = requests.get(audio_url, timeout=timeout, headers=headers, stream=True) - response.raise_for_status() - - # 检查内容类型 - content_type = response.headers.get('content-type', '') - if not any(audio_type in content_type.lower() for audio_type in ['audio', 'wav', 'mp3', 'ogg', 'flac']): - logging.warning(f"可能不是音频文件,Content-Type: {content_type}") - - # 写入文件 - with open(file_path, 'wb') as f: - for chunk in response.iter_content(chunk_size=8192): - if chunk: - f.write(chunk) - - # 验证文件大小 - file_size = os.path.getsize(file_path) - if file_size == 0: - raise ValueError("下载的文件为空") - - logging.info(f"音频文件下载成功: {file_path} (大小: {file_size} bytes)") - return file_path - - except requests.exceptions.Timeout: - logging.warning(f"下载超时 (尝试 {attempt + 1}/{max_retries}): {audio_url}") - if attempt < max_retries - 1: - time.sleep(2 ** attempt) # 指数退避 - continue - except requests.exceptions.RequestException as e: - logging.warning(f"下载请求异常 (尝试 {attempt + 1}/{max_retries}): {str(e)}") - if attempt < max_retries - 1: - time.sleep(2 ** attempt) - continue - except Exception as e: - logging.error(f"下载过程中发生未知错误 (尝试 {attempt + 1}/{max_retries}): {str(e)}") - if attempt < max_retries - 1: - time.sleep(2 ** attempt) - continue - - logging.error(f"音频文件下载失败,已达到最大重试次数: {audio_url}") - return None - - except Exception as e: - logging.error(f"下载音频文件时发生异常: {str(e)}") - return None - - -def format_shengtong_details(shengtong_result): - """ - 格式化声通评测结果为明细字符串 - - Args: - shengtong_result (dict): 声通API返回的结果 - - Returns: - str: 格式化的明细字符串 - """ - if not shengtong_result or 'error' in shengtong_result: - return "" - - try: - # 从result字段中获取words数组 - result = shengtong_result.get('result', {}) - words = result.get('words', []) - - if not words: - return "" - - details = [] - for word in words: - # 获取单词内容和得分 - word_text = word.get('word', '') - scores = word.get('scores', {}) - overall_score = scores.get('overall', 0) - - # 格式化为 "单词 分数" - details.append(f"{word_text} {int(overall_score)}") - - return "\n".join(details) - - except Exception as e: - logging.error(f"格式化声通明细失败: {str(e)}") - return "" - - -def get_shengtong_total_score(shengtong_result): - """ - 获取声通评测总分 - - Args: - shengtong_result (dict): 声通API返回的结果 - - Returns: - int: 总分,失败返回0 - """ - if not shengtong_result or 'error' in shengtong_result: - return 0 - - try: - result = shengtong_result.get('result', {}) - overall_score = result.get('overall', 0) - return int(overall_score) - except Exception as e: - logging.error(f"获取声通总分失败: {str(e)}") - return 0 - - -def get_shengtong_record_id(shengtong_result): - """ - 获取声通评测recordId - - Args: - shengtong_result (dict): 声通API返回的结果 - - Returns: - str: recordId,失败返回空字符串 - """ - if not shengtong_result or 'error' in shengtong_result: - return "" - - try: - record_id = shengtong_result.get('recordId', '') - return str(record_id) if record_id else "" - except Exception as e: - logging.error(f"获取声通recordId失败: {str(e)}") - return "" - - -def process_single_row(row_data, temp_dir, results_dict, lock, rate_limiter=None): - """ - 处理单行数据(并发版本,增强错误处理和时间分析) - - Args: - row_data (tuple): (index, row) 数据 - temp_dir (str): 临时目录路径 - results_dict (dict): 结果字典 - lock (threading.Lock): 线程锁 - rate_limiter (Queue): 速率限制器 - - Returns: - None - """ - index, row = row_data - start_time = time.time() - timing_info = {} - - try: - # 1. 速率限制等待时间 - rate_limit_start = time.time() - if rate_limiter: - rate_limiter.get() # 获取令牌 - timing_info['rate_limit_wait'] = time.time() - rate_limit_start - - logging.info(f"开始处理第 {index + 1} 行数据") - - # 2. 数据预处理时间 - preprocess_start = time.time() - ref_text = str(row['refText']) if pd.notna(row['refText']) else "" - audio_url = str(row['userAudio']) if pd.notna(row['userAudio']) else "" - - # 数据验证 - if not ref_text: - raise ValueError("refText 为空或无效") - - if not audio_url: - raise ValueError("userAudio 为空或无效") - timing_info['preprocess'] = time.time() - preprocess_start - - # 3. 音频下载时间 - download_start = time.time() - audio_file_path = download_audio_file(audio_url, temp_dir) - timing_info['audio_download'] = time.time() - download_start - - if not audio_file_path: - raise ValueError("音频文件下载失败") - - try: - # 4. 声通API调用时间 - api_start = time.time() - logging.info(f"正在调用声通API评测: {ref_text}") - shengtong_result = evaluate_audio_file(audio_file_path, ref_text) - timing_info['api_call'] = time.time() - api_start - - if not shengtong_result: - raise ValueError("声通API返回空结果") - - # 5. 结果处理时间 - result_process_start = time.time() - shengtong_details = format_shengtong_details(shengtong_result) - shengtong_total_score = get_shengtong_total_score(shengtong_result) - shengtong_record_id = get_shengtong_record_id(shengtong_result) - timing_info['result_process'] = time.time() - result_process_start - - # 6. 数据更新时间 - update_start = time.time() - with lock: - results_dict[index] = { - '测试总分': shengtong_total_score, - '测试明细': shengtong_details, - '测试recordId': shengtong_record_id - } - timing_info['data_update'] = time.time() - update_start - - # 计算总耗时 - total_time = time.time() - start_time - timing_info['total'] = total_time - - # 详细的时间分析日志 - logging.info(f"第 {index + 1} 行处理成功 - 总分: {shengtong_total_score} | " - f"总耗时: {total_time:.2f}s | " - f"速率等待: {timing_info['rate_limit_wait']:.2f}s | " - f"预处理: {timing_info['preprocess']:.3f}s | " - f"音频下载: {timing_info['audio_download']:.2f}s | " - f"API调用: {timing_info['api_call']:.2f}s | " - f"结果处理: {timing_info['result_process']:.3f}s | " - f"数据更新: {timing_info['data_update']:.3f}s") - - except Exception as api_error: - total_time = time.time() - start_time - logging.error(f"第 {index + 1} 行声通API调用失败: {str(api_error)} | " - f"总耗时: {total_time:.2f}s | " - f"音频下载: {timing_info.get('audio_download', 0):.2f}s | " - f"API调用: {timing_info.get('api_call', 0):.2f}s") - with lock: - results_dict[index] = { - '测试总分': 0, - '测试明细': "", - '测试recordId': "", - 'error': f'API调用失败: {str(api_error)}' - } - - finally: - # 7. 清理时间 - cleanup_start = time.time() - try: - if audio_file_path and os.path.exists(audio_file_path): - os.remove(audio_file_path) - logging.debug(f"已删除临时文件: {audio_file_path}") - except Exception as cleanup_error: - logging.warning(f"清理临时文件失败: {str(cleanup_error)}") - timing_info['cleanup'] = time.time() - cleanup_start - - # 释放速率限制令牌 - if rate_limiter: - try: - rate_limiter.put(None, timeout=1) # 归还令牌 - except: - pass # 队列可能已满,忽略 - - except Exception as e: - total_time = time.time() - start_time - logging.error(f"第 {index + 1} 行处理异常: {str(e)} | 总耗时: {total_time:.2f}s") - with lock: - results_dict[index] = { - '测试总分': 0, - '测试明细': "", - '测试recordId': "", - 'error': f'处理异常: {str(e)}' - } - - # 释放速率限制令牌 - if rate_limiter: - try: - rate_limiter.put(None, timeout=1) - except: - pass - - -def process_excel_with_shengtong_concurrent(input_file_path, output_dir="output/audio", max_workers=3, rate_limit_per_second=3): - """ - 处理Excel文件,添加声通评测结果(并发版本,增强控制) - - Args: - input_file_path (str): 输入Excel文件路径 - output_dir (str): 输出目录路径,默认为 output/audio - max_workers (int): 最大并发线程数,默认3 - rate_limit_per_second (int): 每秒最大请求数,默认3 - - Returns: - bool: 处理是否成功 - """ - start_time = time.time() - - try: - # 读取Excel文件 - logging.info(f"正在读取Excel文件: {input_file_path}") - df = pd.read_excel(input_file_path) - - # 检查必要的列是否存在 - required_columns = ['refText', 'userAudio'] - missing_columns = [col for col in required_columns if col not in df.columns] - if missing_columns: - logging.error(f"Excel文件缺少必要的列: {missing_columns}") - return False - - # 数据预处理和验证 - total_rows = len(df) - valid_rows = 0 - for index, row in df.iterrows(): - if pd.notna(row.get('refText')) and pd.notna(row.get('userAudio')): - valid_rows += 1 - - logging.info(f"总行数: {total_rows}, 有效行数: {valid_rows}") - - if valid_rows == 0: - logging.warning("没有找到有效的数据行") - return False - - # 添加新列 - df['测试总分'] = 0 - df['测试明细'] = "" - df['测试recordId'] = "" - - # 创建优化的速率限制器 - effective_rate_limit = max(rate_limit_per_second, max_workers) - rate_limiter = Queue(maxsize=effective_rate_limit * 2) - - # 预填充令牌 - for _ in range(effective_rate_limit): - rate_limiter.put(None) - - # 启动优化的速率限制器补充线程 - def rate_limiter_refill(): - interval = 1.0 / effective_rate_limit - while True: - time.sleep(interval) - try: - rate_limiter.put(None, block=False) - except: - pass - - rate_thread = threading.Thread(target=rate_limiter_refill, daemon=True) - rate_thread.start() - - logging.info(f"速率限制设置: {effective_rate_limit} req/s (原始: {rate_limit_per_second}, 队列大小: {effective_rate_limit * 2})") - - # 创建临时目录用于下载音频文件 - with tempfile.TemporaryDirectory() as temp_dir: - logging.info(f"创建临时目录: {temp_dir}") - logging.info(f"开始并发处理,最大并发数: {max_workers}, 有效速率限制: {effective_rate_limit} req/s") - - # 准备数据 - row_data_list = [(index, row) for index, row in df.iterrows()] - - # 创建结果字典和线程锁 - results_dict = {} - lock = threading.Lock() - - # 使用线程池进行并发处理 - with ThreadPoolExecutor(max_workers=max_workers) as executor: - # 提交所有任务 - future_to_index = { - executor.submit(process_single_row, row_data, temp_dir, results_dict, lock, rate_limiter): row_data[0] - for row_data in row_data_list - } - - # 等待任务完成并显示进度 - completed_count = 0 - success_count = 0 - error_count = 0 - - for future in as_completed(future_to_index): - completed_count += 1 - index = future_to_index[future] - - try: - future.result() # 获取结果,如果有异常会抛出 - - # 检查处理结果 - with lock: - result = results_dict.get(index, {}) - if result.get('error') is None: - success_count += 1 - else: - error_count += 1 - - # 显示进度 - if completed_count % 10 == 0 or completed_count == total_rows: - elapsed_time = time.time() - start_time - avg_time_per_item = elapsed_time / completed_count - remaining_time = avg_time_per_item * (total_rows - completed_count) - - logging.info(f"进度: {completed_count}/{total_rows} ({completed_count/total_rows*100:.1f}%) " - f"成功: {success_count}, 失败: {error_count}, " - f"预计剩余时间: {remaining_time:.1f}秒") - - except Exception as e: - error_count += 1 - logging.error(f"任务 {index + 1} 执行异常: {str(e)}") - with lock: - if index not in results_dict: - results_dict[index] = { - '测试总分': 0, - '测试明细': "", - '测试recordId': "", - 'error': f'任务执行异常: {str(e)}' - } - - # 将结果更新到DataFrame - logging.info("正在更新结果到DataFrame...") - for index in results_dict: - result = results_dict[index] - df.at[index, '测试总分'] = result.get('测试总分', 0) - df.at[index, '测试明细'] = result.get('测试明细', "") - df.at[index, '测试recordId'] = result.get('测试recordId', "") - - # 如果有错误,可以选择记录到备注列(如果存在) - if result.get('error') and '备注' in df.columns: - existing_note = str(df.at[index, '备注']) if pd.notna(df.at[index, '备注']) else "" - error_note = f"声通API错误: {result['error']}" - df.at[index, '备注'] = f"{existing_note}\n{error_note}".strip() - - # 创建输出目录 - output_path = Path(output_dir) - output_path.mkdir(parents=True, exist_ok=True) - - # 生成输出文件路径 - input_path = Path(input_file_path) - output_file_path = output_path / f"{input_path.stem}_add_shengtong_result.xlsx" - - # 保存结果 - logging.info(f"正在保存结果到: {output_file_path}") - df.to_excel(output_file_path, index=False) - - # 计算总耗时 - total_time = time.time() - start_time - - # 统计处理结果 - final_success_count = sum(1 for result in results_dict.values() if result.get('error') is None) - final_error_count = len(results_dict) - final_success_count - - logging.info("=" * 50) - logging.info("并发处理完成!") - logging.info(f"处理统计: 成功 {final_success_count} 条,失败 {final_error_count} 条,总计 {len(results_dict)} 条") - logging.info(f"总耗时: {total_time:.2f} 秒") - logging.info(f"平均处理时间: {total_time/len(results_dict):.2f} 秒/条") - logging.info(f"输出文件: {output_file_path}") - logging.info("=" * 50) - - return True - - except Exception as e: - logging.error(f"处理Excel文件时出错: {str(e)}") - return False - - -def process_excel_with_shengtong(input_file_path, output_dir="output/audio"): - """ - 处理Excel文件,添加声通评测结果(串行版本) - - Args: - input_file_path (str): 输入Excel文件路径 - output_dir (str): 输出目录路径,默认为 output/audio - - Returns: - bool: 处理是否成功 - """ - try: - # 读取Excel文件 - print(f"正在读取Excel文件: {input_file_path}") - df = pd.read_excel(input_file_path) - - # 检查必要的列是否存在 - required_columns = ['refText', 'userAudio'] - missing_columns = [col for col in required_columns if col not in df.columns] - if missing_columns: - print(f"错误: Excel文件缺少必要的列: {missing_columns}") - return False - - # 添加新列 - df['测试总分'] = 0 - df['测试明细'] = "" - df['测试recordId'] = "" - - # 创建临时目录用于下载音频文件 - with tempfile.TemporaryDirectory() as temp_dir: - print(f"创建临时目录: {temp_dir}") - - # 处理每一行数据 - total_rows = len(df) - for index, row in df.iterrows(): - print(f"\n处理进度: {index + 1}/{total_rows}") - - ref_text = str(row['refText']) if pd.notna(row['refText']) else "" - audio_url = str(row['userAudio']) if pd.notna(row['userAudio']) else "" - - if not ref_text or not audio_url: - print(f"第 {index + 1} 行数据不完整,跳过") - continue - - print(f"参考文本: {ref_text}") - print(f"音频URL: {audio_url}") - - # 下载音频文件 - audio_file_path = download_audio_file(audio_url, temp_dir) - if not audio_file_path: - print(f"第 {index + 1} 行音频下载失败,跳过") - continue - - # 调用声通API进行评测 - print("正在调用声通API进行评测...") - try: - shengtong_result = evaluate_audio_file(audio_file_path, ref_text) - print(f"声通API返回结果: {json.dumps(shengtong_result, indent=2, ensure_ascii=False)}") - - # 提取总分、明细和recordId - total_score = get_shengtong_total_score(shengtong_result) - details = format_shengtong_details(shengtong_result) - record_id = get_shengtong_record_id(shengtong_result) - - # 更新DataFrame - df.at[index, '测试总分'] = total_score - df.at[index, '测试明细'] = details - df.at[index, '测试recordId'] = record_id - - print(f"测试总分: {total_score}") - print(f"测试明细: {details}") - print(f"测试recordId: {record_id}") - - except Exception as e: - print(f"第 {index + 1} 行声通API调用失败: {str(e)}") - continue - - # 删除临时音频文件 - try: - os.remove(audio_file_path) - except: - pass - - # 添加延时避免API调用过于频繁 - time.sleep(1) - - # 创建输出目录 - output_path = Path(output_dir) - output_path.mkdir(parents=True, exist_ok=True) - - # 生成输出文件路径 - input_path = Path(input_file_path) - output_file_path = output_path / f"{input_path.stem}_add_shengtong_result.xlsx" - - # 保存结果 - print(f"\n正在保存结果到: {output_file_path}") - df.to_excel(output_file_path, index=False) - print("处理完成!") - - return True - - except Exception as e: - print(f"处理Excel文件时出错: {str(e)}") - return False - - -if __name__ == "__main__": - # ==================== 配置参数 ==================== - input_file = "人工筛选测试集v2_denoise.xlsx" - output_directory = "output/audio" # 输出目录,可以修改 - use_concurrent = True # True: 使用并发版本,False: 使用串行版本 - - # DEBUG 模式开关(True: 显示详细调试信息,False: 仅显示关键信息) - enable_debug = False # 可以设置为 True 来查看详细的 DEBUG 日志 - - # 设置全局 DEBUG_MODE - globals()['DEBUG_MODE'] = enable_debug - - # 检查环境变量 - required_env_vars = ['ST_APP_KEY', 'ST_SECRET_KEY'] - missing_vars = [var for var in required_env_vars if not os.environ.get(var)] - - if missing_vars: - print(f"错误: 缺少必要的环境变量: {missing_vars}") - print("请在 .env 文件或系统环境变量中配置:") - print(" ST_APP_KEY=你的应用Key") - print(" ST_SECRET_KEY=你的Secret Key") - elif not os.path.exists(input_file): - print(f"文件不存在: {input_file}") - print("请确保Excel文件存在并包含 'refText' 和 'userAudio' 列") - else: - if use_concurrent: - print("使用并发版本处理(3路并发,3 req/s)...") - success = process_excel_with_shengtong_concurrent( - input_file, - output_dir=output_directory, - max_workers=3, - rate_limit_per_second=3 - ) - else: - print("使用串行版本处理...") - success = process_excel_with_shengtong(input_file, output_dir=output_directory) - - if success: - print("处理成功!") - else: - print("处理失败!") diff --git a/makee_vala/git_scripts/batch_add_xunfei_result.py b/makee_vala/git_scripts/batch_add_xunfei_result.py deleted file mode 100644 index 3e07493..0000000 --- a/makee_vala/git_scripts/batch_add_xunfei_result.py +++ /dev/null @@ -1,1090 +0,0 @@ -""" -讯飞语音评测批量处理工具 - -功能说明: -- 读取 Excel 文件,其中包含音频链接(userAudio 字段)和参考文本(refText 字段) -- 调用讯飞 API 对音频进行评测,获取总分和明细 -- 在原 Excel 中添加"讯飞总分"和"讯飞明细"两个字段 -- 输出文件命名为: {原文件名}_add_xunfei_result.xlsx -- 支持串行和并发两种处理模式 - -环境变量配置: -- XUNFEI_APPID: 讯飞应用 ID -- XUNFEI_API_SECRET: 讯飞 API 密钥 -- XUNFEI_API_KEY: 讯飞 API Key - -讯飞技术文档: https://www.xfyun.cn/doc/Ise/IseAPI.html -""" - -import pandas as pd -import os -import requests -import tempfile -from pathlib import Path -import json -import time -from concurrent.futures import ThreadPoolExecutor, as_completed -import threading -from queue import Queue -import logging -import websocket -import datetime -import hashlib -import base64 -import hmac -from urllib.parse import urlencode -import ssl -from wsgiref.handlers import format_date_time -from datetime import datetime -from time import mktime -import xml.etree.ElementTree as ET - -# 配置日志 -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s', - handlers=[ - logging.FileHandler('xunfei_batch_processing.log'), - logging.StreamHandler() - ] -) - -# 从 .env 文件加载环境变量 -from dotenv import load_dotenv -load_dotenv() - -# ==================== 全局配置 ==================== -# DEBUG 模式开关(控制详细日志输出) -DEBUG_MODE = False - - -def debug_print(message): - """ - DEBUG 信息输出函数 - - Args: - message (str): 要输出的调试信息 - """ - if DEBUG_MODE: - print(f"[DEBUG] {message}") - - -# ==================== 讯飞 API 相关代码 ==================== - -class XunfeiISEAPI: - """讯飞语音评测 API 封装类""" - - def __init__(self): - """从环境变量读取 API 配置""" - self.host_url = "ws://ise-api.xfyun.cn/v2/open-ise" - self.appid = os.environ.get('XUNFEI_APPID', '') - self.api_secret = os.environ.get('XUNFEI_API_SECRET', '') - self.api_key = os.environ.get('XUNFEI_API_KEY', '') - - # 检查环境变量是否配置 - if not all([self.appid, self.api_secret, self.api_key]): - raise ValueError( - "请配置讯飞 API 环境变量: XUNFEI_APPID, XUNFEI_API_SECRET, XUNFEI_API_KEY" - ) - - self.result = None - self.error = None - - def _detect_audio_format(self, audio_file_path): - """检测音频文件格式""" - try: - # 通过文件扩展名检测 - file_ext = os.path.splitext(audio_file_path)[1].lower() - if file_ext == '.wav': - return 'wav' - elif file_ext == '.mp3': - return 'mp3' - - # 通过文件头检测 - with open(audio_file_path, 'rb') as f: - header = f.read(12) - if len(header) >= 12: - # WAV文件头: RIFF....WAVE - if header[:4] == b'RIFF' and header[8:12] == b'WAVE': - return 'wav' - # MP3文件头: ID3 或 0xFF 0xFB/0xFA - elif header[:3] == b'ID3' or (header[0] == 0xFF and (header[1] & 0xE0) == 0xE0): - return 'mp3' - - # 默认返回wav - return 'wav' - except Exception as e: - print(f"[WARNING] 音频格式检测失败: {str(e)}, 默认使用WAV格式") - return 'wav' - - def _remove_wav_header(self, audio_file_path): - """去除WAV文件头部,返回纯音频数据""" - try: - with open(audio_file_path, 'rb') as f: - # 读取WAV文件头 - riff_header = f.read(12) # RIFF header (12 bytes) - if len(riff_header) < 12 or riff_header[:4] != b'RIFF' or riff_header[8:12] != b'WAVE': - print(f"[WARNING] 不是有效的WAV文件,直接返回原始数据") - f.seek(0) - return f.read() - - # 跳过format chunk - while True: - chunk_header = f.read(8) - if len(chunk_header) < 8: - break - - chunk_id = chunk_header[:4] - chunk_size = int.from_bytes(chunk_header[4:8], byteorder='little') - - if chunk_id == b'data': - # 找到data chunk,返回音频数据 - audio_data = f.read(chunk_size) - debug_print(f"WAV头部已去除,音频数据大小: {len(audio_data)} bytes") - return audio_data - else: - # 跳过其他chunk - f.seek(chunk_size, 1) - if chunk_size % 2: # 如果chunk大小是奇数,需要跳过一个字节对齐 - f.seek(1, 1) - - # 如果没找到data chunk,返回从当前位置开始的所有数据 - print(f"[WARNING] 未找到data chunk,返回剩余数据") - return f.read() - - except Exception as e: - print(f"[ERROR] WAV头部处理失败: {str(e)}, 返回原始文件数据") - with open(audio_file_path, 'rb') as f: - return f.read() - - def _generate_url(self): - """生成WebSocket连接URL""" - now_time = datetime.now() - now_date = format_date_time(mktime(now_time.timetuple())) - - # 拼接鉴权原始字符串 - origin_base = "host: " + "ise-api.xfyun.cn" + "\n" - origin_base += "date: " + now_date + "\n" - origin_base += "GET " + "/v2/open-ise " + "HTTP/1.1" - - # sha256加密 - signature_sha = hmac.new(self.api_secret.encode('utf-8'), origin_base.encode('utf-8'), - digestmod=hashlib.sha256).digest() - signature_sha = base64.b64encode(signature_sha).decode(encoding='utf-8') - - authorization_origin = "api_key=\"%s\", algorithm=\"%s\", headers=\"%s\", signature=\"%s\"" % ( - self.api_key, "hmac-sha256", "host date request-line", signature_sha) - authorization = base64.b64encode(authorization_origin.encode('utf-8')).decode(encoding='utf-8') - - # 将请求的鉴权参数组合为字典 - dict_data = { - "authorization": authorization, - "date": now_date, - "host": "ise-api.xfyun.cn" - } - ws_url = self.host_url + '?' + urlencode(dict_data) - return ws_url - - def _on_message(self, ws, message): - """处理WebSocket消息""" - try: - debug_print(f"收到消息: {message}") - response = json.loads(message) - debug_print(f"解析后的响应: {json.dumps(response, indent=2, ensure_ascii=False)}") - - # 检查响应结构 - if "data" not in response: - print(f"[ERROR] 响应中缺少 'data' 字段") - self.error = f"响应格式错误: 缺少 'data' 字段" - ws.close() - return - - data = response["data"] - if "status" not in data: - print(f"[ERROR] data 中缺少 'status' 字段") - self.error = f"响应格式错误: 缺少 'status' 字段" - ws.close() - return - - status = data["status"] - debug_print(f"状态码: {status}") - - if status == 2: # 评测完成 - if "data" not in data: - print(f"[ERROR] data 中缺少评测结果数据") - self.error = f"响应格式错误: 缺少评测结果数据" - ws.close() - return - - xml_data = base64.b64decode(data["data"]) - xml_string = xml_data.decode("utf-8") - debug_print(f"解码后的XML: {xml_string}") - self.result = self._parse_xml_result(xml_string) - debug_print(f"解析后的结果: {json.dumps(self.result, indent=2, ensure_ascii=False)}") - ws.close() - except json.JSONDecodeError as e: - print(f"[ERROR] JSON解析失败: {str(e)}") - print(f"[ERROR] 原始消息: {message}") - self.error = f"JSON解析错误: {str(e)}" - ws.close() - except Exception as e: - print(f"[ERROR] 消息处理异常: {str(e)}") - print(f"[ERROR] 异常类型: {type(e).__name__}") - print(f"[ERROR] 原始消息: {message}") - self.error = f"消息处理错误: {str(e)}" - ws.close() - - def _on_error(self, ws, error): - """处理WebSocket错误""" - print(f"[ERROR] WebSocket错误: {str(error)}") - print(f"[ERROR] 错误类型: {type(error).__name__}") - self.error = f"WebSocket错误: {str(error)}" - - def _on_close(self, ws, reason, res): - """WebSocket连接关闭""" - debug_print(f"WebSocket连接关闭 - 原因: {reason}, 响应: {res}") - pass - - def _on_open(self, ws, audio_file, text="nice to meet you."): - """WebSocket连接打开,发送音频数据""" - try: - debug_print("WebSocket连接已打开") - debug_print(f"音频文件: {audio_file}") - debug_print(f"评测文本: {text}") - - # 检测音频格式 - audio_format = self._detect_audio_format(audio_file) - debug_print(f"检测到音频格式: {audio_format}") - - # 根据音频格式设置aue参数 - if audio_format == 'wav': - aue_param = "raw" # WAV文件使用raw - else: # mp3 - aue_param = "lame" # MP3文件使用lame - - debug_print(f"使用aue参数: {aue_param}") - - # 发送初始配置 - send_dict = { - "common": { - "app_id": self.appid - }, - "business": { - "category": "read_sentence", - "rstcd": "utf8", - "sub": "ise", - "group": "pupil", - "ent": "en_vip", - "tte": "utf-8", - "cmd": "ssb", - "auf": "audio/L16;rate=16000", - "aue": aue_param, - "text": '\uFEFF' + f"[content]\n{text}", - "ise_unite": "1", - "extra_ability": "pitch" - }, - "data": { - "status": 0, - "data": "" - } - } - debug_print(f"发送初始配置: {json.dumps(send_dict, indent=2, ensure_ascii=False)}") - ws.send(json.dumps(send_dict)) - - # 根据音频格式处理音频数据 - if audio_format == 'wav': - # WAV文件需要去除头部 - audio_data = self._remove_wav_header(audio_file) - debug_print(f"WAV文件头部已去除,音频数据大小: {len(audio_data)} bytes") - else: - # MP3文件直接读取 - with open(audio_file, "rb") as f: - audio_data = f.read() - debug_print(f"MP3文件直接读取,音频数据大小: {len(audio_data)} bytes") - - # 优化音频发送逻辑 - frame_count = 0 - data_size = len(audio_data) - - # 根据数据大小动态调整缓冲区大小和延迟 - if data_size > 50000: # 大于50KB的数据使用更大的缓冲区 - buffer_size = 12800 # 20倍缓冲区 - sleep_time = 0.02 # 减少延迟到20ms - else: - buffer_size = 1280 # 原始缓冲区 - sleep_time = 0.01 # 小文件使用更小延迟 - - debug_print(f"使用缓冲区大小: {buffer_size}, 延迟: {sleep_time}s") - - # 发送音频数据 - offset = 0 - while offset < data_size: - # 读取缓冲区大小的数据 - buffer = audio_data[offset:offset + buffer_size] - offset += len(buffer) - - if offset >= data_size: - # 发送最后一帧 - my_dict = { - "business": {"cmd": "auw", "aus": 4, "aue": aue_param}, - "data": {"status": 2, "data": str(base64.b64encode(buffer).decode())} - } - debug_print("发送最后一帧") - ws.send(json.dumps(my_dict)) - break - - # 发送中间帧 - send_dict = { - "business": { - "cmd": "auw", - "aus": 1, - "aue": aue_param - }, - "data": { - "status": 1, - "data": str(base64.b64encode(buffer).decode()), - "data_type": 1, - "encoding": "raw" - } - } - frame_count += 1 - if frame_count % 20 == 0: # 减少日志频率 - debug_print(f"已发送 {frame_count} 帧音频数据") - ws.send(json.dumps(send_dict)) - time.sleep(sleep_time) # 使用动态延迟 - - debug_print(f"音频发送完成,总共发送 {frame_count} 帧") - - except Exception as e: - print(f"[ERROR] 音频发送异常: {str(e)}") - print(f"[ERROR] 异常类型: {type(e).__name__}") - self.error = f"音频发送错误: {str(e)}" - ws.close() - - def _parse_xml_result(self, xml_string): - """解析XML评测结果""" - try: - root = ET.fromstring(xml_string) - - result = { - "total_score": 0, - "words": [], - "sentences": [] - } - - # 解析句子级别评分 - for sentence in root.findall('.//sentence'): - sentence_info = { - "content": sentence.get('content', ''), - "total_score": float(sentence.get('total_score', 0)), - "fluency_score": float(sentence.get('fluency_score', 0)), - "integrity_score": float(sentence.get('integrity_score', 0)), - "phone_score": float(sentence.get('phone_score', 0)) - } - result["sentences"].append(sentence_info) - result["total_score"] = sentence_info["total_score"] - - # 解析单词级别评分 - for word in root.findall('.//word'): - word_info = { - "content": word.get('content', ''), - "total_score": float(word.get('total_score', 0)), - "dp_message": int(word.get('dp_message', 0)), - "time_len": int(word.get('time_len', 0)), - "syllables": [] - } - - # 解析音节评分 - for syllable in word.findall('.//syllable'): - syllable_info = { - "content": syllable.get('content', ''), - "total_score": float(syllable.get('total_score', 0)), - "phones": [] - } - - # 解析音素评分 - for phone in syllable.findall('.//phone'): - phone_info = { - "content": phone.get('content', ''), - "total_score": float(phone.get('total_score', 0)), - "dp_message": int(phone.get('dp_message', 0)) - } - syllable_info["phones"].append(phone_info) - - word_info["syllables"].append(syllable_info) - - result["words"].append(word_info) - - return result - - except Exception as e: - return {"error": f"XML解析错误: {str(e)}"} - - def evaluate_audio(self, audio_file_path, text="nice to meet you.", timeout=30): - """ - 评测音频文件 - - Args: - audio_file_path (str): 音频文件路径 - text (str): 评测文本内容 - timeout (int): 超时时间(秒) - - Returns: - dict: 评测结果JSON - """ - debug_print(f"开始评测音频文件: {audio_file_path}") - debug_print(f"评测文本: {text}") - - # 检查音频文件是否存在 - if not os.path.exists(audio_file_path): - error_msg = f"音频文件不存在: {audio_file_path}" - print(f"[ERROR] {error_msg}") - return {"error": error_msg} - - # 重置结果 - self.result = None - self.error = None - - try: - # 生成WebSocket URL - ws_url = self._generate_url() - debug_print(f"WebSocket URL: {ws_url}") - - # 创建WebSocket连接 - websocket.enableTrace(False) - ws = websocket.WebSocketApp( - ws_url, - on_message=self._on_message, - on_error=self._on_error, - on_close=self._on_close, - on_open=lambda ws: self._on_open(ws, audio_file_path, text) - ) - - debug_print("开始WebSocket连接...") - # 运行WebSocket连接 - ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE}) - - debug_print("WebSocket连接结束") - # 返回结果 - if self.error: - print(f"[ERROR] 评测失败: {self.error}") - return {"error": self.error} - elif self.result: - debug_print("评测成功") - return self.result - else: - error_msg = "未收到评测结果" - print(f"[ERROR] {error_msg}") - return {"error": error_msg} - - except Exception as e: - error_msg = f"评测过程出错: {str(e)}" - print(f"[ERROR] {error_msg}") - print(f"[ERROR] 异常类型: {type(e).__name__}") - return {"error": error_msg} - - -def evaluate_audio_file(audio_file_path, text="nice to meet you."): - """ - 简化的音频评测函数 - - Args: - audio_file_path (str): 音频文件路径 - text (str): 评测文本内容 - - Returns: - dict: 评测结果JSON - """ - api = XunfeiISEAPI() - return api.evaluate_audio(audio_file_path, text) - - -# ==================== 批量处理相关代码 ==================== - -def download_audio_file(audio_url, temp_dir, max_retries=3, timeout=30): - """ - 下载音频文件到临时目录(增强版本) - - Args: - audio_url (str): 音频文件URL - temp_dir (str): 临时目录路径 - max_retries (int): 最大重试次数 - timeout (int): 请求超时时间(秒) - - Returns: - str: 下载的音频文件路径,失败返回None - """ - if not audio_url or pd.isna(audio_url): - logging.warning("音频URL为空或无效") - return None - - # 从URL中提取文件名 - try: - file_name = os.path.basename(audio_url.split('?')[0]) # 去除URL参数 - if not file_name or '.' not in file_name: - file_name = f"audio_{hash(audio_url) % 100000}.wav" # 生成默认文件名 - - file_path = os.path.join(temp_dir, file_name) - - # 重试机制 - for attempt in range(max_retries): - try: - logging.info(f"正在下载音频文件 (尝试 {attempt + 1}/{max_retries}): {audio_url}") - - # 设置请求头,模拟浏览器 - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - } - - response = requests.get(audio_url, timeout=timeout, headers=headers, stream=True) - response.raise_for_status() - - # 检查内容类型 - content_type = response.headers.get('content-type', '') - if not any(audio_type in content_type.lower() for audio_type in ['audio', 'wav', 'mp3', 'ogg', 'flac']): - logging.warning(f"可能不是音频文件,Content-Type: {content_type}") - - # 写入文件 - with open(file_path, 'wb') as f: - for chunk in response.iter_content(chunk_size=8192): - if chunk: - f.write(chunk) - - # 验证文件大小 - file_size = os.path.getsize(file_path) - if file_size == 0: - raise ValueError("下载的文件为空") - - logging.info(f"音频文件下载成功: {file_path} (大小: {file_size} bytes)") - return file_path - - except requests.exceptions.Timeout: - logging.warning(f"下载超时 (尝试 {attempt + 1}/{max_retries}): {audio_url}") - if attempt < max_retries - 1: - time.sleep(2 ** attempt) # 指数退避 - continue - except requests.exceptions.RequestException as e: - logging.warning(f"下载请求异常 (尝试 {attempt + 1}/{max_retries}): {str(e)}") - if attempt < max_retries - 1: - time.sleep(2 ** attempt) - continue - except Exception as e: - logging.error(f"下载过程中发生未知错误 (尝试 {attempt + 1}/{max_retries}): {str(e)}") - if attempt < max_retries - 1: - time.sleep(2 ** attempt) - continue - - logging.error(f"音频文件下载失败,已达到最大重试次数: {audio_url}") - return None - - except Exception as e: - logging.error(f"下载音频文件时发生异常: {str(e)}") - return None - - -def format_xunfei_details(xunfei_result): - """ - 格式化讯飞评测结果为明细字符串 - - Args: - xunfei_result (dict): 讯飞API返回的结果 - - Returns: - str: 格式化的明细字符串 - """ - if not xunfei_result or 'error' in xunfei_result: - return "" - - try: - words = xunfei_result.get('words', []) - if not words: - return "" - - details = [] - for word in words: - content = word.get('content', '') - total_score = word.get('total_score', 0) - details.append(f"{content} {int(total_score)}") - - return "\n".join(details) - - except Exception as e: - print(f"格式化讯飞明细失败: {str(e)}") - return "" - - -def get_xunfei_total_score(xunfei_result): - """ - 获取讯飞评测总分 - - Args: - xunfei_result (dict): 讯飞API返回的结果 - - Returns: - int: 总分,失败返回0 - """ - if not xunfei_result or 'error' in xunfei_result: - return 0 - - try: - return int(xunfei_result.get('total_score', 0)) - except Exception as e: - print(f"获取讯飞总分失败: {str(e)}") - return 0 - - -def process_single_row(row_data, temp_dir, results_dict, lock, rate_limiter=None): - """ - 处理单行数据(并发版本,增强错误处理和时间分析) - - Args: - row_data (tuple): (index, row) 数据 - temp_dir (str): 临时目录路径 - results_dict (dict): 结果字典 - lock (threading.Lock): 线程锁 - rate_limiter (Queue): 速率限制器 - - Returns: - None - """ - index, row = row_data - start_time = time.time() - timing_info = {} - - try: - # 1. 速率限制等待时间 - rate_limit_start = time.time() - if rate_limiter: - rate_limiter.get() # 获取令牌 - timing_info['rate_limit_wait'] = time.time() - rate_limit_start - - logging.info(f"开始处理第 {index + 1} 行数据") - - # 2. 数据预处理时间 - preprocess_start = time.time() - ref_text = str(row['refText']) if pd.notna(row['refText']) else "" - audio_url = str(row['userAudio']) if pd.notna(row['userAudio']) else "" - - # 数据验证 - if not ref_text: - raise ValueError("refText 为空或无效") - - if not audio_url: - raise ValueError("userAudio 为空或无效") - timing_info['preprocess'] = time.time() - preprocess_start - - # 3. 音频下载时间 - download_start = time.time() - audio_file_path = download_audio_file(audio_url, temp_dir) - timing_info['audio_download'] = time.time() - download_start - - if not audio_file_path: - raise ValueError("音频文件下载失败") - - try: - # 4. 讯飞API调用时间 - api_start = time.time() - logging.info(f"正在调用讯飞API评测: {ref_text}") - xunfei_result = evaluate_audio_file(audio_file_path, ref_text) - timing_info['api_call'] = time.time() - api_start - - if not xunfei_result: - raise ValueError("讯飞API返回空结果") - - # 5. 结果处理时间 - result_process_start = time.time() - xunfei_details = format_xunfei_details(xunfei_result) - xunfei_total_score = get_xunfei_total_score(xunfei_result) - timing_info['result_process'] = time.time() - result_process_start - - # 6. 数据更新时间 - update_start = time.time() - with lock: - results_dict[index] = { - '讯飞总分': xunfei_total_score, - '讯飞明细': xunfei_details - } - timing_info['data_update'] = time.time() - update_start - - # 计算总耗时 - total_time = time.time() - start_time - timing_info['total'] = total_time - - # 详细的时间分析日志 - logging.info(f"第 {index + 1} 行处理成功 - 总分: {xunfei_total_score} | " - f"总耗时: {total_time:.2f}s | " - f"速率等待: {timing_info['rate_limit_wait']:.2f}s | " - f"预处理: {timing_info['preprocess']:.3f}s | " - f"音频下载: {timing_info['audio_download']:.2f}s | " - f"API调用: {timing_info['api_call']:.2f}s | " - f"结果处理: {timing_info['result_process']:.3f}s | " - f"数据更新: {timing_info['data_update']:.3f}s") - - except Exception as api_error: - total_time = time.time() - start_time - logging.error(f"第 {index + 1} 行讯飞API调用失败: {str(api_error)} | " - f"总耗时: {total_time:.2f}s | " - f"音频下载: {timing_info.get('audio_download', 0):.2f}s | " - f"API调用: {timing_info.get('api_call', 0):.2f}s") - with lock: - results_dict[index] = { - '讯飞总分': 0, - '讯飞明细': "", - 'error': f'API调用失败: {str(api_error)}' - } - - finally: - # 7. 清理时间 - cleanup_start = time.time() - try: - if audio_file_path and os.path.exists(audio_file_path): - os.remove(audio_file_path) - logging.debug(f"已删除临时文件: {audio_file_path}") - except Exception as cleanup_error: - logging.warning(f"清理临时文件失败: {str(cleanup_error)}") - timing_info['cleanup'] = time.time() - cleanup_start - - # 释放速率限制令牌 - if rate_limiter: - try: - rate_limiter.put(None, timeout=1) # 归还令牌 - except: - pass # 队列可能已满,忽略 - - except Exception as e: - total_time = time.time() - start_time - logging.error(f"第 {index + 1} 行处理异常: {str(e)} | 总耗时: {total_time:.2f}s") - with lock: - results_dict[index] = { - '讯飞总分': 0, - '讯飞明细': "", - 'error': f'处理异常: {str(e)}' - } - - # 释放速率限制令牌 - if rate_limiter: - try: - rate_limiter.put(None, timeout=1) - except: - pass - - -def process_excel_with_xunfei_concurrent(input_file_path, output_dir="output/audio", max_workers=5, rate_limit_per_second=5): - """ - 处理Excel文件,添加讯飞评测结果(并发版本,增强控制) - - Args: - input_file_path (str): 输入Excel文件路径 - output_dir (str): 输出目录路径,默认为 output/audio - max_workers (int): 最大并发线程数,默认5 - rate_limit_per_second (int): 每秒最大请求数,默认5 - - Returns: - bool: 处理是否成功 - """ - start_time = time.time() - - try: - # 读取Excel文件 - logging.info(f"正在读取Excel文件: {input_file_path}") - df = pd.read_excel(input_file_path) - - # 检查必要的列是否存在 - required_columns = ['refText', 'userAudio'] - missing_columns = [col for col in required_columns if col not in df.columns] - if missing_columns: - logging.error(f"Excel文件缺少必要的列: {missing_columns}") - return False - - # 数据预处理和验证 - total_rows = len(df) - valid_rows = 0 - for index, row in df.iterrows(): - if pd.notna(row.get('refText')) and pd.notna(row.get('userAudio')): - valid_rows += 1 - - logging.info(f"总行数: {total_rows}, 有效行数: {valid_rows}") - - if valid_rows == 0: - logging.warning("没有找到有效的数据行") - return False - - # 添加新列 - df['讯飞总分'] = 0 - df['讯飞明细'] = "" - - # 创建优化的速率限制器 - effective_rate_limit = max(rate_limit_per_second, max_workers) - rate_limiter = Queue(maxsize=effective_rate_limit * 2) - - # 预填充令牌 - for _ in range(effective_rate_limit): - rate_limiter.put(None) - - # 启动优化的速率限制器补充线程 - def rate_limiter_refill(): - interval = 1.0 / effective_rate_limit - while True: - time.sleep(interval) - try: - rate_limiter.put(None, block=False) - except: - pass - - rate_thread = threading.Thread(target=rate_limiter_refill, daemon=True) - rate_thread.start() - - logging.info(f"速率限制设置: {effective_rate_limit} req/s (原始: {rate_limit_per_second}, 队列大小: {effective_rate_limit * 2})") - - # 创建临时目录用于下载音频文件 - with tempfile.TemporaryDirectory() as temp_dir: - logging.info(f"创建临时目录: {temp_dir}") - logging.info(f"开始并发处理,最大并发数: {max_workers}, 有效速率限制: {effective_rate_limit} req/s") - - # 准备数据 - row_data_list = [(index, row) for index, row in df.iterrows()] - - # 创建结果字典和线程锁 - results_dict = {} - lock = threading.Lock() - - # 使用线程池进行并发处理 - with ThreadPoolExecutor(max_workers=max_workers) as executor: - # 提交所有任务 - future_to_index = { - executor.submit(process_single_row, row_data, temp_dir, results_dict, lock, rate_limiter): row_data[0] - for row_data in row_data_list - } - - # 等待任务完成并显示进度 - completed_count = 0 - success_count = 0 - error_count = 0 - - for future in as_completed(future_to_index): - completed_count += 1 - index = future_to_index[future] - - try: - future.result() # 获取结果,如果有异常会抛出 - - # 检查处理结果 - with lock: - result = results_dict.get(index, {}) - if result.get('error') is None: - success_count += 1 - else: - error_count += 1 - - # 显示进度 - if completed_count % 10 == 0 or completed_count == total_rows: - elapsed_time = time.time() - start_time - avg_time_per_item = elapsed_time / completed_count - remaining_time = avg_time_per_item * (total_rows - completed_count) - - logging.info(f"进度: {completed_count}/{total_rows} ({completed_count/total_rows*100:.1f}%) " - f"成功: {success_count}, 失败: {error_count}, " - f"预计剩余时间: {remaining_time:.1f}秒") - - except Exception as e: - error_count += 1 - logging.error(f"任务 {index + 1} 执行异常: {str(e)}") - with lock: - if index not in results_dict: - results_dict[index] = { - '讯飞总分': 0, - '讯飞明细': "", - 'error': f'任务执行异常: {str(e)}' - } - - # 将结果更新到DataFrame - logging.info("正在更新结果到DataFrame...") - for index in results_dict: - result = results_dict[index] - df.at[index, '讯飞总分'] = result.get('讯飞总分', 0) - df.at[index, '讯飞明细'] = result.get('讯飞明细', "") - - # 如果有错误,可以选择记录到备注列(如果存在) - if result.get('error') and '备注' in df.columns: - existing_note = str(df.at[index, '备注']) if pd.notna(df.at[index, '备注']) else "" - error_note = f"讯飞API错误: {result['error']}" - df.at[index, '备注'] = f"{existing_note}\n{error_note}".strip() - - # 创建输出目录 - output_path = Path(output_dir) - output_path.mkdir(parents=True, exist_ok=True) - - # 生成输出文件路径 - input_path = Path(input_file_path) - output_file_path = output_path / f"{input_path.stem}_add_xunfei_result.xlsx" - - # 保存结果 - logging.info(f"正在保存结果到: {output_file_path}") - df.to_excel(output_file_path, index=False) - - # 计算总耗时 - total_time = time.time() - start_time - - # 统计处理结果 - final_success_count = sum(1 for result in results_dict.values() if result.get('error') is None) - final_error_count = len(results_dict) - final_success_count - - logging.info("=" * 50) - logging.info("并发处理完成!") - logging.info(f"处理统计: 成功 {final_success_count} 条,失败 {final_error_count} 条,总计 {len(results_dict)} 条") - logging.info(f"总耗时: {total_time:.2f} 秒") - logging.info(f"平均处理时间: {total_time/len(results_dict):.2f} 秒/条") - logging.info(f"输出文件: {output_file_path}") - logging.info("=" * 50) - - return True - - except Exception as e: - logging.error(f"处理Excel文件时出错: {str(e)}") - return False - - -def process_excel_with_xunfei(input_file_path, output_dir="output/audio"): - """ - 处理Excel文件,添加讯飞评测结果(串行版本) - - Args: - input_file_path (str): 输入Excel文件路径 - output_dir (str): 输出目录路径,默认为 output/audio - - Returns: - bool: 处理是否成功 - """ - try: - # 读取Excel文件 - print(f"正在读取Excel文件: {input_file_path}") - df = pd.read_excel(input_file_path) - - # 检查必要的列是否存在 - required_columns = ['refText', 'userAudio'] - missing_columns = [col for col in required_columns if col not in df.columns] - if missing_columns: - print(f"错误: Excel文件缺少必要的列: {missing_columns}") - return False - - # 添加新列 - df['讯飞总分'] = 0 - df['讯飞明细'] = "" - - # 创建临时目录用于下载音频文件 - with tempfile.TemporaryDirectory() as temp_dir: - print(f"创建临时目录: {temp_dir}") - - # 处理每一行数据 - total_rows = len(df) - for index, row in df.iterrows(): - print(f"\n处理进度: {index + 1}/{total_rows}") - - ref_text = str(row['refText']) if pd.notna(row['refText']) else "" - audio_url = str(row['userAudio']) if pd.notna(row['userAudio']) else "" - - if not ref_text or not audio_url: - print(f"第 {index + 1} 行数据不完整,跳过") - continue - - print(f"参考文本: {ref_text}") - print(f"音频URL: {audio_url}") - - # 下载音频文件 - audio_file_path = download_audio_file(audio_url, temp_dir) - if not audio_file_path: - print(f"第 {index + 1} 行音频下载失败,跳过") - continue - - # 调用讯飞API进行评测 - print("正在调用讯飞API进行评测...") - try: - xunfei_result = evaluate_audio_file(audio_file_path, ref_text) - print(f"讯飞API返回结果: {json.dumps(xunfei_result, indent=2, ensure_ascii=False)}") - - # 提取总分和明细 - total_score = get_xunfei_total_score(xunfei_result) - details = format_xunfei_details(xunfei_result) - - # 更新DataFrame - df.at[index, '讯飞总分'] = total_score - df.at[index, '讯飞明细'] = details - - print(f"讯飞总分: {total_score}") - print(f"讯飞明细: {details}") - - except Exception as e: - print(f"第 {index + 1} 行讯飞API调用失败: {str(e)}") - continue - - # 删除临时音频文件 - try: - os.remove(audio_file_path) - except: - pass - - # 添加延时避免API调用过于频繁 - time.sleep(1) - - # 创建输出目录 - output_path = Path(output_dir) - output_path.mkdir(parents=True, exist_ok=True) - - # 生成输出文件路径 - input_path = Path(input_file_path) - output_file_path = output_path / f"{input_path.stem}_add_xunfei_result.xlsx" - - # 保存结果 - print(f"\n正在保存结果到: {output_file_path}") - df.to_excel(output_file_path, index=False) - print("处理完成!") - - return True - - except Exception as e: - print(f"处理Excel文件时出错: {str(e)}") - return False - - -if __name__ == "__main__": - # ==================== 配置参数 ==================== - input_file = "user_audio_data_20251210_152807_sample.xlsx" - output_directory = "output/audio" # 输出目录,可以修改 - use_concurrent = True # True: 使用并发版本,False: 使用串行版本 - - # DEBUG 模式开关(True: 显示详细调试信息,False: 仅显示关键信息) - enable_debug = False # 可以设置为 True 来查看详细的 DEBUG 日志 - - # 设置全局 DEBUG_MODE - globals()['DEBUG_MODE'] = enable_debug - - # 检查环境变量 - required_env_vars = ['XUNFEI_APPID', 'XUNFEI_API_SECRET', 'XUNFEI_API_KEY'] - missing_vars = [var for var in required_env_vars if not os.environ.get(var)] - - if missing_vars: - print(f"错误: 缺少必要的环境变量: {missing_vars}") - print("请在 .env 文件或系统环境变量中配置:") - print(" XUNFEI_APPID=你的应用ID") - print(" XUNFEI_API_SECRET=你的API密钥") - print(" XUNFEI_API_KEY=你的API Key") - elif not os.path.exists(input_file): - print(f"文件不存在: {input_file}") - print("请确保Excel文件存在并包含 'refText' 和 'userAudio' 列") - else: - if use_concurrent: - print("使用并发版本处理(5路并发,5 req/s)...") - success = process_excel_with_xunfei_concurrent( - input_file, - output_dir=output_directory, - max_workers=5, - rate_limit_per_second=5 - ) - else: - print("使用串行版本处理...") - success = process_excel_with_xunfei(input_file, output_dir=output_directory) - - if success: - print("处理成功!") - else: - print("处理失败!") diff --git a/makee_vala/git_scripts/export_component_record.py b/makee_vala/git_scripts/export_component_record.py deleted file mode 100644 index 6149a19..0000000 --- a/makee_vala/git_scripts/export_component_record.py +++ /dev/null @@ -1,492 +0,0 @@ -""" -互动组件数据导出 - -需求 20251123: ---------- -在 PGsql数据库中 筛选数据 -数据库相关配置 从.env中读取: -PG_DB_HOST = xxx -PG_DB_PORT = xxx -PG_DB_USER = xxx -PG_DB_PASSWORD = xxx -PG_DB_DATABASE = xxx - -读取以下数据表: -user_component_play_record_0 ~ user_component_play_record_7 - -支持输入时间范围 -起始时间 和 截止时间 配置格式: "20250110" - -数据表中的时间字段为 updated_at , 格式样例: "2025-11-05 19:35:46.698246+08:00" - -在这些时间范围内,筛选以下字段数据 导出为excel文件: - -c_type 与 c_id 非空 - -输出以下字段: -user_id, -session_id, -c_type, -c_id, -play_result, -user_behavior_info, -updated_at - -写一个简单清晰的 数据导出脚本, 输入参数都直接在脚本开头定义和修改。 不要改动文件开头的需求描述,直接追加代码。 -------- - -需求二: -读取上述 输出的 excel 文件, 围绕 每个组件进行 统计, - -统计方式如下: -仅计算 c_type 与 c_id 非空 的记录 - -以每个 c_type + c_id 拼接 后 作为统计维度, -统计以下数据: -总数量 -Perfect数量:play_result=="Perfect" 的数量 -Good数量:play_result=="Good" 的数量 -Pass数量:play_result=="Pass" 的数量 -Oops数量:play_result=="Oops" 的数量 -Failed数量:play_result=="Failed" 的数量 -Perfect+Good数量:play_result=="Perfect" 或 play_result=="Good" 的数量 -Perfect比例:Perfect数量 / 总数量 -Good比例:Good数量 / 总数量 -Pass比例:Pass数量 / 总数量 -Oops比例:Oops数量 / 总数量 -Failed比例:Failed数量 / 总数量 -Perfect+Good比例:Perfect+Good数量 / 总数量 - -导出为excel 命名: 步骤1文件 结尾追加 _stats.xlsx - -需求三: -在需求二中, 追加从另外两个mysql表关联的组件配置字段: -MYSQL_HOST=xxx -MYSQL_USERNAME=xxx -MYSQL_PASSWORD=xxx -MYSQL_DATABASE=xxx -MYSQL_PORT=xxx - -以上环境变量已配置在 .env 中。 - -1.如果 c_type 开头为"mid" - -则读取下表:表名:middle_interaction_component - -增加以下字段: -title -component_config -组件类型 - -其中: - “组件类型”: 根据以下映射 把 c_type 转成中文名:xx互动 -{ - "词汇类": { - "物品互动": "mid_vocab_item", - "图片互动": "mid_vocab_image", - "填词互动": "mid_vocab_fillBlank", - "指令互动": "mid_vocab_instruction" - }, - "句子类": { - "对话互动": "mid_sentence_dialogue", - "语音互动": "mid_sentence_voice", - "材料互动": "mid_sentence_material", - "造句互动": "mid_sentence_makeSentence" - }, - "语法类": { - "挖空互动": "mid_grammar_cloze", - "组句互动": "mid_grammar_sentence" - }, - "发音类": { - "发音互动": "mid_pron_pron" - -} - -2. 如果 c_type 开头为"core" -则读取下表:表名:core_interaction_component - -增加以下字段: -title -component_config -组件类型 - -其中: - “组件类型”: 根据以下映射 把 c_type 转成中文名:xx互动 -{ - "口语类": { - "口语快答": "core_speaking_reply", - "口语妙问": "core_speaking_inquiry", - "口语探讨": "core_speaking_explore" - "口语独白": "core_speaking_monologue" - }, - "阅读类": { - "合作阅读": "core_reading_order", - }, - "听力类": { - "合作听力": "core_listening_order", - }, - "写作类": { - "看图组句": "core_writing_imgMakeSentence", - "看图撰写": "core_writing_imgWrite", - "问题组句": "core_writing_questionMakeSentence", - "问题撰写": "core_writing_questionWrite", - }, -} - -以上追加字段 增加到 步骤二输出的表中 - - - -""" - -import os -from datetime import datetime -from dotenv import load_dotenv -import psycopg2 -import pandas as pd -import pymysql - -# ==================== 配置参数 ==================== -# 时间范围配置(格式: "20250110") -START_DATE = "20250915" # 起始日期 -END_DATE = "20251122" # 截止日期 - -# 输出文件路径 -OUTPUT_DIR = "output" - -# 执行步骤控制 -RUN_STEP1 = False # 是否执行步骤1:数据导出 -RUN_STEP2 = True # 是否执行步骤2:数据统计 -# ================================================== - -# c_type 到中文组件类型的映射 -C_TYPE_MAPPING = { - # middle_interaction_component 映射 - "mid_vocab_item": "物品互动", - "mid_vocab_image": "图片互动", - "mid_vocab_fillBlank": "填词互动", - "mid_vocab_instruction": "指令互动", - "mid_sentence_dialogue": "对话互动", - "mid_sentence_voice": "语音互动", - "mid_sentence_material": "材料互动", - "mid_sentence_makeSentence": "造句互动", - "mid_grammar_cloze": "挖空互动", - "mid_grammar_sentence": "组句互动", - "mid_pron_pron": "发音互动", - - # core_interaction_component 映射 - "core_speaking_reply": "口语快答", - "core_speaking_inquiry": "口语妙问", - "core_speaking_explore": "口语探讨", - "core_speaking_monologue": "口语独白", - "core_reading_order": "合作阅读", - "core_listening_order": "合作听力", - "core_writing_imgMakeSentence": "看图组句", - "core_writing_imgWrite": "看图撰写", - "core_writing_questionMakeSentence": "问题组句", - "core_writing_questionWrite": "问题撰写", -} - - -def step1_export_data(): - """步骤1:从数据库导出数据""" - print("=" * 60) - print("步骤1:数据导出") - print("=" * 60) - - # 加载环境变量 - load_dotenv() - - # 获取数据库配置 - db_config = { - 'host': os.getenv('PG_DB_HOST'), - 'port': os.getenv('PG_DB_PORT'), - 'user': os.getenv('PG_DB_USER'), - 'password': os.getenv('PG_DB_PASSWORD'), - 'database': os.getenv('PG_DB_DATABASE') - } - - # 转换时间格式 - start_datetime = datetime.strptime(START_DATE, "%Y%m%d").strftime("%Y-%m-%d 00:00:00") - end_datetime = datetime.strptime(END_DATE, "%Y%m%d").strftime("%Y-%m-%d 23:59:59") - - print(f"时间范围: {start_datetime} ~ {end_datetime}") - - # 连接数据库 - conn = psycopg2.connect(**db_config) - - # 存储所有表的数据 - all_data = [] - - # 遍历8个分表 - for i in range(8): - table_name = f"user_component_play_record_{i}" - print(f"正在读取表: {table_name}") - - # SQL查询 - query = f""" - SELECT - user_id, - session_id, - c_type, - c_id, - play_result, - user_behavior_info, - updated_at - FROM {table_name} - WHERE updated_at >= %s - AND updated_at <= %s - AND c_type IS NOT NULL - AND c_id IS NOT NULL - """ - - # 执行查询 - df = pd.read_sql_query(query, conn, params=(start_datetime, end_datetime)) - all_data.append(df) - print(f" - 读取到 {len(df)} 条记录") - - # 关闭数据库连接 - conn.close() - - # 合并所有数据 - result_df = pd.concat(all_data, ignore_index=True) - print(f"\n总共获取 {len(result_df)} 条记录") - - # 移除 updated_at 字段的时区信息(Excel不支持带时区的datetime) - if 'updated_at' in result_df.columns and not result_df.empty: - result_df['updated_at'] = result_df['updated_at'].dt.tz_localize(None) - - # 确保输出目录存在 - os.makedirs(OUTPUT_DIR, exist_ok=True) - - # 生成输出文件名 - output_filename = f"component_record_{START_DATE}_{END_DATE}.xlsx" - output_path = os.path.join(OUTPUT_DIR, output_filename) - - # 导出到Excel - result_df.to_excel(output_path, index=False, engine='openpyxl') - print(f"数据已导出到: {output_path}") - print() - - return output_path - - -def get_component_info_from_mysql(stats_df): - """从MySQL获取组件配置信息""" - # 加载环境变量 - load_dotenv() - - # 获取MySQL配置 - mysql_config = { - 'host': os.getenv('MYSQL_HOST'), - 'user': os.getenv('MYSQL_USERNAME'), - 'password': os.getenv('MYSQL_PASSWORD'), - 'database': os.getenv('MYSQL_DATABASE'), - 'port': int(os.getenv('MYSQL_PORT', 3306)), - 'charset': 'utf8mb4' - } - - print("正在连接MySQL数据库...") - conn = pymysql.connect(**mysql_config) - - try: - # 分别处理 mid 和 core 类型的组件 - mid_records = stats_df[stats_df['c_type'].str.startswith('mid', na=False)][['c_type', 'c_id']] - core_records = stats_df[stats_df['c_type'].str.startswith('core', na=False)][['c_type', 'c_id']] - - # 存储组件信息的字典,key 为 "c_type-c_id" - component_info = {} - - # 查询 middle_interaction_component 表 - if not mid_records.empty: - print(f"正在查询 middle_interaction_component 表,共 {len(mid_records)} 个组件...") - - # 获取唯一的 c_type 和 c_id 组合 - mid_unique = mid_records.drop_duplicates() - - for _, row in mid_unique.iterrows(): - c_type = row['c_type'] - c_id = row['c_id'] - - query = """ - SELECT title, component_config - FROM middle_interaction_component - WHERE c_type = %s AND c_id = %s - """ - result = pd.read_sql_query(query, conn, params=(c_type, c_id)) - - if not result.empty: - key = f"{c_type}-{c_id}" - component_info[key] = { - 'title': result['title'].iloc[0], - 'component_config': result['component_config'].iloc[0] - } - - print(f" - 查询到 {len([k for k in component_info.keys() if k.startswith('mid')])} 个组件信息") - - # 查询 core_interaction_component 表 - if not core_records.empty: - print(f"正在查询 core_interaction_component 表,共 {len(core_records)} 个组件...") - - # 获取唯一的 c_type 和 c_id 组合 - core_unique = core_records.drop_duplicates() - - for _, row in core_unique.iterrows(): - c_type = row['c_type'] - c_id = row['c_id'] - - query = """ - SELECT title, component_config - FROM core_interaction_component - WHERE c_type = %s AND c_id = %s - """ - result = pd.read_sql_query(query, conn, params=(c_type, c_id)) - - if not result.empty: - key = f"{c_type}-{c_id}" - component_info[key] = { - 'title': result['title'].iloc[0], - 'component_config': result['component_config'].iloc[0] - } - - print(f" - 查询到 {len([k for k in component_info.keys() if k.startswith('core')])} 个组件信息") - - finally: - conn.close() - - return component_info - - -def step2_statistics(input_file): - """步骤2:数据统计""" - print("=" * 60) - print("步骤2:数据统计") - print("=" * 60) - - # 读取步骤1导出的Excel文件,c_id作为字符串读取以保留前导零 - print(f"正在读取文件: {input_file}") - df = pd.read_excel(input_file, engine='openpyxl', dtype={'c_id': str}) - print(f"读取到 {len(df)} 条记录") - - # 筛选 c_type 和 c_id 非空的记录 - df_filtered = df[(df['c_type'].notna()) & (df['c_id'].notna())].copy() - print(f"筛选后 {len(df_filtered)} 条有效记录") - - # 确保c_type和c_id都是字符串类型(保留c_id的前导零) - df_filtered['c_type'] = df_filtered['c_type'].astype(str) - df_filtered['c_id'] = df_filtered['c_id'].astype(str) - - # 创建组件ID(c_type-c_id) - df_filtered['component_id'] = df_filtered['c_type'] + '-' + df_filtered['c_id'] - - # 按组件ID分组统计 - stats_list = [] - - for component_id, group in df_filtered.groupby('component_id'): - # 获取原始的 c_type 和 c_id - c_type = group['c_type'].iloc[0] - c_id = group['c_id'].iloc[0] - - # 总数量 - total_count = len(group) - - # 各状态数量 - perfect_count = len(group[group['play_result'] == 'Perfect']) - good_count = len(group[group['play_result'] == 'Good']) - pass_count = len(group[group['play_result'] == 'Pass']) - oops_count = len(group[group['play_result'] == 'Oops']) - failed_count = len(group[group['play_result'] == 'Failed']) - perfect_good_count = len(group[group['play_result'].isin(['Perfect', 'Good'])]) - - # 计算比例(保留两位小数) - perfect_ratio = round(perfect_count / total_count, 2) if total_count > 0 else 0 - good_ratio = round(good_count / total_count, 2) if total_count > 0 else 0 - pass_ratio = round(pass_count / total_count, 2) if total_count > 0 else 0 - oops_ratio = round(oops_count / total_count, 2) if total_count > 0 else 0 - failed_ratio = round(failed_count / total_count, 2) if total_count > 0 else 0 - perfect_good_ratio = round(perfect_good_count / total_count, 2) if total_count > 0 else 0 - - stats_list.append({ - 'component_id': component_id, - 'c_type': c_type, - 'c_id': c_id, - '总数量': total_count, - 'Perfect数量': perfect_count, - 'Good数量': good_count, - 'Pass数量': pass_count, - 'Oops数量': oops_count, - 'Failed数量': failed_count, - 'Perfect+Good数量': perfect_good_count, - 'Perfect比例': perfect_ratio, - 'Good比例': good_ratio, - 'Pass比例': pass_ratio, - 'Oops比例': oops_ratio, - 'Failed比例': failed_ratio, - 'Perfect+Good比例': perfect_good_ratio - }) - - # 创建统计结果DataFrame - stats_df = pd.DataFrame(stats_list) - - print(f"统计了 {len(stats_df)} 个不同的组件") - - # 从MySQL获取组件配置信息 - print("\n" + "=" * 60) - print("正在从MySQL获取组件配置信息...") - print("=" * 60) - component_info = get_component_info_from_mysql(stats_df) - - # 添加新字段:title, component_config, 组件类型 - # 使用 component_id (c_type-c_id) 作为 key 来匹配 - stats_df['title'] = stats_df['component_id'].apply(lambda x: component_info.get(x, {}).get('title', '')) - stats_df['component_config'] = stats_df['component_id'].apply(lambda x: component_info.get(x, {}).get('component_config', '')) - stats_df['组件类型'] = stats_df['c_type'].apply(lambda x: C_TYPE_MAPPING.get(x, '')) - - # 重新排列列顺序:将新增字段放在 c_type, c_id 后面 - columns_order = [ - 'component_id', 'c_type', 'c_id', - 'title', 'component_config', '组件类型', # 新增字段 - '总数量', - 'Perfect数量', 'Good数量', 'Pass数量', 'Oops数量', 'Failed数量', 'Perfect+Good数量', - 'Perfect比例', 'Good比例', 'Pass比例', 'Oops比例', 'Failed比例', 'Perfect+Good比例' - ] - stats_df = stats_df[columns_order] - - # 生成输出文件名(在原文件名后追加_stats) - output_filename = os.path.basename(input_file).replace('.xlsx', '_stats.xlsx') - output_path = os.path.join(OUTPUT_DIR, output_filename) - - # 导出到Excel - stats_df.to_excel(output_path, index=False, engine='openpyxl') - print(f"\n统计结果已导出到: {output_path}") - print() - - return output_path - - -def main(): - export_file = None - - # 执行步骤1:数据导出 - if RUN_STEP1: - export_file = step1_export_data() - - # 执行步骤2:数据统计 - if RUN_STEP2: - # 如果步骤1没有执行,需要手动指定文件路径 - if export_file is None: - export_file = os.path.join(OUTPUT_DIR, f"component_record_{START_DATE}_{END_DATE}.xlsx") - if not os.path.exists(export_file): - print(f"错误:找不到文件 {export_file}") - print("请先执行步骤1或确保文件存在") - return - - step2_statistics(export_file) - - print("=" * 60) - print("处理完成!") - print("=" * 60) - - -if __name__ == "__main__": - main() diff --git a/makee_vala/git_scripts/export_lesson_review.py b/makee_vala/git_scripts/export_lesson_review.py deleted file mode 100644 index 8808023..0000000 --- a/makee_vala/git_scripts/export_lesson_review.py +++ /dev/null @@ -1,572 +0,0 @@ -""" -** 不要改动我的需求描述,直接在需求后面写代码即可 ** - -课程巩固 数据导出 和 分析 - ------------ -需求一: -在 PGsql数据库中 筛选数据 -数据库相关配置 从.env中读取: -PG_DB_HOST = xxx -PG_DB_PORT = xxx -PG_DB_USER = xxx -PG_DB_PASSWORD = xxx -PG_DB_DATABASE = xxx - -读取以下数据表: user_unit_review_question_result - -支持输入时间范围 -起始时间 和 截止时间 配置格式: "20250110" - -数据表中的时间字段为 updated_at , 格式样例: "2025-11-05 19:35:46.698246+08:00" - -在这些时间范围内,筛选数据 (要求deleted_at字段内容为null) - -导出以下字段: - -user_id -unit_id (读取每条记录的story_id, 根据 get_id_2_unit_index 函数返回的映射表 映射到 unit_id) -lesson_id (读取chapter_id, 根据该值 查询 mysql表 vala_game_chapter 的 id == chapter_id, 并返回该记录的 index字段的值) -question_list -题目总数 -正确数量 -正确率 -play_time_seconds (读取 play_time 把ms数据转换为秒 保留整数部分) -updated_at - -其中 题目总数 正确数量 正确率 都通过 question_list 计算, -该字段为 list of json: -[ - { - "question": { - "type": "vocab_meaning_meaning", - "id": "20-0", - "title": "“clean” 的意思是什么?", - "npcId": -1 - }, - "answers": [ - "2" - ], - "optionList": [ - { - "option": "爬行" - }, - { - "option": "清晰的" - }, - { - "option": "清洁" - } - ], - "isRight": true - }, - ... -] - -每个元素为一道题目, 题目中有 "isRight": true 代表用户做对了。 - -导出为excel文件 ----- -需求二 基于 需求一的输出文件 作为 输入文件 进行数据聚合。 - -聚合的维度是每道题目 - -根据 question_list 中的 每个题目 取 question -> id 作为唯一标识 - -统计每个题目 -总记录数量 -正确数量 -正确率 - -并查询mysql表 补充题目的以下信息: -步骤一中,每个题目id的格式是 num1-num2 (question -> id) -查询vala_kp_question表 -其中num1部分 用于 检索vala_kp_question 中的 id, 每个id下 可能有多道题目 在 vala_kp_question的 question 字段 是一个list, num2为question 字段中的索引 - -补充以下字段: -kp_id (vala_kp_question字段) -category (vala_kp_question字段) -skill (vala_kp_question字段) -type (vala_kp_question字段) -题目配置 (question字段中 对应 num2 索引的内容) - -最终针对每道题目输出以下字段: -出现位置 (list, 把所有出现的位置拼接 unit_id +"_"+ lesson_id 例如:"unit10-lesson1" 这样的格式) -question_id (question -> id) -kp_id (vala_kp_question字段) -category (vala_kp_question字段) -skill (vala_kp_question字段) -type (vala_kp_question字段) -题目配置 (question字段中 对应 num2 索引的内容) -总记录数量 -正确数量 -正确率 - -导出为excel 命名为 步骤一文件_stat.xlsx - -所有需要配置的参数 放在脚本开头位置 - -""" - -import os -import pymysql -import psycopg2 -from psycopg2.extras import RealDictCursor -from datetime import datetime -import pandas as pd -from dotenv import load_dotenv -import json -from collections import defaultdict - -# 加载环境变量 -load_dotenv() - -# ============ 配置参数 ============ -START_DATE = "20250915" # 起始时间 -END_DATE = "20251122" # 截止时间 -OUTPUT_NAME = "lesson_review_data_{}_{}.xlsx".format(START_DATE, END_DATE) # 输出文件名 -OUTPUT_FILENAME = os.path.join("./output", OUTPUT_NAME) -# ================================= - -def get_mysql_connection(): - """获取MySQL连接""" - db_host = os.getenv('MYSQL_HOST') - db_user = os.getenv('MYSQL_USERNAME') - db_password = os.getenv('MYSQL_PASSWORD') - db_name = os.getenv('MYSQL_DATABASE') - db_port = os.getenv('MYSQL_PORT') - - if not all([db_host, db_user, db_password, db_name]): - raise Exception("Error: Missing MySQL configuration in .env file.") - - connection = pymysql.connect( - host=db_host, - user=db_user, - password=db_password, - database=db_name, - port=int(db_port) if db_port else 3306, - cursorclass=pymysql.cursors.DictCursor - ) - return connection - -def get_pgsql_connection(): - """获取PGsql连接""" - pg_host = os.getenv('PG_DB_HOST') - pg_port = os.getenv('PG_DB_PORT') - pg_user = os.getenv('PG_DB_USER') - pg_password = os.getenv('PG_DB_PASSWORD') - pg_database = os.getenv('PG_DB_DATABASE') - - if not all([pg_host, pg_port, pg_user, pg_password, pg_database]): - raise Exception("Error: Missing PGsql configuration in .env file.") - - connection = psycopg2.connect( - host=pg_host, - port=int(pg_port), - user=pg_user, - password=pg_password, - database=pg_database, - cursor_factory=RealDictCursor - ) - return connection - -def get_id_2_unit_index(): - """获取story_id到unit_id的映射""" - print("正在获取 story_id 到 unit_id 的映射...") - connection = get_mysql_connection() - - try: - with connection.cursor() as cursor: - sql = """ - SELECT * - FROM `vala_game_info` - WHERE id > 0 - AND `vala_game_info`.`deleted_at` IS NULL - ORDER BY season_package_id asc, `index` asc - """ - cursor.execute(sql) - results = cursor.fetchall() - - id_2_unit_index = {} - for index, row in enumerate(results): - id_2_unit_index[row['id']] = index - - print(f"成功获取 {len(id_2_unit_index)} 个单元映射") - return id_2_unit_index - finally: - connection.close() - -def get_chapter_id_to_lesson_id(): - """获取chapter_id到lesson_id的映射""" - print("正在获取 chapter_id 到 lesson_id 的映射...") - connection = get_mysql_connection() - - try: - with connection.cursor() as cursor: - sql = """ - SELECT id, `index` - FROM `vala_game_chapter` - WHERE deleted_at IS NULL - """ - cursor.execute(sql) - results = cursor.fetchall() - - chapter_id_to_lesson_id = {} - for row in results: - chapter_id_to_lesson_id[row['id']] = row['index'] - - print(f"成功获取 {len(chapter_id_to_lesson_id)} 个课程映射") - return chapter_id_to_lesson_id - finally: - connection.close() - -def analyze_question_list(question_list_json): - """分析题目列表,返回题目总数、正确数量、正确率""" - try: - if isinstance(question_list_json, str): - question_list = json.loads(question_list_json) - else: - question_list = question_list_json - - if not isinstance(question_list, list): - return 0, 0, 0 - - total = len(question_list) - correct = sum(1 for q in question_list if q.get('isRight') == True) - accuracy = round(correct / total * 100, 2) if total > 0 else 0 - - return total, correct, accuracy - except Exception as e: - print(f"解析题目列表出错: {e}") - return 0, 0, 0 - -def export_step1(): - """需求一:导出原始数据""" - print("=" * 50) - print("开始执行需求一:导出原始数据") - print("=" * 50) - - # 获取映射关系 - id_2_unit_index = get_id_2_unit_index() - chapter_id_to_lesson_id = get_chapter_id_to_lesson_id() - - # 连接PGsql - print("正在连接 PGsql 数据库...") - pg_conn = get_pgsql_connection() - - try: - with pg_conn.cursor() as cursor: - # 构建时间范围 - start_datetime = datetime.strptime(START_DATE, "%Y%m%d") - end_datetime = datetime.strptime(END_DATE, "%Y%m%d") - end_datetime = end_datetime.replace(hour=23, minute=59, second=59) - - sql = """ - SELECT user_id, story_id, chapter_id, question_list, play_time, updated_at - FROM user_unit_review_question_result - WHERE updated_at >= %s - AND updated_at <= %s - AND deleted_at IS NULL - ORDER BY updated_at - """ - - print(f"查询时间范围: {start_datetime} 至 {end_datetime}") - cursor.execute(sql, (start_datetime, end_datetime)) - results = cursor.fetchall() - - print(f"查询到 {len(results)} 条记录") - - # 处理数据 - export_data = [] - for row in results: - user_id = row['user_id'] - story_id = row['story_id'] - chapter_id = row['chapter_id'] - question_list_raw = row['question_list'] - play_time = row['play_time'] - updated_at = row['updated_at'] - - # 确保 question_list 是 Python 对象(PGsql 的 jsonb 会自动转换) - # 如果是字符串,先解析;如果已经是对象,直接使用 - if isinstance(question_list_raw, str): - try: - question_list = json.loads(question_list_raw) - except: - question_list = [] - else: - question_list = question_list_raw if question_list_raw else [] - - # 映射 unit_id - unit_id = id_2_unit_index.get(story_id, -1) - - # 映射 lesson_id - lesson_id = chapter_id_to_lesson_id.get(chapter_id, -1) - - # 分析题目列表 - total, correct, accuracy = analyze_question_list(question_list) - - # 转换播放时长(ms -> s) - play_time_seconds = int(play_time / 1000) if play_time else 0 - - # 转换question_list为字符串(统一序列化为JSON字符串) - question_list_str = json.dumps(question_list, ensure_ascii=False) if question_list else "" - - # 移除时区信息(Excel不支持带时区的datetime) - updated_at_no_tz = updated_at.replace(tzinfo=None) if updated_at else None - - export_data.append({ - 'user_id': user_id, - 'unit_id': unit_id, - 'lesson_id': lesson_id, - 'question_list': question_list_str, - '题目总数': total, - '正确数量': correct, - '正确率': accuracy, - 'play_time_seconds': play_time_seconds, - 'updated_at': updated_at_no_tz - }) - - # 导出到Excel - df = pd.DataFrame(export_data) - - # 确保输出目录存在 - os.makedirs(os.path.dirname(OUTPUT_FILENAME), exist_ok=True) - - df.to_excel(OUTPUT_FILENAME, index=False, engine='openpyxl') - print(f"成功导出 {len(export_data)} 条记录到: {OUTPUT_FILENAME}") - - return OUTPUT_FILENAME - - finally: - pg_conn.close() - -def get_all_kp_questions(question_ids): - """批量获取所有题目信息,避免N+1查询问题""" - print(f"正在批量查询 {len(question_ids)} 道题目的信息...") - - # 解析所有question_id,获取需要查询的kp_question id列表 - kp_ids = set() - for qid in question_ids: - try: - parts = qid.split('-') - if len(parts) == 2: - kp_ids.add(int(parts[0])) - except: - continue - - print(f"需要查询 {len(kp_ids)} 条 vala_kp_question 记录") - - # 批量查询MySQL - connection = get_mysql_connection() - kp_data_map = {} - - try: - with connection.cursor() as cursor: - # 使用IN查询批量获取 - if kp_ids: - placeholders = ','.join(['%s'] * len(kp_ids)) - sql = f""" - SELECT id, kp_id, category, skill, type, question - FROM vala_kp_question - WHERE id IN ({placeholders}) AND deleted_at IS NULL - """ - cursor.execute(sql, tuple(kp_ids)) - results = cursor.fetchall() - - print(f"成功查询到 {len(results)} 条记录") - - # 构建映射表 - for row in results: - kp_data_map[row['id']] = row - finally: - connection.close() - - # 为每个question_id构建结果 - question_info_map = {} - for question_id in question_ids: - try: - parts = question_id.split('-') - if len(parts) != 2: - question_info_map[question_id] = (None, None, None, None, None) - continue - - kp_id = int(parts[0]) - question_index = int(parts[1]) - - kp_data = kp_data_map.get(kp_id) - if not kp_data: - question_info_map[question_id] = (None, None, None, None, None) - continue - - # 解析question字段 - question_list = kp_data['question'] - if isinstance(question_list, str): - question_list = json.loads(question_list) - - # 获取指定索引的题目配置 - question_config = None - if isinstance(question_list, list) and 0 <= question_index < len(question_list): - question_config = json.dumps(question_list[question_index], ensure_ascii=False) - - question_info_map[question_id] = ( - kp_data['kp_id'], - kp_data['category'], - kp_data['skill'], - kp_data['type'], - question_config - ) - except Exception as e: - print(f"处理题目信息出错 ({question_id}): {e}") - question_info_map[question_id] = (None, None, None, None, None) - - return question_info_map - -def export_step2(input_filename): - """需求二:数据聚合统计""" - print("=" * 50) - print("开始执行需求二:数据聚合统计") - print("=" * 50) - - # 读取步骤一的输出文件 - print(f"正在读取文件: {input_filename}") - df = pd.read_excel(input_filename, engine='openpyxl') - - print(f"读取到 {len(df)} 条记录") - - # 按题目聚合统计 - question_stats = defaultdict(lambda: { - 'locations': set(), - 'total_count': 0, - 'correct_count': 0 - }) - - parse_success_count = 0 - parse_fail_count = 0 - empty_question_list_count = 0 - processed_question_count = 0 - - for idx, row in df.iterrows(): - unit_id = row['unit_id'] - lesson_id = row['lesson_id'] - question_list_str = row['question_list'] - - # 解析question_list - try: - if pd.isna(question_list_str) or not question_list_str: - question_list = [] - empty_question_list_count += 1 - else: - question_list = json.loads(question_list_str) - parse_success_count += 1 - except Exception as e: - question_list = [] - parse_fail_count += 1 - if parse_fail_count <= 3: - print(f"[警告] 第 {idx+1} 条记录解析失败: {e}") - - # 统计每道题目 - for question_item in question_list: - if not isinstance(question_item, dict): - continue - - question = question_item.get('question', {}) - question_id = question.get('id') - is_right = question_item.get('isRight', False) - - if not question_id: - continue - - # 添加出现位置 - location = f"unit{unit_id}-lesson{lesson_id}" - question_stats[question_id]['locations'].add(location) - - # 统计数量 - question_stats[question_id]['total_count'] += 1 - if is_right: - question_stats[question_id]['correct_count'] += 1 - - processed_question_count += 1 - - print(f"\n解析统计:") - print(f" - 解析成功: {parse_success_count} 条") - print(f" - 解析失败: {parse_fail_count} 条") - print(f" - question_list 为空: {empty_question_list_count} 条") - print(f" - 处理的题目总数: {processed_question_count} 道") - print(f" - 聚合得到不同题目: {len(question_stats)} 道") - - # 批量获取所有题目信息(优化性能) - all_question_ids = list(question_stats.keys()) - question_info_map = get_all_kp_questions(all_question_ids) - - # 构建导出数据 - print(f"\n正在构建导出数据...") - export_data = [] - for idx, (question_id, stats) in enumerate(question_stats.items()): - if (idx + 1) % 100 == 0: - print(f" 已处理 {idx + 1}/{len(question_stats)} 道题目") - - # 从批量查询结果中获取题目信息 - kp_id, category, skill, type_field, question_config = question_info_map.get( - question_id, (None, None, None, None, None) - ) - - # 计算正确率 - total = stats['total_count'] - correct = stats['correct_count'] - accuracy = round(correct / total * 100, 2) if total > 0 else 0 - - # 出现位置列表 - locations_list = sorted(list(stats['locations'])) - locations_str = ', '.join(locations_list) - - export_data.append({ - '出现位置': locations_str, - 'question_id': question_id, - 'kp_id': kp_id, - 'category': category, - 'skill': skill, - 'type': type_field, - '题目配置': question_config, - '总记录数量': total, - '正确数量': correct, - '正确率': accuracy - }) - - # 导出到Excel - output_stat_filename = input_filename.replace('.xlsx', '_stat.xlsx') - df_stat = pd.DataFrame(export_data) - - print(f"\n正在导出到 Excel...") - df_stat.to_excel(output_stat_filename, index=False, engine='openpyxl') - - print(f"成功导出 {len(export_data)} 道题目的统计数据到: {output_stat_filename}") - - return output_stat_filename - -def main(): - """主函数""" - try: - # 执行需求一 - step1_output = export_step1() - - print("\n") - - # 执行需求二 - step2_output = export_step2(step1_output) - - print("\n" + "=" * 50) - print("所有任务完成!") - print(f"需求一输出文件: {step1_output}") - print(f"需求二输出文件: {step2_output}") - print("=" * 50) - - except Exception as e: - print(f"执行出错: {e}") - import traceback - traceback.print_exc() - -if __name__ == "__main__": - main() - - - diff --git a/makee_vala/git_scripts/export_mid_config.py b/makee_vala/git_scripts/export_mid_config.py deleted file mode 100644 index c536621..0000000 --- a/makee_vala/git_scripts/export_mid_config.py +++ /dev/null @@ -1,181 +0,0 @@ -""" -MYSQL_HOST=xxx -MYSQL_USERNAME=xxx -MYSQL_PASSWORD=xxx -MYSQL_DATABASE=xxx -MYSQL_PORT=xxx - -以上环境变量已配置在 .env 中。 - -我要导出一个数据表的某些记录 并添加一些字段。 - -表名:middle_interaction_component - -根据 c_id 过滤数据: -c_id为 7位 字符串 其中 {两位季度编号}{两位单元编号}{三位组件编号} 过滤其中 单元编号部分为 00~20 以及 26 的对应记录 也就是 xx00xxx ~ xx20xxx 以及 xx26xxx 的记录 - -导出以下字段: -id -c_type -c_id -title -component_config -related_path -kp_relation_info -created_at -updated_at - -新增以下字段: -1. “组件类型”: 根据以下映射 把 c_type 转成中文名:xx互动 -{ - "词汇类": { - "物品互动": "mid_vocab_item", - "图片互动": "mid_vocab_image", - "填词互动": "mid_vocab_fillBlank", - "指令互动": "mid_vocab_instruction" - }, - "句子类": { - "对话互动": "mid_sentence_dialogue", - "语音互动": "mid_sentence_voice", - "材料互动": "mid_sentence_material", - "造句互动": "mid_sentence_makeSentence" - }, - "语法类": { - "挖空互动": "mid_grammar_cloze", - "组句互动": "mid_grammar_sentence" - }, - "发音类": { - "发音互动": "mid_pron_pron" - -} - -2. “是否关联了知识点”: 如果 kp_relation_info 不为空 且包含至少一个具体的知识点编号 则为 “是” 否则为 “否” -有效关联知识点的一个样例数据:[{"kpId":"0326011","kpType":"sentence","kpTitle":"What does... look like?","kpSkill":"sentence_meaning","kpSkillName":"语义"}] - -3. "是否已组课": 如果 related_path 不为空 则为 “是” 否则为 “否” -一个有效的 related_path 样例: {"packageId":13,"unitId":40,"lessonId":213,"packageIndex":3,"unitIndex":2,"lessonIndex":2} - -4. “前置对话”: -component_config 中的 preDialog 字段, 如果不存在 则为 “空” -{"asrPrompt":"","cId":"0326022","cType":"mid_sentence_dialogue","meaning":"语义;语音","mode":"read","postDialog":[{"content":"Leave it to me.","npcId":540,"npcName":"Victoria","type":"npc"}],"preDialog":[{"content":"But do we still have time?","npcId":30,"type":"user"}],"question":{"content":"What if we miss the spaceship?","mode":"read","npcId":30,"type":"user"},"resourceMapping":{"Medic":503},"title":"询问万一错过飞船怎么办"} - -5. "后置对话": -component_config 中的 postDialog 字段, 如果不存在 则为 “空” - -6. 前置/后置对话中非user角色数量 -component_config 中的 preDialog 以及 postDialog 字段中, 统计所有 type 为 npc ,根据 npcId 去重后的角色数量 -例如 ---- -前置对话: -[{"content":"But do we still have time?","npcId":30,"type":"user"}] -后置对话: -[{"content":"Leave it to me.","npcId":540,"npcName":"Victoria","type":"npc"}] -非user角色数量: 1 ---- - ---- -前置对话: -[{"content":"But do we still have time?","npcId":31,"type":"npc","npcName":"Ben"}] -后置对话: -[{"content":"Leave it to me.","npcId":540,"npcName":"Victoria","type":"npc"}] -非user角色数量: 2 ---- - -最终输出一个 excel文档。 - -""" - -import os -import json -from datetime import datetime -import pymysql -import pandas as pd -from dotenv import load_dotenv - -load_dotenv() - -# 组件类型映射 -TYPE_MAP = { - "mid_vocab_item": "物品互动", "mid_vocab_image": "图片互动", - "mid_vocab_fillBlank": "填词互动", "mid_vocab_instruction": "指令互动", - "mid_sentence_dialogue": "对话互动", "mid_sentence_voice": "语音互动", - "mid_sentence_material": "材料互动", "mid_sentence_makeSentence": "造句互动", - "mid_grammar_cloze": "挖空互动", "mid_grammar_sentence": "组句互动", - "mid_pron_pron": "发音互动" -} - -def get_data(): - conn = pymysql.connect( - host=os.getenv('MYSQL_HOST'), port=int(os.getenv('MYSQL_PORT', 3306)), - user=os.getenv('MYSQL_USERNAME'), password=os.getenv('MYSQL_PASSWORD'), - database=os.getenv('MYSQL_DATABASE'), charset='utf8mb4' - ) - - # 构建c_id过滤条件 - conditions = [f"c_id LIKE '__{i:02d}___'" for i in range(21)] + ["c_id LIKE '__26___'"] - where_clause = " OR ".join(conditions) - - sql = f"""SELECT id, c_type, c_id, title, component_config, related_path, - kp_relation_info, created_at, updated_at - FROM middle_interaction_component WHERE {where_clause}""" - - df = pd.read_sql(sql, conn) - conn.close() - return df - -def process_data(df): - # 组件类型 - df['组件类型'] = df['c_type'].map(TYPE_MAP).fillna(df['c_type']) - - # 是否关联知识点 - def check_kp(kp_info): - if not kp_info: return "否" - try: - data = json.loads(kp_info) - return "是" if isinstance(data, list) and any(item.get('kpId') for item in data) else "否" - except: return "否" - - df['是否关联了知识点'] = df['kp_relation_info'].apply(check_kp) - - # 是否已组课 - def check_lesson(path): - if not path: return "否" - try: return "是" if json.loads(path) else "否" - except: return "否" - - df['是否已组课'] = df['related_path'].apply(check_lesson) - - # 前置/后置对话及NPC统计 - def extract_dialog(config, dialog_type): - if not config: return "空" - try: - data = json.loads(config) - dialog = data.get(dialog_type, []) - return json.dumps(dialog, ensure_ascii=False) if dialog else "空" - except: return "空" - - def count_npc(config): - if not config: return 0 - try: - data = json.loads(config) - npc_ids = set() - for dialog in ['preDialog', 'postDialog']: - for item in data.get(dialog, []): - if item.get('type') == 'npc' and 'npcId' in item: - npc_ids.add(item['npcId']) - return len(npc_ids) - except: return 0 - - df['前置对话'] = df['component_config'].apply(lambda x: extract_dialog(x, 'preDialog')) - df['后置对话'] = df['component_config'].apply(lambda x: extract_dialog(x, 'postDialog')) - df['前置/后置对话中非user角色数量'] = df['component_config'].apply(count_npc) - - return df - -if __name__ == "__main__": - df = get_data() - df = process_data(df) - - filename = f"middle_interaction_component_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx" - df.to_excel(filename, index=False) - print(f"导出完成: {filename}") diff --git a/makee_vala/git_scripts/export_realtime_asr.py b/makee_vala/git_scripts/export_realtime_asr.py deleted file mode 100644 index e042530..0000000 --- a/makee_vala/git_scripts/export_realtime_asr.py +++ /dev/null @@ -1,385 +0,0 @@ -""" -导出 流式语音音频 脚本 - -v1.0 ---- -原始数据存储于ES数据库中 -索引: llm_realtime_asr_log - -es相关配置通过以下环境变量 -ES_HOST=xxx -ES_PORT=9200 -ES_SCHEME=https -ES_USER=elastic -ES_PASSWORD=xxx (注意这里可能有特殊符号) - -需要配置的内容放置在脚本最开头 -开始时间 (8位数字年月日) -截止时间 (8位数字年月日) - -仅筛选 时间范围内的数据记录 -可以基于 timestamp_int 字段内容进行时间筛选 格式样例:1,769,496,892 - -正常情况 每个 voice_id 会对应两条记录 -可以 以 voice_id为单位 -最终 按照每个 voice_id 聚合出以下数据: - -asr_prompt (其中一条记录会有这个内容) -result_str (其中一条记录会有这个内容) -timestamp (两条记录都会有,保留最新的一条对应的时间) 格式样例: 2023-12-12 12:12:12 -voice_id -audio_url 按以下规则拼接: https://static.valavala.com/vala_llm/realtime_asr_audio_backup/online/{8位年月日}/{voice_id}.wav 8位年月日 基于 timestamp计算 格式 20260121这种 -source (其中一条记录会有这个内容) - -最终导出一个excel。 ---- - -""" - -import os -from datetime import datetime -import requests -import pandas as pd -from dotenv import load_dotenv -from collections import defaultdict -import urllib3 - -# ==================== 配置区域 ==================== -START_DATE = "20251201" # 开始日期 (8位数字年月日) -END_DATE = "20260131" # 结束日期 (8位数字年月日) -# ================================================= - -# 加载环境变量 -load_dotenv() - -# ES配置 -ES_HOST = os.getenv("ES_HOST") -ES_PORT = int(os.getenv("ES_PORT", "9200")) -ES_SCHEME = os.getenv("ES_SCHEME", "https") -ES_USER = os.getenv("ES_USER", "elastic") -ES_PASSWORD = os.getenv("ES_PASSWORD") -ES_INDEX = "llm_realtime_asr_log" - -# 每批处理的数据量 -SCROLL_SIZE = 1000 -SCROLL_TIMEOUT = "5m" - - -def timestamp_int_from_date(date_str): - """将8位日期字符串转换为timestamp_int(秒级时间戳)""" - dt = datetime.strptime(date_str, "%Y%m%d") - return int(dt.timestamp()) - - -def format_timestamp(ts): - """将时间戳转换为格式化字符串""" - if isinstance(ts, (int, float)): - return datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S") - return ts - - -def generate_audio_url(voice_id, timestamp): - """生成audio_url""" - date_str = datetime.fromtimestamp(timestamp).strftime("%Y%m%d") - return f"https://static.valavala.com/vala_llm/realtime_asr_audio_backup/online/{date_str}/{voice_id}.wav" - - -def connect_es(): - """测试ES连接""" - print("正在测试 Elasticsearch 连接...") - - # 禁用SSL警告 - if ES_SCHEME == "https": - try: - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - except Exception: - pass - - base_url = f"{ES_SCHEME}://{ES_HOST}:{ES_PORT}" - auth = (ES_USER, ES_PASSWORD) if ES_USER and ES_PASSWORD else None - - try: - # 测试连接 - resp = requests.get( - base_url, - auth=auth, - timeout=10, - verify=False if ES_SCHEME == "https" else True - ) - resp.raise_for_status() - - print(f"✓ 成功连接到 Elasticsearch: {ES_HOST}:{ES_PORT}") - return True - except Exception as e: - print(f"✗ 连接失败: {e}") - return False - - -def query_data(start_date, end_date): - """查询ES数据""" - start_ts = timestamp_int_from_date(start_date) - end_ts = timestamp_int_from_date(end_date) + 86400 # 结束日期加一天,包含当天数据 - - print(f"\n开始查询数据...") - print(f"时间范围: {start_date} 至 {end_date}") - print(f"时间戳范围: {start_ts} 至 {end_ts}") - - # 禁用SSL警告 - if ES_SCHEME == "https": - try: - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - except Exception: - pass - - base_url = f"{ES_SCHEME}://{ES_HOST}:{ES_PORT}" - search_url = f"{base_url}/{ES_INDEX}/_search" - headers = {"Content-Type": "application/json"} - auth = (ES_USER, ES_PASSWORD) if ES_USER and ES_PASSWORD else None - - query = { - "query": { - "range": { - "timestamp_int": { - "gte": start_ts, - "lt": end_ts - } - } - }, - "sort": [{"timestamp_int": {"order": "asc"}}], - "size": SCROLL_SIZE - } - - try: - # 初始查询(使用scroll) - params = {"scroll": SCROLL_TIMEOUT} - response = requests.post( - search_url, - headers=headers, - json=query, - auth=auth, - params=params, - timeout=30, - verify=False if ES_SCHEME == "https" else True - ) - response.raise_for_status() - data = response.json() - - scroll_id = data.get("_scroll_id") - total_hits = data["hits"]["total"]["value"] - - print(f"✓ 查询完成,共找到 {total_hits} 条记录") - - return data, scroll_id, total_hits - - except Exception as e: - raise RuntimeError(f"ES查询失败: {e}") - - -def aggregate_by_voice_id(response, scroll_id, total_hits): - """按voice_id聚合数据""" - voice_data = defaultdict(list) - processed_count = 0 - - print("\n开始处理数据...") - - # 禁用SSL警告 - if ES_SCHEME == "https": - try: - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - except Exception: - pass - - base_url = f"{ES_SCHEME}://{ES_HOST}:{ES_PORT}" - scroll_url = f"{base_url}/_search/scroll" - headers = {"Content-Type": "application/json"} - auth = (ES_USER, ES_PASSWORD) if ES_USER and ES_PASSWORD else None - - while True: - hits = response["hits"]["hits"] - - if not hits: - break - - for hit in hits: - source = hit["_source"] - voice_id = source.get("voice_id") - - if voice_id: - voice_data[voice_id].append(source) - - processed_count += 1 - - # 打印进度 - progress = (processed_count / total_hits) * 100 - print(f"\r处理进度: {processed_count}/{total_hits} ({progress:.1f}%)", end="") - - # 获取下一批数据 - try: - scroll_response = requests.post( - scroll_url, - headers=headers, - json={ - "scroll": SCROLL_TIMEOUT, - "scroll_id": scroll_id - }, - auth=auth, - timeout=30, - verify=False if ES_SCHEME == "https" else True - ) - scroll_response.raise_for_status() - response = scroll_response.json() - - # 更新 scroll_id(可能会变化) - scroll_id = response.get("_scroll_id", scroll_id) - - except Exception as e: - print(f"\n✗ 获取下一批数据失败: {e}") - break - - print(f"\n✓ 数据处理完成,共处理 {processed_count} 条记录") - print(f"✓ 找到 {len(voice_data)} 个唯一的 voice_id") - - # 清理scroll - try: - clear_scroll_url = f"{base_url}/_search/scroll" - requests.delete( - clear_scroll_url, - headers=headers, - json={"scroll_id": [scroll_id]}, - auth=auth, - timeout=10, - verify=False if ES_SCHEME == "https" else True - ) - except Exception: - pass # 清理失败不影响结果 - - return voice_data - - -def merge_voice_records(voice_data): - """合并voice_id的记录,只保留恰好2条记录的""" - print("\n开始聚合 voice_id 数据...") - - merged_data = [] - valid_count = 0 - invalid_count = 0 - - for voice_id, records in voice_data.items(): - # 只处理恰好有2条记录的voice_id - if len(records) != 2: - invalid_count += 1 - continue - - valid_count += 1 - - # 初始化合并后的数据 - merged_record = { - "voice_id": voice_id, - "asr_prompt": None, - "result_str": None, - "timestamp": None, - "source": None, - "audio_url": None - } - - # 找出最新的timestamp - max_timestamp = max( - records[0].get("timestamp_int", 0), - records[1].get("timestamp_int", 0) - ) - - # 合并数据 - for record in records: - if record.get("asr_prompt"): - merged_record["asr_prompt"] = record["asr_prompt"] - if record.get("result_str"): - merged_record["result_str"] = record["result_str"] - if record.get("source"): - merged_record["source"] = record["source"] - - # 设置timestamp和audio_url - merged_record["timestamp"] = format_timestamp(max_timestamp) - merged_record["audio_url"] = generate_audio_url(voice_id, max_timestamp) - - merged_data.append(merged_record) - - print(f"✓ 聚合完成") - print(f" - 有效记录(2条/voice_id): {valid_count}") - print(f" - 无效记录(非2条/voice_id): {invalid_count}") - - return merged_data - - -def export_to_excel(data, start_date, end_date): - """导出到Excel""" - if not data: - print("\n警告: 没有数据可导出") - return - - print(f"\n开始导出数据到 Excel...") - - # 创建DataFrame - df = pd.DataFrame(data) - - # 调整列顺序 - columns = ["voice_id", "asr_prompt", "result_str", "timestamp", "audio_url", "source"] - df = df[columns] - - # 生成文件名 - output_dir = "output" - os.makedirs(output_dir, exist_ok=True) - filename = f"realtime_asr_export_{start_date}_{end_date}.xlsx" - filepath = os.path.join(output_dir, filename) - - # 导出Excel - df.to_excel(filepath, index=False, engine="openpyxl") - - print(f"✓ 数据已导出到: {filepath}") - print(f"✓ 共导出 {len(df)} 条记录") - - -def main(): - """主函数""" - print("=" * 60) - print("流式语音 ASR 数据导出工具 v1.0") - print("=" * 60) - - start_time = datetime.now() - - try: - # 测试ES连接 - if not connect_es(): - raise Exception("无法连接到 Elasticsearch,请检查配置") - - # 查询数据 - response, scroll_id, total_hits = query_data(START_DATE, END_DATE) - - if total_hits == 0: - print("\n没有找到符合条件的数据") - return - - # 聚合数据 - voice_data = aggregate_by_voice_id(response, scroll_id, total_hits) - - # 合并记录 - merged_data = merge_voice_records(voice_data) - - # 导出Excel - export_to_excel(merged_data, START_DATE, END_DATE) - - # 统计耗时 - end_time = datetime.now() - duration = (end_time - start_time).total_seconds() - - print(f"\n{'=' * 60}") - print(f"✓ 任务完成! 总耗时: {duration:.2f} 秒") - print(f"{'=' * 60}") - - except Exception as e: - print(f"\n✗ 错误: {str(e)}") - import traceback - traceback.print_exc() - - -if __name__ == "__main__": - main() diff --git a/makee_vala/git_scripts/export_resource_name.py b/makee_vala/git_scripts/export_resource_name.py deleted file mode 100644 index 36506d6..0000000 --- a/makee_vala/git_scripts/export_resource_name.py +++ /dev/null @@ -1,121 +0,0 @@ -""" -MYSQL_HOST=xxx -MYSQL_USERNAME=xxx -MYSQL_PASSWORD=xxx -MYSQL_DATABASE=xxx -MYSQL_PORT=xxx - -以上环境变量已配置在 .env 中。 - -我要导出一个数据表的某些记录 并添加一些字段。 - -表名:vala_resource_base - -过滤全部 type == "角色" 的记录 - -导出以下字段: -id -cn_name -en_name - - -最终输出到 excel文档。 "角色资源导出_251031.xlsx" - -""" - -import os -import pandas as pd -import pymysql -from dotenv import load_dotenv -from datetime import datetime - -def load_config(): - """加载环境变量配置""" - load_dotenv() - - config = { - 'host': os.getenv('MYSQL_HOST'), - 'user': os.getenv('MYSQL_USERNAME'), - 'password': os.getenv('MYSQL_PASSWORD'), - 'database': os.getenv('MYSQL_DATABASE'), - 'port': int(os.getenv('MYSQL_PORT', 3306)), - 'charset': 'utf8mb4' - } - - # 验证配置 - for key, value in config.items(): - if value is None and key != 'charset': - raise ValueError(f"环境变量 {key} 未配置") - - return config - -def connect_mysql(config): - """连接MySQL数据库""" - try: - connection = pymysql.connect(**config) - print("MySQL数据库连接成功") - return connection - except Exception as e: - print(f"MySQL数据库连接失败: {e}") - raise - -def export_role_resources(): - """导出角色资源数据""" - try: - # 加载配置 - config = load_config() - - # 连接数据库 - connection = connect_mysql(config) - - # SQL查询语句 - sql = """ - SELECT - id, - cn_name, - en_name - FROM vala_resource_base - WHERE type = '角色' - ORDER BY id - """ - - print("开始查询数据...") - - # 执行查询并获取数据 - df = pd.read_sql(sql, connection) - - print(f"查询到 {len(df)} 条记录") - - # 关闭数据库连接 - connection.close() - - # 导出到Excel文件 - output_filename = "角色资源导出_251031.xlsx" - df.to_excel(output_filename, index=False, engine='openpyxl') - - print(f"数据已成功导出到: {output_filename}") - print(f"导出字段: {list(df.columns)}") - print(f"导出记录数: {len(df)}") - - # 显示前几行数据预览 - if len(df) > 0: - print("\n数据预览:") - print(df.head()) - - return output_filename - - except Exception as e: - print(f"导出过程中发生错误: {e}") - raise - -if __name__ == "__main__": - try: - print("开始导出角色资源数据...") - print(f"执行时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") - - output_file = export_role_resources() - - print(f"\n✅ 导出完成! 文件保存为: {output_file}") - - except Exception as e: - print(f"\n❌ 导出失败: {e}") diff --git a/makee_vala/git_scripts/export_unit_challenge_data.py b/makee_vala/git_scripts/export_unit_challenge_data.py deleted file mode 100644 index 9bfedd4..0000000 --- a/makee_vala/git_scripts/export_unit_challenge_data.py +++ /dev/null @@ -1,343 +0,0 @@ -""" -** 不要改动我的需求描述,直接在需求后面写代码即可 ** - -需求一: -先写一个最简单脚本 实现下面sql功能 - -SELECT * FROM `vala_game_info` WHERE id > 0 AND `vala_game_info`.`deleted_at` IS NULL ORDER BY season_package_id asc,`index` asc - -环境变量读取: -MYSQL_HOST=xxx -MYSQL_USERNAME=xxx -MYSQL_PASSWORD=xxx -MYSQL_DATABASE=xxx -MYSQL_PORT=xxx ------------ -需求二: -在 PGsql数据库中 筛选数据 -数据库相关配置 从.env中读取: -PG_DB_HOST = xxx -PG_DB_PORT = xxx -PG_DB_USER = xxx -PG_DB_PASSWORD = xxx -PG_DB_DATABASE = xxx - -读取以下数据表:user_unit_challenge_question_result - -支持输入时间范围 -起始时间 和 截止时间 配置格式: "20250110" - -数据表中的时间字段为 updated_at , 格式样例: "2025-11-05 19:35:46.698246+08:00" - -在这些时间范围内,筛选数据 (要求deleted_at字段内容为null) - -导出以下字段: - -user_id -unit_id (读取每条记录的story_id, 根据 get_id_2_unit_index 函数返回的映射表 映射到 unit_id) -score_text -question_list -updated_at -category -play_time_seconds (读取 play_time 把ms数据转换为秒 保留整数部分) - -导出为excel文件 - -配置参数直接在脚本开头给出即可 - -需求三: -需求二中 作为步骤一 -本需求为步骤二 基于 步骤一的 文档 -进行数据聚合 - -根据每个unit_id + category 进行分组 - -统计每个分组下的以下数值: -总记录数量 -Perfect数量 (读取 score_text =="Perfect") -Good数量 (读取 score_text =="Good") -Oops数量 (读取 score_text =="Oops") -Perfect率 (Perfect数量 / 总记录数量) -Good率 (Good数量 / 总记录数量) -Oops率 (Oops数量 / 总记录数量) - -导出为excel 命名为 步骤一名字_stats.xlsx - -""" - -import os -import pymysql -import psycopg2 -from psycopg2.extras import RealDictCursor -from datetime import datetime -import pandas as pd -from dotenv import load_dotenv - -# 加载环境变量 -load_dotenv() - -# ============ 配置参数 ============ -START_DATE = "20250915" # 起始时间 -END_DATE = "20251128" # 截止时间 -OUTPUT_NAME = "unit_challenge_data_{}_{}.xlsx".format(START_DATE, END_DATE) # 输出文件名 -OUTPUT_FILENAME = os.path.join("./output", OUTPUT_NAME) -# ================================= - -def get_id_2_unit_index(): - # 读取数据库配置 - db_host = os.getenv('MYSQL_HOST') - db_user = os.getenv('MYSQL_USERNAME') - db_password = os.getenv('MYSQL_PASSWORD') - db_name = os.getenv('MYSQL_DATABASE') - db_port = os.getenv('MYSQL_PORT') - - # 简单的参数检查 - if not all([db_host, db_user, db_password, db_name]): - print("Error: Missing database configuration in .env file.") - print("Ensure MYSQL_HOST, MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_DATABASE are set.") - return - - try: - # 连接数据库 - connection = pymysql.connect( - host=db_host, - user=db_user, - password=db_password, - database=db_name, - port=int(db_port) if db_port else 3306, - cursorclass=pymysql.cursors.DictCursor - ) - - print(f"Connected to database: {db_host}") - - try: - with connection.cursor() as cursor: - # 定义 SQL 语句 - sql = """ - SELECT * - FROM `vala_game_info` - WHERE id > 0 - AND `vala_game_info`.`deleted_at` IS NULL - ORDER BY season_package_id asc, `index` asc - """ - - print(f"Executing SQL: {sql}") - - # 执行查询 - cursor.execute(sql) - - # 获取所有结果 - results = cursor.fetchall() - - print(f"Total records found: {len(results)}") - print("-" * 30) - - # 打印结果 - print(results) - id_2_unit_index = {} - for index, row in enumerate(results): - id_2_unit_index[row['id']] = index - - print("映射结果:") - print(id_2_unit_index) - - - - print("-" * 30) - print("Done.") - return id_2_unit_index - - finally: - connection.close() - - except Exception as e: - print(f"An error occurred: {e}") - - -def export_unit_challenge_data(start_date, end_date, output_filename): - """ - 从PostgreSQL数据库导出单元挑战数据 - """ - # 读取PostgreSQL数据库配置 - pg_host = os.getenv('PG_DB_HOST') - pg_port = os.getenv('PG_DB_PORT') - pg_user = os.getenv('PG_DB_USER') - pg_password = os.getenv('PG_DB_PASSWORD') - pg_database = os.getenv('PG_DB_DATABASE') - - # 检查配置 - if not all([pg_host, pg_port, pg_user, pg_password, pg_database]): - print("Error: Missing PostgreSQL database configuration in .env file.") - print("Ensure PG_DB_HOST, PG_DB_PORT, PG_DB_USER, PG_DB_PASSWORD, PG_DB_DATABASE are set.") - return - - # 获取 id 到 unit_index 的映射 - print("正在获取 unit_id 映射表...") - id_2_unit_index = get_id_2_unit_index() - if not id_2_unit_index: - print("Error: Failed to get id_2_unit_index mapping.") - return - - # 转换时间格式: "20250110" -> "2025-01-10 00:00:00" - start_datetime = datetime.strptime(start_date, "%Y%m%d").strftime("%Y-%m-%d 00:00:00") - end_datetime = datetime.strptime(end_date, "%Y%m%d").strftime("%Y-%m-%d 00:00:00") - - print(f"时间范围: {start_datetime} 至 {end_datetime}") - - try: - # 连接PostgreSQL数据库 - connection = psycopg2.connect( - host=pg_host, - port=int(pg_port), - user=pg_user, - password=pg_password, - database=pg_database, - cursor_factory=RealDictCursor - ) - - print(f"已连接到 PostgreSQL 数据库: {pg_host}") - - try: - with connection.cursor() as cursor: - # 定义SQL查询 - sql = """ - SELECT - user_id, - story_id, - score_text, - question_list, - updated_at, - category, - play_time - FROM user_unit_challenge_question_result - WHERE deleted_at IS NULL - AND updated_at >= %s - AND updated_at < %s - ORDER BY updated_at ASC - """ - - print(f"执行查询...") - - # 执行查询 - cursor.execute(sql, (start_datetime, end_datetime)) - - # 获取所有结果 - results = cursor.fetchall() - - print(f"查询到 {len(results)} 条记录") - - # 处理数据 - export_data = [] - for row in results: - # 映射 story_id 到 unit_id - story_id = row['story_id'] - unit_id = id_2_unit_index.get(story_id, None) - - # 转换 play_time (毫秒) 为秒 (整数) - play_time_seconds = row['play_time'] // 1000 if row['play_time'] else 0 - - # 移除 updated_at 的时区信息(Excel 不支持带时区的 datetime) - updated_at = row['updated_at'] - if updated_at and hasattr(updated_at, 'replace'): - updated_at = updated_at.replace(tzinfo=None) - - export_data.append({ - 'user_id': row['user_id'], - 'unit_id': unit_id, - 'score_text': row['score_text'], - 'question_list': row['question_list'], - 'updated_at': updated_at, - 'category': row['category'], - 'play_time_seconds': play_time_seconds - }) - - # 导出到Excel - if export_data: - df = pd.DataFrame(export_data) - df.to_excel(output_filename, index=False, engine='openpyxl') - print(f"数据已导出到: {output_filename}") - print(f"共导出 {len(export_data)} 条记录") - else: - print("没有数据可导出") - - finally: - connection.close() - print("数据库连接已关闭") - - except Exception as e: - print(f"发生错误: {e}") - - -def aggregate_stats(input_filename): - """ - 基于步骤一的Excel文件进行数据聚合 - 按 unit_id + category 分组,统计各项指标 - """ - try: - # 读取步骤一导出的Excel文件 - print(f"正在读取文件: {input_filename}") - df = pd.read_excel(input_filename, engine='openpyxl') - - print(f"读取到 {len(df)} 条记录") - - # 按 unit_id + category 分组统计 - grouped = df.groupby(['unit_id', 'category'], dropna=False) - - stats_data = [] - for (unit_id, category), group in grouped: - total_count = len(group) - perfect_count = (group['score_text'] == 'Perfect').sum() - good_count = (group['score_text'] == 'Good').sum() - oops_count = (group['score_text'] == 'Oops').sum() - - # 计算占比 - perfect_rate = round(perfect_count / total_count if total_count > 0 else 0, 2) - good_rate = round(good_count / total_count if total_count > 0 else 0, 2) - oops_rate = round(oops_count / total_count if total_count > 0 else 0, 2) - - stats_data.append({ - 'unit_id': unit_id, - 'category': category, - '总记录数量': total_count, - 'Perfect数量': perfect_count, - 'Good数量': good_count, - 'Oops数量': oops_count, - 'Perfect率': perfect_rate, - 'Good率': good_rate, - 'Oops率': oops_rate - }) - - # 生成输出文件名 - base_name = os.path.splitext(input_filename)[0] - output_filename = f"{base_name}_stats.xlsx" - - # 导出统计结果 - if stats_data: - stats_df = pd.DataFrame(stats_data) - stats_df.to_excel(output_filename, index=False, engine='openpyxl') - print(f"统计数据已导出到: {output_filename}") - print(f"共 {len(stats_data)} 个分组") - else: - print("没有数据可统计") - - except Exception as e: - print(f"数据聚合时发生错误: {e}") - - -if __name__ == "__main__": - # 步骤一:执行导出 - print("=" * 50) - print("步骤一:导出原始数据") - print("=" * 50) - export_unit_challenge_data(START_DATE, END_DATE, OUTPUT_FILENAME) - - # 步骤二:数据聚合 - print("\n" + "=" * 50) - print("步骤二:数据聚合统计") - print("=" * 50) - aggregate_stats(OUTPUT_FILENAME) - - print("\n" + "=" * 50) - print("全部完成!") - print("=" * 50) - diff --git a/makee_vala/git_scripts/export_user_id_data.py b/makee_vala/git_scripts/export_user_id_data.py deleted file mode 100644 index ba0ddcc..0000000 --- a/makee_vala/git_scripts/export_user_id_data.py +++ /dev/null @@ -1,1846 +0,0 @@ -""" -初版需求v1.0: 2025.11.18 - -导出 一个userId的多表数据, 最终按照不同sheet,输出到一个 excel文件中。 - -1. 第一个sheet:"全部音频数据" -es相关配置通过以下环境变量 -ES_HOST=xxx -ES_PORT=9200 -ES_SCHEME=https -ES_USER=elastic -ES_PASSWORD=xxx - -index: user-audio - -脚本思路: -过滤字段: -userId == xxxx - -输出该userId的全部记录 按时间倒序排序 -包含以下字段内容: - -userId -userMsg -userName -soeData -audioUrl -asrStatus -componentId -componentType -dataVersion - -2. 第二个sheet:"互动组件学习记录" -在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 -数据库相关配置 从.env中读取: -PG_DB_HOST = xxx -PG_DB_PORT = xxx -PG_DB_USER = xxx -PG_DB_PASSWORD = xxx -PG_DB_DATABASE = xxx - -读取以下数据表: -user_component_play_record_0 ~ user_component_play_record_7 - -输出以下字段: -user_id, -component_unique_code, -session_id, -c_type, -c_id, -play_result, -user_behavior_info, -updated_at - -3.第三个sheet:"课程巩固记录" -在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 - -数据表:user_unit_review_question_result - -输出以下字段: -user_id -story_id -chapter_id -question_list -updated_at - -4.第四个sheet:"单元挑战记录" -在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 - -数据表:user_unit_challenge_question_result - -输出以下字段: -user_id -story_id -category -score_text, -question_list -updated_at ------------- - -需求补充v1.1: -"全部音频数据"这个sheet -输出字段 添加timeStr 并按时间倒序排列 最新的记录 在最上面 - ------------- -需求补充v1.2: -"全部音频数据"这个sheet -如果userMsg字段内容 包含 ”makee_id“ 要进行以下处理: - -从userMsg字段中提取出具体的makee_id: -此时的字段样例: -``` -asr msg信息为:{ - "time_ms": 358, - "time_ms_api": 357, - "hot_words_str": "{\n \"context_type\": \"dialog_ctx\",\n \"context_data\": [\n {\n \"text\": \"planet Walla\"\n },\n {\n \"text\": \"Walla\"\n }\n ]\n}", - "makee_id": "d208c617-902f-4f81-8255-b5fb73599546", - "volcano_fast_x_tt_logid": "202511151541355DF72BE5EBFE73795BFD", - "api_name": "volcano-fast" -} -``` -然后基于makee_id 去另一个表里查记录: index:llm_asr_log -将查询到的记录的 result_text 字段内容 回填到 userMsg。 -将source字段内容 输出 到 source。 - -如果userMsg字段内容 不包含 ”makee_id“ 保持之前的逻辑。 - --------------- -需求补充 v1.3 -当前输入 只支持配置单个 userId (业务侧名称为角色id) - - -期望扩展为以下逻辑: -1. 改为配置 角色id list , 分别 导出 多份excel文件。命名格式为 角色id_{}_导出时间_{}.xlsx -2. 改为配置 账户id list , 分别 导出 多份excel文件。命名格式为 账户id_{}_角色id_{}_导出时间_{}.xlsx - -关于 账户 id 到角色id 的映射逻辑, -首先 读取 mysql 表 vala_app_character -筛选 account_id字段值 == 账户id 的 记录, 其中 该记录 的 id值,则为角色id 一个 账户id 可以对应多个角色id - -本次需求只针对输入侧调整, 数据抽取聚合逻辑部分和之前保持一致 - ---------------- -需求补充 v1.4 - -增加一个sheet "单元总结记录", -导出对应角色id的单元总结记录。 参考 export_unit_summary.py 中的原始数据提取方案即可(不必关注其中的数据统计部分)。 - -其他已有逻辑保持不动哦。 - ----------------- -需求补充 v1.5 - -1."互动组件学习记录"sheet 增加以下字段 -"互动组件名称"、"组件标题"、"组件配置摘要"、"知识点": -字段取值规则: -根据 c_type 及组件配置(从mysql表获取) 进行映射和处理: -``` -1).如果 c_type 开头为"mid" - -则读取下表:表名:middle_interaction_component - -获取以下字段值: -title (作为组件标题) -component_config (完整的组件配置) 获取其中 的 question 字段值 作为 组件配置摘要; -kp_relation_info 字段值 作为 知识点 - -"互动组件名称"规则: - -"物品互动": "mid_vocab_item", -"图片互动": "mid_vocab_image", -"填词互动": "mid_vocab_fillBlank", -"指令互动": "mid_vocab_instruction" -"对话互动-表达": "mid_sentence_dialogue", 且 component_config->question->mode == "express" -"对话互动-朗读": "mid_sentence_dialogue", 且 component_config->question->mode == "read" -"语音互动": "mid_sentence_voice", -"材料互动": "mid_sentence_material", -"造句互动": "mid_sentence_makeSentence" -"挖空互动": "mid_grammar_cloze", -"组句互动": "mid_grammar_sentence" -"发音互动": "mid_pron_pron" - - -2). 如果 c_type 开头为"core" -则读取下表:表名:core_interaction_component - -获取以下字段值: -title (作为组件标题) -component_config (完整的组件配置) 获取其中 的 taskInfo 字段值 作为 组件配置摘要 -kp_relation_info 字段值 作为 知识点 - -"互动组件名称"规则: -"口语快答": "core_speaking_reply", -"口语妙问": "core_speaking_inquiry", -"口语探讨": "core_speaking_explore", -"口语独白": "core_speaking_monologue" -"合作阅读": "core_reading_order", -"合作听力": "core_listening_order", -"看图组句": "core_writing_imgMakeSentence", -"看图撰写": "core_writing_imgWrite", -"问题组句": "core_writing_questionMakeSentence", -"问题撰写": "core_writing_questionWrite", -``` - -2."课程巩固记录" sheet 增加以下字段 -"正确率": 参考 export_lesson_review.py 中的计算逻辑 - -3. 新增一个"汇总统计"sheet -统计并展示以下内容 请以 可读性 比较好的方式排列、展示 - -a. "所有互动-按互动组件类型-通过情况统计" -以每种"互动组件名称"进行聚合 -统计play_result的取值分布情况,算以下指标: -总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 - -b. "中互动组件-按知识点-通过情况统计" -以每个知识点进行聚合 - -其中 知识点配置格式如下: -``` -[{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_meaning","kpSkillName":"语义"}] -``` -一个组件可以绑定多个知识点,以每个知识点的 kpId + kpType + kpTitle 进行 展示及聚合 - -对所有绑定了某个知识点的中互动组件(c_type以mid开头) -统计play_result的取值分布情况,算以下指标: -总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 - -c. "单元总结-按单元统计时长" - -将"单元总结记录"中的"play_time_seconds"字段值 以每个单元id 进行聚合 进行 累加 统计,并增加一列 转换为分钟为单位 取整数 - - -""" -# ==== 可直接修改的脚本变量(不使用命令行传参) ==== -# 三种模式互斥,只能配置一个: -# 模式1:单个角色id -USER_ID = None # 单个角色ID,示例:2911 - -# 模式2:角色id列表(多个角色id批量导出) -USER_ID_LIST = None # 角色ID列表,示例:[2911, 2912, 2913] - -# 模式3:账户id列表(通过账户id查询对应的角色id后批量导出) -ACCOUNT_ID_LIST = [2148] # 5095[7232] # [1783,5375,5371,5345,5303,5293,5095,4289,4494,4473,4460,4452,4386,4388,4236,4043,2758,2841,2756,2750,2692,1781,1693,2256,2234,2373] # 账户ID列表,示例:[100, 101, 102] - -OUTPUT_DIR = "output/260126/" # 输出目录,默认为output文件夹 -# ==== 变量结束 ==== -import os -import json -import re -from typing import Any, Dict, List, Optional - -import datetime - -try: - import requests -except Exception: - requests = None - -try: - import psycopg2 - from psycopg2.extras import RealDictCursor -except Exception: - psycopg2 = None - RealDictCursor = None - -try: - import pymysql - import pymysql.cursors -except Exception: - pymysql = None - -try: - import pandas as pd -except Exception: - pd = None - -try: - import urllib3 -except Exception: - urllib3 = None - - -SHEET1_COLUMNS = [ - "userId", - "userMsg", - "source", - "userName", - "soeData", - "audioUrl", - "asrStatus", - "componentId", - "componentType", - "dataVersion", - "timeStr", -] - -SHEET2_COLUMNS = [ - "user_id", - "component_unique_code", - "session_id", - "c_type", - "c_id", - "互动组件名称", - "组件标题", - "组件配置摘要", - "知识点", - "play_result", - "user_behavior_info", - "updated_at", -] - -SHEET3_COLUMNS = [ - "user_id", - "unit_id", - "lesson_id", - "question_list", - "正确率", - "updated_at", -] - -SHEET4_COLUMNS = [ - "user_id", - "unit_id", - "category", - "score_text", - "question_list", - "updated_at", -] - -SHEET5_COLUMNS = [ - "id", - "user_id", - "unit_id", - "updated_at", - "km_id", - "km_type", - "play_time_seconds", -] - - -def _load_env_file(path: str) -> None: - if not os.path.exists(path): - return - try: - with open(path, "r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line or line.startswith("#"): - continue - if "=" not in line: - continue - k, v = line.split("=", 1) - k = k.strip() - v = v.strip().strip('"').strip("'") - if k and (os.getenv(k) is None): - os.environ[k] = v - except Exception: - pass - - -def load_env() -> None: - _load_env_file(os.path.join(os.getcwd(), ".env")) - _load_env_file(os.path.join(os.getcwd(), ".env.local")) - - -def to_json_str(v: Any) -> Any: - if isinstance(v, (dict, list)): - try: - return json.dumps(v, ensure_ascii=False) - except Exception: - return str(v) - return v - - -def parse_time(value: Any) -> Optional[datetime.datetime]: - if value is None: - return None - if isinstance(value, (int, float)): - try: - v = float(value) - # 兼容毫秒级时间戳 - if v > 1e11: - v = v / 1000.0 - return datetime.datetime.fromtimestamp(v) - except Exception: - return None - if isinstance(value, str): - fmts = [ - "%Y-%m-%dT%H:%M:%S.%fZ", - "%Y-%m-%dT%H:%M:%S.%f%z", - "%Y-%m-%dT%H:%M:%S%z", - "%Y-%m-%d %H:%M:%S", - "%Y-%m-%d", - ] - for fmt in fmts: - try: - return datetime.datetime.strptime(value, fmt) - except Exception: - continue - try: - return datetime.datetime.fromisoformat(value) - except Exception: - return None - return None - - -def pick_time(source: Dict[str, Any]) -> Optional[datetime.datetime]: - candidates = [ - "updated_at", - "created_at", - "@timestamp", - "timestamp", - "updatedAt", - "createdAt", - "time", - "ts", - "timeStr", - "update_time", - "create_time", - ] - for key in candidates: - if key in source: - t = parse_time(source.get(key)) - if t is not None: - return t - # 宽松匹配:尝试扫描所有可能的时间相关字段 - for k, v in source.items(): - lk = str(k).lower() - if any(s in lk for s in ["time", "date", "_at", "timestamp"]): - t = parse_time(v) - if t is not None: - return t - return None - - -def extract_makee_id_from_user_msg(user_msg: Any) -> Optional[str]: - # 支持dict或字符串形式 - if isinstance(user_msg, dict): - mk = user_msg.get("makee_id") - if isinstance(mk, str) and mk: - return mk - if isinstance(user_msg, str) and user_msg: - # 1) 尝试整体解析为JSON - try: - obj = json.loads(user_msg) - mk = obj.get("makee_id") - if isinstance(mk, str) and mk: - return mk - except Exception: - pass - # 2) 尝试截取大括号中的JSON - try: - start = user_msg.find("{") - end = user_msg.rfind("}") - if start != -1 and end != -1 and end > start: - candidate = user_msg[start : end + 1] - obj = json.loads(candidate) - mk = obj.get("makee_id") - if isinstance(mk, str) and mk: - return mk - except Exception: - pass - # 3) 正则匹配 makee_id - m = re.search(r"\bmakee_id\b\s*:\s*\"([^\"]+)\"", user_msg) - if m: - return m.group(1) - return None - - -def fetch_es_asr_log(makee_id: str, es_cfg: Dict[str, Any]) -> Optional[Dict[str, Any]]: - if requests is None: - raise RuntimeError("缺少requests依赖,请安装后再运行。") - host = es_cfg.get("host") - port = es_cfg.get("port") - scheme = es_cfg.get("scheme", "http") - user = es_cfg.get("user") - password = es_cfg.get("password") - index = "llm_asr_log" - if not host: - return None - base = f"{scheme}://{host}:{port}" - url = f"{base}/{index}/_search" - headers = {"Content-Type": "application/json"} - body = { - "query": { - "bool": { - "should": [ - {"term": {"makee_id": {"value": str(makee_id)}}}, - {"term": {"makee_id.keyword": {"value": str(makee_id)}}}, - ], - "minimum_should_match": 1, - } - }, - "size": 10, - "_source": [ - "makee_id", - "result_text", - "source", - "updated_at", - "created_at", - "@timestamp", - "timestamp", - "updatedAt", - "createdAt", - "time", - "ts", - "timeStr", - "update_time", - "create_time", - ], - } - auth = (user, password) if user and password else None - try: - if scheme == "https" and urllib3 is not None: - try: - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - except Exception: - pass - resp = requests.post(url, headers=headers, json=body, auth=auth, timeout=20, verify=False if scheme == "https" else True) - resp.raise_for_status() - data = resp.json() - except Exception: - return None - hits = data.get("hits", {}).get("hits", []) - if not hits: - return None - # 选最新的 - chosen = None - best_t = None - for h in hits: - src = h.get("_source", {}) or {} - t = pick_time(src) - if t is None: - continue - if best_t is None or t > best_t: - best_t = t - chosen = src - if chosen is None: - # 如果都没有时间,选第一条 - chosen = (hits[0].get("_source", {}) or {}) - return chosen - - -def get_es_config() -> Dict[str, Any]: - return { - "host": os.getenv("ES_HOST"), - "port": os.getenv("ES_PORT", "9200"), - "scheme": os.getenv("ES_SCHEME", "http"), - "user": os.getenv("ES_USER"), - "password": os.getenv("ES_PASSWORD"), - "index": "user-audio", - } - - -def fetch_es_user_audio(user_id: str, es_cfg: Dict[str, Any]) -> List[Dict[str, Any]]: - if requests is None: - raise RuntimeError("缺少requests依赖,请安装后再运行。") - - print(f" [ES] 开始查询user-audio索引...") - start_time = datetime.datetime.now() - - host = es_cfg.get("host") - port = es_cfg.get("port") - scheme = es_cfg.get("scheme", "http") - user = es_cfg.get("user") - password = es_cfg.get("password") - index = es_cfg.get("index", "user-audio") - - if not host: - return [] - - base = f"{scheme}://{host}:{port}" - url = f"{base}/{index}/_search" - headers = {"Content-Type": "application/json"} - - body = { - "query": { - "bool": { - "should": [ - {"term": {"userId": {"value": str(user_id)}}}, - {"term": {"userId.keyword": {"value": str(user_id)}}}, - ], - "minimum_should_match": 1, - } - }, - "size": 10000, - "_source": [ - "userId", - "userMsg", - "userName", - "soeData", - "audioUrl", - "asrStatus", - "componentId", - "componentType", - "dataVersion", - "updated_at", - "created_at", - "@timestamp", - "timestamp", - "updatedAt", - "createdAt", - "time", - "ts", - "timeStr", - "update_time", - "create_time", - ], - } - - auth = (user, password) if user and password else None - - try: - # 抑制自签证书下的HTTPS不安全警告 - if scheme == "https" and urllib3 is not None: - try: - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - except Exception: - pass - resp = requests.post(url, headers=headers, json=body, auth=auth, timeout=30, verify=False if scheme == "https" else True) - resp.raise_for_status() - data = resp.json() - except Exception as e: - raise RuntimeError(f"ES查询失败: {e}") - - hits = data.get("hits", {}).get("hits", []) - print(f" [ES] 查询完成,获得{len(hits)}条记录,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - if not hits: - return [] - - print(f" [ES] 开始处理音频数据...") - process_start = datetime.datetime.now() - - rows: List[Dict[str, Any]] = [] - asr_cache: Dict[str, Dict[str, Any]] = {} - makee_id_count = 0 - - for idx, h in enumerate(hits, 1): - # 每处理100条显示一次进度 - if idx % 100 == 0 or idx == len(hits): - print(f" [ES] 处理进度: {idx}/{len(hits)} ({idx*100//len(hits)}%)") - - src = h.get("_source", {}) or {} - row = { - "userId": src.get("userId"), - "userMsg": src.get("userMsg"), - "source": None, - "userName": src.get("userName"), - "soeData": to_json_str(src.get("soeData")), - "audioUrl": src.get("audioUrl"), - "asrStatus": src.get("asrStatus"), - "componentId": src.get("componentId"), - "componentType": src.get("componentType"), - "dataVersion": src.get("dataVersion"), - } - t = pick_time(src) - row["_time"] = t.isoformat() if t else None - row["timeStr"] = t.strftime("%Y-%m-%d %H:%M:%S") if t else None - # v1.2: 当userMsg包含makee_id时,补充查询llm_asr_log并回填 - mk = extract_makee_id_from_user_msg(row.get("userMsg")) - if mk: - makee_id_count += 1 - asr_doc = asr_cache.get(mk) - if asr_doc is None: - asr_doc = fetch_es_asr_log(mk, es_cfg) - if asr_doc is not None: - asr_cache[mk] = asr_doc - if asr_doc is not None: - rt = asr_doc.get("result_text") - if rt: - row["userMsg"] = rt - row["source"] = to_json_str(asr_doc.get("source")) - rows.append(row) - - print(f" [ES] 数据处理完成,发现{makee_id_count}条包含makee_id的记录,耗时{(datetime.datetime.now() - process_start).total_seconds():.2f}秒") - - print(f" [ES] 开始排序...") - rows.sort(key=lambda x: parse_time(x.get("_time")) or datetime.datetime.min, reverse=True) - print(f" [ES] 音频数据处理完成,总耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - return rows - - -def get_pg_conn() -> Any: - if psycopg2 is None: - raise RuntimeError("缺少psycopg2依赖,请安装后再运行。") - host = os.getenv("PG_DB_HOST") - port = int(os.getenv("PG_DB_PORT", "5432")) - user = os.getenv("PG_DB_USER") - password = os.getenv("PG_DB_PASSWORD") - dbname = os.getenv("PG_DB_DATABASE") - if not host or not dbname: - raise RuntimeError("PG数据库环境变量未配置完整") - conn = psycopg2.connect(host=host, port=port, user=user, password=password, dbname=dbname) - return conn - - -def get_mysql_conn(database: str) -> Any: - """ - 获取MySQL数据库连接 - - Args: - database: 数据库名,可选值:'vala_user' 或 'vala_test' - vala_user 使用 online 配置(环境变量后缀 _online) - vala_test 使用默认配置 - - Returns: - MySQL连接对象 - """ - if pymysql is None: - raise RuntimeError("缺少pymysql依赖,请安装后再运行。") - - # 根据数据库选择不同的环境变量配置 - if database == "vala_user": - # vala_user 数据库使用 online 配置 - host = os.getenv("MYSQL_HOST_online") - port = int(os.getenv("MYSQL_PORT_online", "3306")) - user = os.getenv("MYSQL_USERNAME_online") - password = os.getenv("MYSQL_PASSWORD_online") - if not host: - raise RuntimeError("MySQL数据库环境变量未配置完整(缺少MYSQL_HOST_online)") - else: - # vala_test 等其他数据库使用默认配置 - host = os.getenv("MYSQL_HOST") - port = int(os.getenv("MYSQL_PORT", "3306")) - user = os.getenv("MYSQL_USERNAME") - password = os.getenv("MYSQL_PASSWORD") - if not host: - raise RuntimeError("MySQL数据库环境变量未配置完整(缺少MYSQL_HOST)") - - conn = pymysql.connect( - host=host, - port=port, - user=user, - password=password, - database=database, # 直接使用传入的数据库名 - charset="utf8mb4", - cursorclass=pymysql.cursors.DictCursor, - ) - return conn - - -def get_id_2_unit_index(conn: Any) -> Dict[int, int]: - """ - 从MySQL获取 story_id 到 unit_id 的映射关系 - - Args: - conn: MySQL数据库连接 - - Returns: - 映射字典 {story_id: unit_id} - """ - sql = """ - SELECT * - FROM `vala_game_info` - WHERE id > 0 - AND `vala_game_info`.`deleted_at` IS NULL - ORDER BY season_package_id asc, `index` asc - """ - try: - with conn.cursor() as cur: - cur.execute(sql) - rows = cur.fetchall() or [] - # 构建映射表:按查询结果的顺序,索引即为unit_id - id_2_unit_index = {} - for index, row in enumerate(rows): - id_2_unit_index[row["id"]] = index - return id_2_unit_index - except Exception as e: - print(f"[ERROR] 获取story_id到unit_id映射失败: {e}") - return {} - - -def get_chapter_id_to_lesson_id(conn: Any) -> Dict[int, int]: - """ - 从MySQL获取 chapter_id 到 lesson_id 的映射关系 - - Args: - conn: MySQL数据库连接 - - Returns: - 映射字典 {chapter_id: lesson_id} - """ - sql = """ - SELECT id, `index` - FROM `vala_game_chapter` - WHERE deleted_at IS NULL - """ - try: - with conn.cursor() as cur: - cur.execute(sql) - rows = cur.fetchall() or [] - # 构建映射表:chapter的index字段即为lesson_id - chapter_id_to_lesson_id = {} - for row in rows: - chapter_id_to_lesson_id[row["id"]] = row["index"] - return chapter_id_to_lesson_id - except Exception as e: - print(f"[ERROR] 获取chapter_id到lesson_id映射失败: {e}") - return {} - - -# 组件类型到组件名称的映射 -COMPONENT_TYPE_NAMES = { - "mid_vocab_item": "物品互动", - "mid_vocab_image": "图片互动", - "mid_vocab_fillBlank": "填词互动", - "mid_vocab_instruction": "指令互动", - "mid_sentence_dialogue": "对话互动", # 需要根据mode进一步判断 - "mid_sentence_voice": "语音互动", - "mid_sentence_material": "材料互动", - "mid_sentence_makeSentence": "造句互动", - "mid_grammar_cloze": "挖空互动", - "mid_grammar_sentence": "组句互动", - "mid_pron_pron": "发音互动", - "core_speaking_reply": "口语快答", - "core_speaking_inquiry": "口语妙问", - "core_speaking_explore": "口语探讨", - "core_speaking_monologue": "口语独白", - "core_reading_order": "合作阅读", - "core_listening_order": "合作听力", - "core_writing_imgMakeSentence": "看图组句", - "core_writing_imgWrite": "看图撰写", - "core_writing_questionMakeSentence": "问题组句", - "core_writing_questionWrite": "问题撰写", -} - - -def get_component_name(c_type: str, component_config: Optional[Dict[str, Any]]) -> str: - """ - 根据c_type和组件配置获取组件名称 - - Args: - c_type: 组件类型 - component_config: 组件配置(用于判断对话互动的mode) - - Returns: - 组件名称 - """ - if not c_type: - return "" - - # 特殊处理:对话互动需要根据mode判断 - if c_type == "mid_sentence_dialogue" and component_config: - try: - question = component_config.get("question", {}) - mode = question.get("mode", "") - if mode == "express": - return "对话互动-表达" - elif mode == "read": - return "对话互动-朗读" - except Exception: - pass - - return COMPONENT_TYPE_NAMES.get(c_type, "") - - -def batch_fetch_component_configs(play_records: List[Dict[str, Any]], mysql_conn: Any) -> Dict[str, Dict[str, Any]]: - """ - 批量查询组件配置信息 - - Args: - play_records: 播放记录列表 - mysql_conn: MySQL连接 - - Returns: - 组件配置映射 {c_type_c_id: {title, component_config, kp_relation_info}} - """ - print(f" [MySQL] 开始批量查询组件配置...") - start_time = datetime.datetime.now() - - # 收集需要查询的c_type和c_id - mid_c_ids = set() - core_c_ids = set() - mid_type_id_pairs = [] # 用于调试日志 - core_type_id_pairs = [] - - for record in play_records: - c_type = record.get("c_type", "") - c_id = record.get("c_id") - if c_type and c_id: - if c_type.startswith("mid"): - mid_c_ids.add(c_id) - mid_type_id_pairs.append((c_type, c_id)) - elif c_type.startswith("core"): - core_c_ids.add(c_id) - core_type_id_pairs.append((c_type, c_id)) - - print(f" [MySQL] 需要查询中互动组件: {len(mid_c_ids)}个, 核心互动组件: {len(core_c_ids)}个") - if mid_c_ids: - print(f" [MySQL] 中互动组件ID列表(前10个): {sorted(list(mid_c_ids))[:10]}") - if core_c_ids: - print(f" [MySQL] 核心互动组件ID列表(前10个): {sorted(list(core_c_ids))[:10]}") - - config_map = {} - - # 批量查询middle_interaction_component - if mid_c_ids: - try: - with mysql_conn.cursor() as cur: - placeholders = ','.join(['%s'] * len(mid_c_ids)) - sql = f""" - SELECT c_id, c_type, title, component_config, kp_relation_info - FROM middle_interaction_component - WHERE c_id IN ({placeholders}) AND deleted_at IS NULL - """ - print(f" [MySQL] 执行中互动组件查询,查询条件: c_id IN ({len(mid_c_ids)}个ID)") - cur.execute(sql, tuple(mid_c_ids)) - rows = cur.fetchall() or [] - print(f" [MySQL] 查询到{len(rows)}条中互动组件配置") - - if len(rows) == 0 and len(mid_c_ids) > 0: - print(f" [MySQL] [警告] 查询结果为空!可能的原因:") - print(f" [MySQL] - 数据库中没有匹配的c_id记录") - print(f" [MySQL] - deleted_at字段不为NULL") - print(f" [MySQL] - c_id不存在") - - for idx, row in enumerate(rows): - c_type = row.get("c_type", "") - c_id = row.get("c_id") - key = f"{c_type}_{c_id}" - - if idx < 3: # 输出前3条的详细信息 - print(f" [MySQL] [样例{idx+1}] id={c_id}, c_type={c_type}, key={key}") - print(f" [MySQL] [样例{idx+1}] title={row.get('title', '')[:50]}") - - # 解析component_config - component_config = row.get("component_config") - if isinstance(component_config, str): - try: - component_config = json.loads(component_config) - except Exception as e: - print(f" [MySQL] [警告] 解析component_config失败 (id={c_id}): {e}") - component_config = {} - - # 提取question字段作为摘要 - summary = "" - if isinstance(component_config, dict): - question = component_config.get("question") - summary = to_json_str(question) if question else "" - if idx < 3 and question: - print(f" [MySQL] [样例{idx+1}] 提取到question字段,长度: {len(summary)}") - - # 解析kp_relation_info - kp_relation_info = row.get("kp_relation_info") - if isinstance(kp_relation_info, str): - try: - kp_relation_info = json.loads(kp_relation_info) - except Exception: - kp_relation_info = [] - - config_map[key] = { - "title": row.get("title", ""), - "component_config": component_config, - "summary": summary, - "kp_relation_info": to_json_str(kp_relation_info), - } - - print(f" [MySQL] 中互动组件配置已加入config_map,当前map大小: {len(config_map)}") - except Exception as e: - print(f" [MySQL] [错误] 查询中互动组件配置失败: {e}") - import traceback - traceback.print_exc() - - # 批量查询core_interaction_component - if core_c_ids: - try: - with mysql_conn.cursor() as cur: - placeholders = ','.join(['%s'] * len(core_c_ids)) - sql = f""" - SELECT c_id, c_type, title, component_config, kp_relation_info - FROM core_interaction_component - WHERE c_id IN ({placeholders}) AND deleted_at IS NULL - """ - print(f" [MySQL] 执行核心互动组件查询,查询条件: c_id IN ({len(core_c_ids)}个ID)") - cur.execute(sql, tuple(core_c_ids)) - rows = cur.fetchall() or [] - print(f" [MySQL] 查询到{len(rows)}条核心互动组件配置") - - if len(rows) == 0 and len(core_c_ids) > 0: - print(f" [MySQL] [警告] 查询结果为空!可能的原因:") - print(f" [MySQL] - 数据库中没有匹配的c_id记录") - print(f" [MySQL] - deleted_at字段不为NULL") - print(f" [MySQL] - c_id不存在") - - for idx, row in enumerate(rows): - c_type = row.get("c_type", "") - c_id = row.get("c_id") - key = f"{c_type}_{c_id}" - - if idx < 3: # 输出前3条的详细信息 - print(f" [MySQL] [样例{idx+1}] id={c_id}, c_type={c_type}, key={key}") - print(f" [MySQL] [样例{idx+1}] title={row.get('title', '')[:50]}") - - # 解析component_config - component_config = row.get("component_config") - if isinstance(component_config, str): - try: - component_config = json.loads(component_config) - except Exception as e: - print(f" [MySQL] [警告] 解析component_config失败 (id={c_id}): {e}") - component_config = {} - - # 提取taskInfo字段作为摘要 - summary = "" - if isinstance(component_config, dict): - task_info = component_config.get("taskInfo") - summary = to_json_str(task_info) if task_info else "" - if idx < 3 and task_info: - print(f" [MySQL] [样例{idx+1}] 提取到taskInfo字段,长度: {len(summary)}") - - # 解析kp_relation_info - kp_relation_info = row.get("kp_relation_info") - if isinstance(kp_relation_info, str): - try: - kp_relation_info = json.loads(kp_relation_info) - except Exception: - kp_relation_info = [] - - config_map[key] = { - "title": row.get("title", ""), - "component_config": component_config, - "summary": summary, - "kp_relation_info": to_json_str(kp_relation_info), - } - - print(f" [MySQL] 核心互动组件配置已加入config_map,当前map大小: {len(config_map)}") - except Exception as e: - print(f" [MySQL] [错误] 查询核心互动组件配置失败: {e}") - import traceback - traceback.print_exc() - - print(f" [MySQL] 组件配置查询完成,共{len(config_map)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return config_map - - -def calculate_accuracy(question_list: Any) -> float: - """ - 计算问题列表的正确率 - - Args: - question_list: 问题列表(可能是JSON字符串或list) - - Returns: - 正确率(百分比,保留2位小数) - """ - try: - if isinstance(question_list, str): - question_list = json.loads(question_list) - - if not isinstance(question_list, list) or len(question_list) == 0: - return 0.0 - - total = len(question_list) - correct = sum(1 for q in question_list if q.get('isRight') == True) - accuracy = round(correct / total * 100, 2) if total > 0 else 0.0 - - return accuracy - except Exception: - return 0.0 - - - -def fetch_character_ids_by_account(account_id: str, conn: Any) -> List[str]: - """根据账户id查询对应的角色id列表""" - sql = "SELECT id FROM vala_app_character WHERE account_id = %s" - try: - with conn.cursor() as cur: - cur.execute(sql, (account_id,)) - rows = cur.fetchall() or [] - return [str(row["id"]) for row in rows if row.get("id")] - except Exception as e: - print(f"[ERROR] 查询账户id={account_id}的角色id失败: {e}") - return [] - - -def fetch_pg_play_records(user_id: str, conn: Any, mysql_conn: Any) -> List[Dict[str, Any]]: - """ - 查询互动组件学习记录并补充组件配置信息 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - mysql_conn: MySQL数据库连接 - - Returns: - 互动组件学习记录列表 - """ - print(f" [PG] 开始查询互动组件学习记录(8张分表)...") - start_time = datetime.datetime.now() - - tables = [f"user_component_play_record_{i}" for i in range(8)] - rows: List[Dict[str, Any]] = [] - with conn.cursor(cursor_factory=RealDictCursor) as cur: - for t in tables: - try: - cur.execute( - f""" - SELECT user_id, component_unique_code, session_id, c_type, c_id, - play_result, user_behavior_info, updated_at - FROM {t} - WHERE user_id = %s - ORDER BY updated_at DESC - """, - (user_id,), - ) - part = cur.fetchall() or [] - if part: - print(f" [PG] 表{t}查到{len(part)}条记录") - for r in part: - r = dict(r) - r["play_result"] = to_json_str(r.get("play_result")) - r["user_behavior_info"] = to_json_str(r.get("user_behavior_info")) - # 将带时区的时间转换为无时区,避免Excel写入报错 - upd = r.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - r["updated_at"] = upd.replace(tzinfo=None) - except Exception: - # 回退为字符串 - r["updated_at"] = str(upd) - rows.append(r) - except Exception as e: - print(f" [PG] 表{t}查询失败: {e}") - continue - - rows.sort(key=lambda x: parse_time(x.get("updated_at")) or datetime.datetime.min, reverse=True) - print(f" [PG] 互动组件学习记录查询完成,共{len(rows)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - # 批量查询组件配置 - if rows and mysql_conn: - config_map = batch_fetch_component_configs(rows, mysql_conn) - - # 补充组件信息 - print(f" [PG] 开始补充组件配置信息...") - filled_count = 0 - empty_count = 0 - sample_keys = [] - sample_mode_check = [] # 检查对话互动的mode - - for r in rows: - c_type = r.get("c_type", "") - c_id = r.get("c_id") - key = f"{c_type}_{c_id}" if c_type and c_id else "" - - config = config_map.get(key, {}) - component_config = config.get("component_config", {}) - - component_name = get_component_name(c_type, component_config) - r["互动组件名称"] = component_name - r["组件标题"] = config.get("title", "") - r["组件配置摘要"] = config.get("summary", "") - r["知识点"] = config.get("kp_relation_info", "") - - # 统计填充情况 - if config: - filled_count += 1 - if len(sample_keys) < 3: - sample_keys.append((key, component_name, r["组件标题"][:30] if r["组件标题"] else "")) - - # 检查对话互动的mode - if c_type == "mid_sentence_dialogue" and len(sample_mode_check) < 3: - mode = "" - if isinstance(component_config, dict): - question = component_config.get("question", {}) - if isinstance(question, dict): - mode = question.get("mode", "") - sample_mode_check.append({ - "key": key, - "mode": mode, - "component_name": component_name - }) - else: - empty_count += 1 - if empty_count <= 5: # 输出前5个未匹配的key - print(f" [PG] [警告] 未找到组件配置: key={key}") - - print(f" [PG] 组件配置信息补充完成") - print(f" [PG] 匹配到配置: {filled_count}条, 未匹配: {empty_count}条") - if sample_keys: - print(f" [PG] 样例数据(前3条):") - for key, name, title in sample_keys: - print(f" [PG] - key={key}, 名称={name}, 标题={title}") - - if sample_mode_check: - print(f" [PG] 对话互动mode检查(前3条):") - for s in sample_mode_check: - print(f" [PG] - key={s['key']}, mode={s['mode']}, 最终名称={s['component_name']}") - - return rows - - -def fetch_pg_unit_review(user_id: str, conn: Any, id_2_unit_index: Dict[int, int], chapter_id_to_lesson_id: Dict[int, int]) -> List[Dict[str, Any]]: - """ - 查询课程巩固记录 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - id_2_unit_index: story_id到unit_id的映射字典 - chapter_id_to_lesson_id: chapter_id到lesson_id的映射字典 - - Returns: - 课程巩固记录列表 - """ - print(f" [PG] 开始查询课程巩固记录...") - start_time = datetime.datetime.now() - - sql = ( - "SELECT user_id, story_id, chapter_id, question_list, updated_at " - "FROM user_unit_review_question_result WHERE user_id = %s ORDER BY updated_at DESC" - ) - with conn.cursor(cursor_factory=RealDictCursor) as cur: - try: - cur.execute(sql, (user_id,)) - rows = cur.fetchall() or [] - except Exception as e: - print(f" [PG] 课程巩固记录查询失败: {e}") - rows = [] - out: List[Dict[str, Any]] = [] - for r in rows: - d = dict(r) - - # 映射 story_id 到 unit_id - story_id = d.get("story_id") - unit_id = id_2_unit_index.get(story_id) if story_id else None - d["unit_id"] = unit_id - - # 映射 chapter_id 到 lesson_id - chapter_id = d.get("chapter_id") - lesson_id = chapter_id_to_lesson_id.get(chapter_id) if chapter_id else None - d["lesson_id"] = lesson_id - - # 计算正确率 - question_list = d.get("question_list") - d["正确率"] = calculate_accuracy(question_list) - - d["question_list"] = to_json_str(question_list) - upd = d.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - d["updated_at"] = upd.replace(tzinfo=None) - except Exception: - d["updated_at"] = str(upd) - out.append(d) - - print(f" [PG] 课程巩固记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return out - - -def fetch_pg_unit_challenge(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]: - """ - 查询单元挑战记录 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - id_2_unit_index: story_id到unit_id的映射字典 - - Returns: - 单元挑战记录列表 - """ - print(f" [PG] 开始查询单元挑战记录...") - start_time = datetime.datetime.now() - - sql = ( - "SELECT user_id, story_id, category, score_text, question_list, updated_at " - "FROM user_unit_challenge_question_result WHERE user_id = %s ORDER BY updated_at DESC" - ) - with conn.cursor(cursor_factory=RealDictCursor) as cur: - try: - cur.execute(sql, (user_id,)) - rows = cur.fetchall() or [] - except Exception as e: - print(f" [PG] 单元挑战记录查询失败: {e}") - rows = [] - out: List[Dict[str, Any]] = [] - for r in rows: - d = dict(r) - - # 映射 story_id 到 unit_id - story_id = d.get("story_id") - unit_id = id_2_unit_index.get(story_id) if story_id else None - d["unit_id"] = unit_id - - d["question_list"] = to_json_str(d.get("question_list")) - upd = d.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - d["updated_at"] = upd.replace(tzinfo=None) - except Exception: - d["updated_at"] = str(upd) - out.append(d) - - print(f" [PG] 单元挑战记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return out - - -def fetch_pg_unit_summary(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]: - """ - 查询单元总结知识点结果数据 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - id_2_unit_index: story_id到unit_id的映射字典 - - Returns: - 单元总结记录列表 - """ - print(f" [PG] 开始查询单元总结记录...") - start_time = datetime.datetime.now() - - sql = ( - "SELECT id, user_id, story_id, updated_at, km_id, km_type, play_time " - "FROM user_unit_summary_km_result WHERE user_id = %s AND deleted_at IS NULL ORDER BY updated_at DESC" - ) - with conn.cursor(cursor_factory=RealDictCursor) as cur: - try: - cur.execute(sql, (user_id,)) - rows = cur.fetchall() or [] - except Exception as e: - print(f" [PG] 单元总结记录查询失败: {e}") - rows = [] - - out: List[Dict[str, Any]] = [] - for r in rows: - d = dict(r) - # 映射 story_id 到 unit_id - story_id = d.get("story_id") - unit_id = id_2_unit_index.get(story_id) if story_id else None - d["unit_id"] = unit_id - - # 转换 play_time (毫秒) 为秒 (整数) - play_time = d.get("play_time") - d["play_time_seconds"] = play_time // 1000 if play_time else 0 - - # 移除时区信息 - upd = d.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - d["updated_at"] = upd.replace(tzinfo=None) - except Exception: - d["updated_at"] = str(upd) - out.append(d) - - print(f" [PG] 单元总结记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return out - - -def generate_statistics(sheet2_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]]) -> tuple: - """ - 生成汇总统计数据 - - Args: - sheet2_rows: 互动组件学习记录 - sheet5_rows: 单元总结记录 - - Returns: - (组件统计DataFrame, 知识点统计DataFrame, 单元时长统计DataFrame) - """ - if pd is None: - raise RuntimeError("缺少pandas依赖,请安装后再运行。") - - print(f" [统计] 开始生成汇总统计数据...") - start_time = datetime.datetime.now() - - from collections import defaultdict - - # ============ a. 所有互动-按互动组件类型-通过情况统计 ============ - component_stats_data = [] - component_stats = defaultdict(lambda: {"Perfect": 0, "Good": 0, "Failed": 0, "Pass": 0, "Oops": 0, "total": 0}) - - # 用于调试 - sample_results = [] - parse_error_count = 0 - - for idx, record in enumerate(sheet2_rows): - component_name = record.get("互动组件名称", "") - if not component_name: - continue - - play_result_str = record.get("play_result", "") - - # 解析play_result - result = "" - try: - # 先判断是否是简单的字符串(Perfect/Good/Failed/Pass/Oops) - if isinstance(play_result_str, str): - # 去除空格后检查 - stripped = play_result_str.strip() - if stripped in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - # 直接使用 - result = stripped - else: - # 尝试JSON解析 - try: - play_result = json.loads(play_result_str) - if isinstance(play_result, dict): - result = play_result.get("result", "") - else: - result = "" - except: - result = "" - else: - # 如果不是字符串,尝试当dict处理 - if isinstance(play_result_str, dict): - result = play_result_str.get("result", "") - else: - result = "" - - # 收集前3个样例 - if idx < 3: - sample_results.append({ - "component": component_name, - "raw": str(play_result_str)[:100], - "result": result - }) - except Exception as e: - parse_error_count += 1 - if parse_error_count <= 3: - print(f" [统计] [警告] 解析play_result失败 (第{idx+1}条): {e}, 原始值: {str(play_result_str)[:100]}") - result = "" - - component_stats[component_name]["total"] += 1 - if result in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - component_stats[component_name][result] += 1 - - print(f" [统计] play_result解析样例(前3条):") - for s in sample_results: - print(f" [统计] - 组件: {s['component']}, 结果: {s['result']}, 原始: {s['raw']}") - if parse_error_count > 0: - print(f" [统计] play_result解析失败总数: {parse_error_count}") - - # 生成统计数据行 - for component_name in sorted(component_stats.keys()): - stats = component_stats[component_name] - total = stats["total"] - perfect = stats["Perfect"] - good = stats["Good"] - failed = stats["Failed"] - pass_count = stats["Pass"] - oops = stats["Oops"] - - perfect_ratio = round(perfect / total * 100, 2) if total > 0 else 0 - good_ratio = round(good / total * 100, 2) if total > 0 else 0 - failed_ratio = round(failed / total * 100, 2) if total > 0 else 0 - pass_ratio = round(pass_count / total * 100, 2) if total > 0 else 0 - oops_ratio = round(oops / total * 100, 2) if total > 0 else 0 - - component_stats_data.append({ - "互动组件名称": component_name, - "总数量": total, - "Perfect数量": perfect, - "Good数量": good, - "Failed数量": failed, - "Pass数量": pass_count, - "Oops数量": oops, - "Perfect比例(%)": perfect_ratio, - "Good比例(%)": good_ratio, - "Failed比例(%)": failed_ratio, - "Pass比例(%)": pass_ratio, - "Oops比例(%)": oops_ratio, - }) - - # ============ b. 中互动组件-按知识点-通过情况统计 ============ - kp_stats_data = [] - kp_stats = defaultdict(lambda: {"Perfect": 0, "Good": 0, "Failed": 0, "Pass": 0, "Oops": 0, "total": 0}) - - # 调试信息 - mid_count = 0 - has_kp_count = 0 - sample_kp_records = [] - - for idx, record in enumerate(sheet2_rows): - c_type = record.get("c_type", "") - if not c_type or not c_type.startswith("mid"): - continue - - mid_count += 1 - kp_relation_info_str = record.get("知识点", "") - - if not kp_relation_info_str: - continue - - has_kp_count += 1 - - # 解析知识点 - try: - if isinstance(kp_relation_info_str, str): - kp_relation_info = json.loads(kp_relation_info_str) - else: - kp_relation_info = kp_relation_info_str - - if not isinstance(kp_relation_info, list): - continue - - # 收集样例 - if len(sample_kp_records) < 3: - sample_kp_records.append({ - "c_type": c_type, - "kp_count": len(kp_relation_info), - "kp_info": str(kp_relation_info)[:200] - }) - - # 解析play_result(使用相同的逻辑) - play_result_str = record.get("play_result", "") - result = "" - if isinstance(play_result_str, str): - stripped = play_result_str.strip() - if stripped in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - result = stripped - else: - try: - play_result = json.loads(play_result_str) - if isinstance(play_result, dict): - result = play_result.get("result", "") - except: - pass - elif isinstance(play_result_str, dict): - result = play_result_str.get("result", "") - - # 为每个知识点统计 - for kp in kp_relation_info: - if not isinstance(kp, dict): - continue - - kp_id = kp.get("kpId", "") - kp_type = kp.get("kpType", "") - kp_title = kp.get("kpTitle", "") - - if not kp_id: - continue - - kp_key = f"{kp_id}|{kp_type}|{kp_title}" - kp_stats[kp_key]["total"] += 1 - if result in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - kp_stats[kp_key][result] += 1 - - except Exception as e: - if len(sample_kp_records) < 5: - print(f" [统计] [警告] 解析知识点失败: {e}, 原始值: {str(kp_relation_info_str)[:100]}") - continue - - print(f" [统计] 中互动组件统计: 总数={mid_count}, 有知识点={has_kp_count}, 知识点条目数={len(kp_stats)}") - if sample_kp_records: - print(f" [统计] 知识点样例(前3条):") - for s in sample_kp_records: - print(f" [统计] - c_type={s['c_type']}, 知识点数量={s['kp_count']}, 内容={s['kp_info']}") - - # 生成知识点统计数据行 - for kp_key in sorted(kp_stats.keys()): - parts = kp_key.split("|") - if len(parts) != 3: - continue - - kp_id, kp_type, kp_title = parts - stats = kp_stats[kp_key] - total = stats["total"] - perfect = stats["Perfect"] - good = stats["Good"] - failed = stats["Failed"] - pass_count = stats["Pass"] - oops = stats["Oops"] - - perfect_ratio = round(perfect / total * 100, 2) if total > 0 else 0 - good_ratio = round(good / total * 100, 2) if total > 0 else 0 - failed_ratio = round(failed / total * 100, 2) if total > 0 else 0 - pass_ratio = round(pass_count / total * 100, 2) if total > 0 else 0 - oops_ratio = round(oops / total * 100, 2) if total > 0 else 0 - - kp_stats_data.append({ - "知识点ID": kp_id, - "知识点类型": kp_type, - "知识点标题": kp_title, - "总数量": total, - "Perfect数量": perfect, - "Good数量": good, - "Failed数量": failed, - "Pass数量": pass_count, - "Oops数量": oops, - "Perfect比例(%)": perfect_ratio, - "Good比例(%)": good_ratio, - "Failed比例(%)": failed_ratio, - "Pass比例(%)": pass_ratio, - "Oops比例(%)": oops_ratio, - }) - - # ============ c. 单元总结-按单元统计时长 ============ - unit_time_stats_data = [] - unit_time_stats = defaultdict(int) - - for record in sheet5_rows: - unit_id = record.get("unit_id") - play_time_seconds = record.get("play_time_seconds", 0) - - if unit_id is not None: - unit_time_stats[unit_id] += play_time_seconds - - # 生成单元时长统计数据行 - for unit_id in sorted(unit_time_stats.keys()): - total_seconds = unit_time_stats[unit_id] - total_minutes = int(total_seconds / 60) - - unit_time_stats_data.append({ - "单元ID": f"unit_{unit_id}", - "总时长(秒)": total_seconds, - "总时长(分钟)": total_minutes, - }) - - print(f" [统计] 汇总统计数据生成完成,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - print(f" [统计] 生成了{len(component_stats_data)}条组件统计, {len(kp_stats_data)}条知识点统计, {len(unit_time_stats_data)}条单元时长统计") - - return ( - pd.DataFrame(component_stats_data), - pd.DataFrame(kp_stats_data), - pd.DataFrame(unit_time_stats_data) - ) - - - -def write_excel(path: str, sheet1_rows: List[Dict[str, Any]], sheet2_rows: List[Dict[str, Any]], sheet3_rows: List[Dict[str, Any]], sheet4_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]], stats_component_df: Any, stats_kp_df: Any, stats_unit_time_df: Any) -> None: - if pd is None: - raise RuntimeError("缺少pandas依赖,请安装后再运行。") - - print(f" [Excel] 开始写入Excel文件: {path}") - start_time = datetime.datetime.now() - - out_dir = os.path.dirname(path) or "." - os.makedirs(out_dir, exist_ok=True) - with pd.ExcelWriter(path, engine="openpyxl") as writer: - pd.DataFrame(sheet1_rows, columns=SHEET1_COLUMNS).to_excel(writer, sheet_name="全部音频数据", index=False) - pd.DataFrame(sheet2_rows, columns=SHEET2_COLUMNS).to_excel(writer, sheet_name="互动组件学习记录", index=False) - pd.DataFrame(sheet3_rows, columns=SHEET3_COLUMNS).to_excel(writer, sheet_name="课程巩固记录", index=False) - pd.DataFrame(sheet4_rows, columns=SHEET4_COLUMNS).to_excel(writer, sheet_name="单元挑战记录", index=False) - pd.DataFrame(sheet5_rows, columns=SHEET5_COLUMNS).to_excel(writer, sheet_name="单元总结记录", index=False) - stats_component_df.to_excel(writer, sheet_name="统计-互动组件通过情况", index=False) - stats_kp_df.to_excel(writer, sheet_name="统计-知识点通过情况", index=False) - stats_unit_time_df.to_excel(writer, sheet_name="统计-单元总结时长", index=False) - - print(f" [Excel] 写入完成,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - -def get_date_str() -> str: - """获取当前日期字符串 格式:YYYYMMDD""" - return datetime.datetime.now().strftime("%Y%m%d") - - -def export_single_user(user_id: str, es_cfg: Dict[str, Any], pg_conn: Any, mysql_conn: Any, output_path: str, id_2_unit_index: Dict[int, int], chapter_id_to_lesson_id: Dict[int, int]) -> bool: - """ - 导出单个角色id的数据 - - Args: - user_id: 角色ID - es_cfg: ES配置 - pg_conn: PostgreSQL连接 - mysql_conn: MySQL连接 - output_path: 输出路径 - id_2_unit_index: story_id到unit_id的映射字典 - chapter_id_to_lesson_id: chapter_id到lesson_id的映射字典 - - Returns: - True表示成功,False表示失败 - """ - try: - print(f"\n[INFO] ========== 开始导出角色id={user_id} ==========") - total_start_time = datetime.datetime.now() - - # 查询ES数据 - sheet1_rows = fetch_es_user_audio(user_id, es_cfg) - - # 查询PG数据 - sheet2_rows = fetch_pg_play_records(user_id, pg_conn, mysql_conn) - sheet3_rows = fetch_pg_unit_review(user_id, pg_conn, id_2_unit_index, chapter_id_to_lesson_id) - sheet4_rows = fetch_pg_unit_challenge(user_id, pg_conn, id_2_unit_index) - sheet5_rows = fetch_pg_unit_summary(user_id, pg_conn, id_2_unit_index) - - # 检查是否有有效数据 - total_records = len(sheet1_rows) + len(sheet2_rows) + len(sheet3_rows) + len(sheet4_rows) + len(sheet5_rows) - print(f" [统计] 数据汇总:") - print(f" - 全部音频数据: {len(sheet1_rows)}条") - print(f" - 互动组件学习记录: {len(sheet2_rows)}条") - print(f" - 课程巩固记录: {len(sheet3_rows)}条") - print(f" - 单元挑战记录: {len(sheet4_rows)}条") - print(f" - 单元总结记录: {len(sheet5_rows)}条") - print(f" - 总计: {total_records}条") - - if total_records == 0: - print(f"[WARN] 角色id={user_id} 没有找到任何有效记录,跳过导出") - return False - - # 生成汇总统计数据 - stats_component_df, stats_kp_df, stats_unit_time_df = generate_statistics(sheet2_rows, sheet5_rows) - - # 写入Excel - write_excel(output_path, sheet1_rows, sheet2_rows, sheet3_rows, sheet4_rows, sheet5_rows, stats_component_df, stats_kp_df, stats_unit_time_df) - - total_time = (datetime.datetime.now() - total_start_time).total_seconds() - print(f"[INFO] 角色id={user_id} 导出成功") - print(f"[INFO] 文件路径: {output_path}") - print(f"[INFO] 总耗时: {total_time:.2f}秒") - print(f"[INFO] ========== 完成 ==========\n") - return True - - except Exception as e: - print(f"[ERROR] 角色id={user_id} 导出失败: {e}") - import traceback - traceback.print_exc() - return False - - -def main(): - load_env() - - # 确定运行模式并收集需要导出的角色id列表 - user_id_list: List[tuple] = [] # [(user_id, account_id or None), ...] - date_str = get_date_str() - - # 检查三种模式的配置 - has_user_id = USER_ID is not None - has_user_id_list = USER_ID_LIST is not None and len(USER_ID_LIST) > 0 - has_account_id_list = ACCOUNT_ID_LIST is not None and len(ACCOUNT_ID_LIST) > 0 - - # 验证只能配置一种模式 - mode_count = sum([has_user_id, has_user_id_list, has_account_id_list]) - if mode_count == 0: - raise RuntimeError("请配置 USER_ID、USER_ID_LIST 或 ACCOUNT_ID_LIST 中的一个") - if mode_count > 1: - raise RuntimeError("USER_ID、USER_ID_LIST、ACCOUNT_ID_LIST 只能配置一个,请检查配置") - - # 模式1:单个角色id - if has_user_id: - user_id_list = [(str(USER_ID), None)] - print(f"[INFO] 运行模式:单个角色id") - - # 模式2:角色id列表 - elif has_user_id_list: - user_id_list = [(str(uid), None) for uid in USER_ID_LIST] - print(f"[INFO] 运行模式:角色id列表,共{len(user_id_list)}个角色") - - # 模式3:账户id列表 - elif has_account_id_list: - print(f"[INFO] 运行模式:账户id列表,共{len(ACCOUNT_ID_LIST)}个账户") - mysql_conn = None - try: - mysql_conn = get_mysql_conn("vala_user") # 查询用户表,使用 vala_user 数据库 - for account_id in ACCOUNT_ID_LIST: - account_id_str = str(account_id) - print(f"[INFO] 查询账户id={account_id_str}对应的角色id...") - character_ids = fetch_character_ids_by_account(account_id_str, mysql_conn) - if not character_ids: - print(f"[WARN] 账户id={account_id_str} 未找到关联的角色id,跳过") - continue - print(f"[INFO] 账户id={account_id_str} 找到{len(character_ids)}个角色id: {character_ids}") - for cid in character_ids: - user_id_list.append((cid, account_id_str)) - finally: - if mysql_conn: - try: - mysql_conn.close() - except Exception: - pass - - if not user_id_list: - print("[WARN] 没有需要导出的角色id,程序退出") - return - - # 初始化连接 - es_cfg = get_es_config() - pg_conn = get_pg_conn() - - # 获取映射表(只需要查询一次,所有角色共用) - print(f"\n[INFO] ===== 准备工作:获取映射表 =====") - mysql_conn = None - id_2_unit_index = {} - chapter_id_to_lesson_id = {} - try: - print(f"[INFO] 正在连接MySQL数据库(vala_test)...") - mysql_conn = get_mysql_conn("vala_test") # 查询游戏配置表,使用 vala_test 数据库 - print(f"[INFO] 正在获取 story_id 到 unit_id 的映射...") - id_2_unit_index = get_id_2_unit_index(mysql_conn) - print(f"[INFO] 成功获取 {len(id_2_unit_index)} 个 story_id 映射") - print(f"[INFO] 正在获取 chapter_id 到 lesson_id 的映射...") - chapter_id_to_lesson_id = get_chapter_id_to_lesson_id(mysql_conn) - print(f"[INFO] 成功获取 {len(chapter_id_to_lesson_id)} 个 chapter_id 映射") - except Exception as e: - print(f"[ERROR] 获取映射表失败: {e}") - import traceback - traceback.print_exc() - if pg_conn: - try: - pg_conn.close() - except Exception: - pass - if mysql_conn: - try: - mysql_conn.close() - except Exception: - pass - return - - try: - # 统计信息 - success_count = 0 - skip_count = 0 - - print(f"\n[INFO] ===== 开始批量导出 =====") - print(f"[INFO] 共需导出{len(user_id_list)}个角色\n") - batch_start_time = datetime.datetime.now() - - # 循环处理每个角色id - for idx, (user_id, account_id) in enumerate(user_id_list, 1): - print(f"\n{'='*60}") - print(f"[INFO] 进度: {idx}/{len(user_id_list)} ({idx*100//len(user_id_list)}%)") - print(f"{'='*60}") - - # 生成输出文件名 - if account_id is None: - # 模式1和模式2:角色id_{}_导出时间_{}.xlsx - filename = f"角色id_{user_id}_导出时间_{date_str}.xlsx" - else: - # 模式3:账户id_{}_角色id_{}_导出时间_{}.xlsx - filename = f"账户id_{account_id}_角色id_{user_id}_导出时间_{date_str}.xlsx" - - output_path = os.path.join(OUTPUT_DIR, filename) - - # 导出单个角色的数据 - result = export_single_user(user_id, es_cfg, pg_conn, mysql_conn, output_path, id_2_unit_index, chapter_id_to_lesson_id) - if result: - success_count += 1 - else: - skip_count += 1 - - # 输出统计信息 - batch_total_time = (datetime.datetime.now() - batch_start_time).total_seconds() - print(f"\n{'='*60}") - print(f"[INFO] ===== 全部导出完成 =====") - print(f"[INFO] 总计: {len(user_id_list)}个角色") - print(f"[INFO] 成功: {success_count}个") - print(f"[INFO] 跳过: {skip_count}个") - print(f"[INFO] 总耗时: {batch_total_time:.2f}秒 ({batch_total_time/60:.2f}分钟)") - if success_count > 0: - print(f"[INFO] 平均每个角色: {batch_total_time/success_count:.2f}秒") - print(f"{'='*60}\n") - - finally: - if pg_conn: - try: - pg_conn.close() - except Exception: - pass - if mysql_conn: - try: - mysql_conn.close() - except Exception: - pass - - -if __name__ == "__main__": - main() diff --git a/makee_vala/git_scripts/extract_core_speaking_data.py b/makee_vala/git_scripts/extract_core_speaking_data.py deleted file mode 100644 index 237d266..0000000 --- a/makee_vala/git_scripts/extract_core_speaking_data.py +++ /dev/null @@ -1,681 +0,0 @@ -""" -筛选 整合 线上的 口语 核心互动 对话记录数据 - -数据筛选流程如下: -一 步骤一 -首先, 在 PGsql数据库中 筛选出 口语核心互动对应的 session_id. -数据库相关配置 从.env中读取: -PG_DB_HOST = xxx -PG_DB_PORT = xxx -PG_DB_USER = xxx -PG_DB_PASSWORD = xxx -PG_DB_DATABASE = xxx - -读取以下数据表: -user_component_play_record_0 ~ user_component_play_record_7 - -支持输入时间范围 -起始时间 和 截止时间 配置格式: "20250110" - -数据表中的时间字段为 updated_at , 格式样例: "2025-11-05 19:35:46.698246+08:00" - -在这些时间范围内,筛选以下数据: -c_type 为 core_speaking_reply 或者 core_speaking_inquiry 的数据 - -输出总的数据条数 - -然后导出 中间 excel文件 - -包含以下字段: -user_id, -session_id, -c_type, -c_id, -play_result, -updated_at - -二. 步骤二 -根据 c_type 和 c_id 筛选核心互动的配置 补充一些字段。 - -需要读取配置表: -mysql表 core_interaction_component -相关环境变量在.env: -MYSQL_HOST=xxx -MYSQL_USERNAME=xxx -MYSQL_PASSWORD=xxx -MYSQL_DATABASE=xxx -MYSQL_PORT=xxx - -基于 c_type 和 c_id 字段匹配, 在 步骤一表格内容基础上追加以下字段: -title -reference_dialog 从 component_config 中抽取出 reference_dialog 字段的内容。 -component_config内容样例: -``` -{"taskInfo":{"cId":"0000001","cType":"core_speaking_inquiry","title":"询问种植甜瓜的信息","taskDesc":"向Ben提问甜瓜种植的最佳季节、浇水频率和成熟的季节;","sceneDesc":"我和Ben到甜味城,参观了水果资源站和种植园。Ben的妈妈Kate讲了种植知识,我们都很感兴趣,想一起种甜瓜。我不懂,便问Ben,他虽没种过、不确定,还是告诉我注意事项。","img":"","key":[{"desc":"询问种植信息","keyList":[{"type":"default","npcId":269,"content":"Have you ever planted a ...?","desc":"你种过......吗?"},{"type":"default","npcId":269,"content":"What season is the best time to plant ...?","desc":"种植......的最佳时间是哪个季节?"},{"type":"default","npcId":269,"content":"Do ... need ... every day?","desc":"......需要每天浇......么?"}]}]},"dialogSetting":{"setting":{"npcName":"Ben","npcId":287,"round":5,"checkRound":3}},"dialogConfig":{"config":{"asrPrompt":"melon,summer,autumn,water,frequency,plant,season,harvest","promptInfo":{"default":"# 1. 角色(你要扮演谁)\n- 你是 Ben,一个 8 岁的小男孩,对种植水果感兴趣但不太确定具体细节。\n- 语言风格:简单、直接,偶尔带有不确定的语气。\n- 示例表达:\n - \"I think summer. It's warm then.\"\n - \"Maybe every two days? Not every day, I think.\"\n\n# 2. 任务(你如何参与到整个对话)\n- 你需要只在用户提问时提供信息,不会主动提及种植甜瓜的具体细节。\n- 如果用户提问相关内容,你需要根据知识库中的信息回答,不编造或偏离。\n- 如果用户的问题不清晰,你需要尝试澄清后再作答。\n- 如果用户长时间不提问或偏离主题,你需要温和、自然地进行交谈,引导回到主题。\n- 当所有知识点已传达后,你需要鼓励用户开始行动。\n\n# 3. 背景信息(引用配置)\n`你是 Ben,你和用户来到了甜味城Sweet Town。你们参观了水果资源站和种植园。在种植园中,你的妈妈Kate给你和用户介绍了一些种植水果的知识。你和用户对此很感兴趣。你们想要一起种一颗甜瓜。用户不知道种植甜瓜的知识,于是向你提问。虽然你没有种过甜瓜,对什么都不确定。但你还是回答了用户的问题,告诉用户关于种植甜瓜需要注意的事情。`\n\n# 4. 知识库(你知道的信息)\n- 种甜瓜的最佳季节:应该在夏天\n- 种甜瓜的浇水频率:应该隔一天浇一次水\n- 甜瓜成熟的季节:秋天\n- 如果用户提问相关内容,你会用这些信息来回答。\n\n# 5. 语言风格(固定内容)\n 1. 使用标准、正式的英语,水平为 CEFR A1/A2,每句话不超过 10 个单词\n 2. 始终保持礼貌和友好\n 3. 尽量避免重复表达,适当变换措辞\n\n# 6. 开场白\n你由你开始对话,你会说:“Let's plant a melon now! Or do you still have some questions?”\n\n# 7. 回应方式(固定内容)\n`你只在用户提问时才根据知识库中的信息回答。其他时候,以符合你身份的方式,自然地进行交谈。不主动提供信息,不偏离语境。`","final_goal":"Ben 说出了种植种甜瓜的最佳季节应该在夏天、种甜瓜的浇水频率应该隔一天浇一次水以及甜瓜成熟的季节是秋天","in_progress_goal":"Ben 说出了种植种甜瓜的最佳季节应该在夏天、种甜瓜的浇水频率应该隔一天浇一次水以及甜瓜成熟的季节是秋天","reference_dialog":"# 示例对话\nBen: Let's plant a melon now! Or do you still have some questions?\nYou: Great! When is the best time to plant it?\nBen: I think summer. It's warm then. Mom said melons like warm weather.\nYou: Oh, good. How often should we water it?\nBen: Maybe every two days? Not every day, I think.\nYou: And when will it be ready to eat?\nBen: Autumn, I guess. Plant in summer, get melons in autumn. That sounds right.","scene":"#任务背景\n你是 Ben,你和用户来到了甜味城Sweet Town。你们参观了水果资源站和种植园。在种植园中,你的妈妈Kate给你和用户介绍了一些种植水果的知识。你和用户对此很感兴趣。你们想要一起种一颗甜瓜。用户不知道种植甜瓜的知识,于是向你提问。虽然你没有种过甜瓜,对什么都不确定。但你还是回答了用户的问题,告诉用户关于种植甜瓜需要注意的事情。","user_knowledge":"# 知识\n- 询问种植信息\nHave you ever planted a ...? 你种过......吗?\nWhat season is the best time to plant ...? 种植......的最佳时间是哪个季节?\nDo ... need ... every day? ......需要每天浇......么?","user_scene":"我和Ben到甜味城,参观了水果资源站和种植园。Ben的妈妈Kate讲了种植知识,我们都很感兴趣,想一起种甜瓜。我不懂,便问Ben,他虽没种过、不确定,还是告诉我注意事项。","user_task":"向Ben提问甜瓜种植的最佳季节、浇水频率和成熟的季节;"}}},"studyInfo":{"learningPart":{"learning":[{"question":{"desc":"现在你需要询问Ben关于种植甜瓜的最佳季节。"},"optionList":[{"option":"When is the best time to plant it?","feedbackDesc":"太棒了!你正确地询问了种植甜瓜的最佳季节。请大声朗读这句话!"},{"option":"How often should we water it?","feedbackDesc":"这句话是询问浇水频率的,不是询问最佳种植季节的。请再试一次,询问Ben种植甜瓜的最佳季节。"},{"option":"When will it be ready to eat?","feedbackDesc":"这句话是询问甜瓜成熟季节的,不是询问最佳种植季节的。请再试一次,询问Ben种植甜瓜的最佳季节。"}],"answer":[0],"read":{"type":"user","npcId":30,"content":"When is the best time to plant it?"},"feedback":{"type":"npc","npcName":"Ben","npcId":287,"content":"I think summer. It's warm then. Mom said melons like warm weather."}},{"question":{"desc":"Ben告诉你种植甜瓜的最佳季节是夏天。现在你需要询问Ben关于种植甜瓜的浇水频率。"},"optionList":[{"option":"When is the best time to plant it?","feedbackDesc":"这句话是询问最佳种植季节的,不是询问浇水频率的。请再试一次,询问Ben种植甜瓜的浇水频率。"},{"option":"How often should we water it?","feedbackDesc":"太棒了!你正确地询问了种植甜瓜的浇水频率。请大声朗读这句话!"},{"option":"When will it be ready to eat?","feedbackDesc":"这句话是询问甜瓜成熟季节的,不是询问浇水频率的。请再试一次,询问Ben种植甜瓜的浇水频率。"}],"answer":[1],"read":{"type":"user","npcId":30,"content":"How often should we water it?"},"feedback":{"type":"npc","npcName":"Ben","npcId":287,"content":"Maybe every two days? Not every day, I think."}},{"question":{"desc":"Ben告诉你种植甜瓜的浇水频率是隔一天一次。现在你需要询问Ben关于甜瓜成熟的季节。"},"optionList":[{"option":"When is the best time to plant it?","feedbackDesc":"这句话是询问最佳种植季节的,不是询问甜瓜成熟季节的。请再试一次,询问Ben甜瓜成熟的季节。"},{"option":"How often should we water it?","feedbackDesc":"这句话是询问浇水频率的,不是询问甜瓜成熟季节的。请再试一次,询问Ben甜瓜成熟的季节。"},{"option":"When will it be ready to eat?","feedbackDesc":"太棒了!你正确地询问了甜瓜成熟的季节。请大声朗读这句话!"}],"answer":[2],"read":{"type":"user","npcId":30,"content":"When will it be ready to eat?"},"feedback":{"type":"npc","npcName":"Ben","npcId":287,"content":"Autumn, I guess. Plant in summer, get melons in autumn. That sounds right."}}],"opening":{"type":"npc","npcName":"Ben","npcId":287,"content":"Let's plant a melon now! Or do you still have some questions?","desc":"Ben邀请你一起种植甜瓜,并询问你是否还有问题。"},"closing":{"desc":"Ben已经回答了所有关于种植甜瓜的问题,任务成功完成!"}}},"kpInfoList":[{"kpId":"","kpType":"sentence","kpTitle":"What do you think about the fight?","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"","kpType":"sentence","kpTitle":"What do you think about the fight?","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"","kpType":"sentence","kpTitle":"Can you help us?","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"","kpType":"sentence","kpTitle":"Can you help us?","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"","kpType":"sentence","kpTitle":"Do you know any way to beat him?","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"","kpType":"sentence","kpTitle":"Do you know any way to beat him?","kpSkill":"sentence_meaning","kpSkillName":"语义"}]} -``` - -追加后,excel文件包含以下字段: -user_id, -session_id, -c_type, -c_id, -play_result, -updated_at, -title -reference_dialog - -三. 步骤三 追加对话历史数据 -对话历史数据,需要根据以下es数据库来补充: - -es索引: llm_roleplayagent_round_log -相关环境变量在.env: -ES_HOST=xxx -ES_PORT=xxx -ES_SCHEME=xxx -ES_USER=xxx -ES_PASSWORD=xxx - -基于每条记录中的 session_id, 匹配 es日志中 session_id 相同 且 action为 get_chat 对应的记录,整理后, 追加为 chat_log 字段。 - -es中的日志是每轮作为一条记录,按以下逻辑进行拼接: -读取 current_round, -current round 为 0 , 则 chat_log中加入 npc_message 的内容 "npc: " + npc_message -current round 为 1~n 按顺序 依次追加 user_input 和 npc_message , 每轮之间用换行符隔开。 -完全拼接后 最为 chat_log 内容 -完整样例: -``` -npc:xxx -user:xxx -npc:xxx -... ... -``` - -拼接完成后 追加 chat_log 和 round_num (取最大的current_round) - -最终输出的 excel文件字段: -user_id, -session_id, -c_type, -c_id, -play_result, -updated_at, -title, -reference_dialog, -chat_log, -user_behavior_info, -round_num - - ----------------------- -根据以上需求 提供一个数据处理的脚本 尽量用高效的匹配。 我只需要输出最终的匹配文件,一个简单的功能脚本。 脚本不需要太复杂。但在输出的节点增加必要的日志 方便我了解数据量和进度 输入 时间范围 在 脚本开头配置即可。 ----------------------- - -补充需求: -pg sql数据库中 增加字段 user_behavior_info 读取。 并保留到最终的输出excel文档中 在 chat_log字段之后。 其他不变。 ----------------------- - -补充需求 25.11.07: -从 mysql表中 额外读取两个字段的信息进行处理, - -1. lesson -抽取related_path字段中的lessonIndex内容 (4): -{"packageId":2,"unitId":26,"lessonId":128,"packageIndex":1,"unitIndex":12,"lessonIndex":4} - -2. knowledge_points -直接读取 kp_relation_info 的内容。 - -3. in_progress_goal -读取 和 reference_dialog 平级的 in_progress_goal 字段内容。 - -4. final_goal -读取 和 reference_dialog 平级的 final_goal 字段内容。 - -以上四个字段 都追加到最终输出的表中, -全部输出字段顺序如下: - -user_id, -session_id, -c_type, -c_id, -play_result, -updated_at, -title, -lesson, -knowledge_points, -in_progress_goal, -final_goal, -reference_dialog, -chat_log, -user_behavior_info, -round_num - - -""" - -import os -import json -import pandas as pd -import psycopg2 -import pymysql -from elasticsearch import Elasticsearch -from datetime import datetime -from dotenv import load_dotenv -import logging - -# 配置日志 -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) - -# 时间范围配置 - 修改这里的日期范围 -START_DATE = "20251001" # 起始时间 格式: "20250110" -END_DATE = "20251031" # 截止时间 格式: "20250131" - -class CoreSpeakingDataProcessor: - def __init__(self): - # 加载环境变量 - load_dotenv() - - # PG数据库配置 - self.pg_config = { - 'host': os.getenv('PG_DB_HOST'), - 'port': int(os.getenv('PG_DB_PORT', 5432)), - 'user': os.getenv('PG_DB_USER'), - 'password': os.getenv('PG_DB_PASSWORD'), - 'database': os.getenv('PG_DB_DATABASE') - } - - # MySQL数据库配置 - self.mysql_config = { - 'host': os.getenv('MYSQL_HOST'), - 'port': int(os.getenv('MYSQL_PORT', 3306)), - 'user': os.getenv('MYSQL_USERNAME'), - 'password': os.getenv('MYSQL_PASSWORD'), - 'database': os.getenv('MYSQL_DATABASE'), - 'charset': 'utf8mb4' - } - - # ES配置 - self.es_config = { - 'host': os.getenv('ES_HOST'), - 'port': int(os.getenv('ES_PORT', 9200)), - 'scheme': os.getenv('ES_SCHEME', 'http'), - 'user': os.getenv('ES_USER'), - 'password': os.getenv('ES_PASSWORD') - } - - self.data = None - - def convert_date_format(self, date_str): - """将'20250110'格式转换为数据库查询用的格式""" - try: - dt = datetime.strptime(date_str, '%Y%m%d') - return dt.strftime('%Y-%m-%d') - except ValueError: - logger.error(f"日期格式错误: {date_str}, 应为'20250110'格式") - raise - - def get_next_day(self, date_str): - """获取下一天的日期""" - try: - dt = datetime.strptime(date_str, '%Y%m%d') - next_day = dt + pd.Timedelta(days=1) - return next_day.strftime('%Y-%m-%d') - except ValueError: - logger.error(f"日期格式错误: {date_str}, 应为'20250110'格式") - raise - - def step1_extract_from_pg(self): - """步骤一: 从PG数据库筛选核心互动数据""" - logger.info("步骤一: 开始从PG数据库筛选数据...") - - start_date = self.convert_date_format(START_DATE) - end_date_next = self.get_next_day(END_DATE) # 获取结束日期的下一天 - logger.info(f"时间范围: {start_date} 到 {end_date_next} (不含)") - - # 构建查询SQL - 查询8个分表 - all_data = [] - table_names = [f"user_component_play_record_{i}" for i in range(8)] - - for table_name in table_names: - logger.info(f"正在处理表: {table_name}") - - # 为每个表创建独立的连接,避免事务问题 - try: - conn = psycopg2.connect(**self.pg_config) - logger.debug(f"为表 {table_name} 创建数据库连接") - except Exception as e: - logger.error(f"为表 {table_name} 创建数据库连接失败: {e}") - continue - - # 检查当前表是否存在 user_behavior_info 字段 - has_behavior_info = False - try: - with conn.cursor() as cur: - cur.execute( - """ - SELECT EXISTS ( - SELECT 1 FROM information_schema.columns - WHERE table_name = %s - AND column_name = 'user_behavior_info' - ) - """, - (table_name,) - ) - res = cur.fetchone() - has_behavior_info = bool(res[0]) if res else False - logger.debug(f"表 {table_name} 是否包含 user_behavior_info: {has_behavior_info}") - except Exception as e: - logger.warning(f"检测表 {table_name} 的 user_behavior_info 字段失败: {e}") - - # 动态构建查询列 - extra_col = ", user_behavior_info" if has_behavior_info else "" - sql = f""" - SELECT - user_id, - session_id, - c_type, - c_id, - play_result, - updated_at{extra_col} - FROM {table_name} - WHERE - updated_at >= %s - AND updated_at < %s - AND c_type IN ('core_speaking_reply', 'core_speaking_inquiry') - ORDER BY updated_at - """ - - try: - df = pd.read_sql(sql, conn, params=[start_date, end_date_next]) - # 保证列存在,即使部分分表没有该字段 - if 'user_behavior_info' not in df.columns: - df['user_behavior_info'] = '' - if not df.empty: - logger.info(f"表 {table_name} 获取到 {len(df)} 条数据") - all_data.append(df) - else: - logger.info(f"表 {table_name} 无符合条件的数据") - except Exception as e: - logger.error(f"查询表 {table_name} 失败: {e}") - finally: - conn.close() - - if all_data: - self.data = pd.concat(all_data, ignore_index=True) - logger.info(f"步骤一完成: 总共获取到 {len(self.data)} 条数据") - - # 统计 user_behavior_info 非空条数 - if 'user_behavior_info' in self.data.columns: - non_empty_behavior = (self.data['user_behavior_info'].astype(str).str.strip() != '').sum() - logger.info(f"步骤一: user_behavior_info 字段有值 {non_empty_behavior}/{len(self.data)} 条") - - # 处理datetime字段,去掉时区信息(Excel不支持带时区的datetime) - if 'updated_at' in self.data.columns: - self.data['updated_at'] = pd.to_datetime(self.data['updated_at']).dt.tz_localize(None) - logger.info("已处理updated_at字段的时区信息") - - # 输出中间Excel文件 - intermediate_file = f"core_speaking_step1_{START_DATE}_{END_DATE}.xlsx" - self.data.to_excel(intermediate_file, index=False) - logger.info(f"步骤一中间文件已保存: {intermediate_file}") - else: - logger.warning("步骤一: 未获取到任何数据") - self.data = pd.DataFrame() - - def step2_add_title_from_mysql(self): - """步骤二: 从MySQL补充title字段,并从component_config中提取reference_dialog等字段""" - if self.data.empty: - logger.warning("步骤二: 数据为空,跳过") - return - - logger.info("步骤二: 开始从MySQL补充title字段...") - - # 连接MySQL数据库 - try: - conn = pymysql.connect(**self.mysql_config) - logger.info("MySQL数据库连接成功") - except Exception as e: - logger.error(f"MySQL数据库连接失败: {e}") - raise - - # 获取所有需要查询的c_type和c_id组合 - unique_components = self.data[['c_type', 'c_id']].drop_duplicates() - logger.info(f"需要查询 {len(unique_components)} 个不同的组件配置") - - # 查询title、component_config、related_path和kp_relation_info - sql = """ - SELECT c_type, c_id, title, component_config, related_path, kp_relation_info - FROM core_interaction_component - WHERE (c_type, c_id) IN ({}) - """.format(','.join(['(%s,%s)'] * len(unique_components))) - - params = [] - for _, row in unique_components.iterrows(): - params.extend([row['c_type'], row['c_id']]) - - try: - title_df = pd.read_sql(sql, conn, params=params) - logger.info(f"从MySQL获取到 {len(title_df)} 条组件配置") - except Exception as e: - logger.error(f"查询MySQL失败: {e}") - title_df = pd.DataFrame(columns=['c_type', 'c_id', 'title', 'component_config', 'related_path', 'kp_relation_info']) - - conn.close() - - # 从related_path中解析lesson(lessonIndex) - def extract_lesson(related_path_str): - if related_path_str is None or related_path_str == '': - return '' - try: - data = json.loads(related_path_str) - if isinstance(data, dict): - lesson_index = data.get('lessonIndex') - return str(lesson_index) if lesson_index is not None else '' - return '' - except Exception: - return '' - - # 从component_config中解析reference_dialog、in_progress_goal和final_goal - def extract_config_fields(cfg_str): - result = { - 'reference_dialog': '', - 'in_progress_goal': '', - 'final_goal': '' - } - if cfg_str is None or cfg_str == '': - return result - try: - data = json.loads(cfg_str) - if isinstance(data, dict): - dialog_config = data.get('dialogConfig') or data.get('dialog_config') - if isinstance(dialog_config, dict): - config_obj = dialog_config.get('config') - if isinstance(config_obj, dict): - promptInfo = config_obj.get('promptInfo') - if isinstance(promptInfo, dict): - ref = promptInfo.get('reference_dialog') - result['reference_dialog'] = ref if isinstance(ref, str) else '' - - in_prog = promptInfo.get('in_progress_goal') - result['in_progress_goal'] = in_prog if isinstance(in_prog, str) else '' - - final = promptInfo.get('final_goal') - result['final_goal'] = final if isinstance(final, str) else '' - - return result - - # 兜底:如果顶层就有这些字段 - ref = data.get('reference_dialog') - result['reference_dialog'] = ref if isinstance(ref, str) else '' - - in_prog = data.get('in_progress_goal') - result['in_progress_goal'] = in_prog if isinstance(in_prog, str) else '' - - final = data.get('final_goal') - result['final_goal'] = final if isinstance(final, str) else '' - - return result - except Exception: - return result - - # 解析lesson - if 'related_path' in title_df.columns: - title_df['lesson'] = title_df['related_path'].apply(extract_lesson) - else: - title_df['lesson'] = '' - - # 解析knowledge_points(直接读取kp_relation_info) - if 'kp_relation_info' in title_df.columns: - title_df['knowledge_points'] = title_df['kp_relation_info'].fillna('') - else: - title_df['knowledge_points'] = '' - - # 解析component_config中的多个字段 - if 'component_config' in title_df.columns: - config_fields = title_df['component_config'].apply(extract_config_fields) - title_df['reference_dialog'] = config_fields.apply(lambda x: x['reference_dialog']) - title_df['in_progress_goal'] = config_fields.apply(lambda x: x['in_progress_goal']) - title_df['final_goal'] = config_fields.apply(lambda x: x['final_goal']) - else: - title_df['reference_dialog'] = '' - title_df['in_progress_goal'] = '' - title_df['final_goal'] = '' - - # 仅保留需要合并的列 - title_df = title_df[['c_type', 'c_id', 'title', 'lesson', 'knowledge_points', - 'in_progress_goal', 'final_goal', 'reference_dialog']] - - # 合并数据 - self.data = pd.merge( - self.data, - title_df, - on=['c_type', 'c_id'], - how='left' - ) - - # 填充空值 - self.data['title'] = self.data['title'].fillna('') - self.data['lesson'] = self.data['lesson'].fillna('') - self.data['knowledge_points'] = self.data['knowledge_points'].fillna('') - self.data['in_progress_goal'] = self.data['in_progress_goal'].fillna('') - self.data['final_goal'] = self.data['final_goal'].fillna('') - self.data['reference_dialog'] = self.data['reference_dialog'].fillna('') - - # 统计解析成功的字段条数 - non_empty_ref = (self.data['reference_dialog'] != '').sum() - non_empty_lesson = (self.data['lesson'] != '').sum() - non_empty_kp = (self.data['knowledge_points'] != '').sum() - non_empty_in_prog = (self.data['in_progress_goal'] != '').sum() - non_empty_final = (self.data['final_goal'] != '').sum() - - logger.info(f"步骤二完成: 已补充字段统计:") - logger.info(f" - lesson: {non_empty_lesson}/{len(self.data)} 条有值") - logger.info(f" - knowledge_points: {non_empty_kp}/{len(self.data)} 条有值") - logger.info(f" - in_progress_goal: {non_empty_in_prog}/{len(self.data)} 条有值") - logger.info(f" - final_goal: {non_empty_final}/{len(self.data)} 条有值") - logger.info(f" - reference_dialog: {non_empty_ref}/{len(self.data)} 条有值") - - # 输出中间Excel文件 - intermediate_file = f"core_speaking_step2_{START_DATE}_{END_DATE}.xlsx" - # 处理datetime字段,去掉时区信息(Excel不支持带时区的datetime) - if 'updated_at' in self.data.columns: - self.data['updated_at'] = pd.to_datetime(self.data['updated_at']).dt.tz_localize(None) - self.data.to_excel(intermediate_file, index=False) - logger.info(f"步骤二中间文件已保存: {intermediate_file}") - - def step3_add_chat_log_from_es(self): - """步骤三: 从ES补充对话历史数据""" - if self.data.empty: - logger.warning("步骤三: 数据为空,跳过") - return - - logger.info("步骤三: 开始从ES补充对话历史数据...") - - # 连接ES - try: - es_url = f"{self.es_config['scheme']}://{self.es_config['host']}:{self.es_config['port']}" - if self.es_config['user'] and self.es_config['password']: - es = Elasticsearch( - [es_url], - http_auth=(self.es_config['user'], self.es_config['password']) - ) - else: - es = Elasticsearch([es_url]) - - # 测试连接 - if es.ping(): - logger.info("ES连接成功") - else: - raise Exception("ES连接失败") - except Exception as e: - logger.error(f"ES连接失败: {e}") - # 添加空的chat_log和round_num字段 - self.data['chat_log'] = '' - self.data['round_num'] = 0 - return - - # 获取唯一的session_id - unique_sessions = self.data['session_id'].unique() - logger.info(f"需要查询 {len(unique_sessions)} 个不同的session") - - # 批量查询ES - chat_logs = {} - round_nums = {} - - batch_size = 100 - for i in range(0, len(unique_sessions), batch_size): - batch_sessions = unique_sessions[i:i+batch_size] - logger.info(f"正在处理session批次 {i//batch_size + 1}/{(len(unique_sessions)-1)//batch_size + 1}") - - try: - # 构建ES查询 - query = { - "query": { - "bool": { - "must": [ - {"terms": {"session_id": batch_sessions.tolist()}}, - {"term": {"action": "get_chat"}} - ] - } - }, - "size": 10000, - "sort": [ - {"session_id": {"order": "asc"}}, - {"current_round": {"order": "asc"}} - ] - } - - response = es.search(index="llm_roleplayagent_round_log", body=query) - hits = response['hits']['hits'] - - logger.info(f"本批次从ES获取到 {len(hits)} 条对话记录") - - # 按session_id分组处理 - session_rounds = {} - for hit in hits: - source = hit['_source'] - session_id = source.get('session_id') - current_round = source.get('current_round', 0) - - if session_id not in session_rounds: - session_rounds[session_id] = [] - - session_rounds[session_id].append({ - 'current_round': current_round, - 'user_input': source.get('user_input', ''), - 'npc_message': source.get('npc_message', '') - }) - - # 为每个session构建chat_log - for session_id, rounds in session_rounds.items(): - # 按round排序 - rounds.sort(key=lambda x: x['current_round']) - - chat_parts = [] - max_round = 0 - - for round_data in rounds: - current_round = round_data['current_round'] - max_round = max(max_round, current_round) - - if current_round == 0: - # round 0 只添加npc_message - if round_data['npc_message']: - chat_parts.append(f"npc:{round_data['npc_message']}") - else: - # round 1~n 添加user_input和npc_message - if round_data['user_input']: - chat_parts.append(f"user:{round_data['user_input']}") - if round_data['npc_message']: - chat_parts.append(f"npc:{round_data['npc_message']}") - - chat_logs[session_id] = '\n'.join(chat_parts) - round_nums[session_id] = max_round - - except Exception as e: - logger.error(f"查询ES批次失败: {e}") - continue - - logger.info(f"完成ES查询,获取到 {len(chat_logs)} 个session的对话记录") - - # 添加chat_log和round_num字段 - self.data['chat_log'] = self.data['session_id'].map(chat_logs).fillna('') - self.data['round_num'] = self.data['session_id'].map(round_nums).fillna(0) - - logger.info("步骤三完成: 对话历史数据已补充") - - def export_final_excel(self): - """导出最终Excel文件""" - if self.data.empty: - logger.warning("数据为空,无法导出") - return - - logger.info("开始导出最终Excel文件...") - - # 确保字段顺序 - final_columns = [ - 'user_id', 'session_id', 'c_type', 'c_id', - 'play_result', 'updated_at', 'title', 'lesson', 'knowledge_points', - 'in_progress_goal', 'final_goal', 'reference_dialog', - 'chat_log', 'user_behavior_info', 'round_num' - ] - - # 重新排列列顺序 - self.data = self.data[final_columns] - - # 处理datetime字段,去掉时区信息(Excel不支持带时区的datetime) - if 'updated_at' in self.data.columns: - self.data['updated_at'] = pd.to_datetime(self.data['updated_at']).dt.tz_localize(None) - logger.info("最终导出时已处理updated_at字段的时区信息") - - # 生成文件名 - output_file = f"core_speaking_final_{START_DATE}_{END_DATE}.xlsx" - - # 导出Excel - self.data.to_excel(output_file, index=False) - - logger.info(f"最终Excel文件已导出: {output_file}") - logger.info(f"总计导出 {len(self.data)} 条记录") - - # 输出字段统计 - logger.info("字段完整性统计:") - for col in final_columns: - if col in ['chat_log', 'title', 'reference_dialog', 'user_behavior_info', - 'lesson', 'knowledge_points', 'in_progress_goal', 'final_goal']: - non_empty = (self.data[col] != '').sum() - logger.info(f" {col}: {non_empty}/{len(self.data)} 条记录有值") - elif col == 'round_num': - non_zero = (self.data[col] > 0).sum() - logger.info(f" {col}: {non_zero}/{len(self.data)} 条记录 > 0") - - def process(self): - """执行完整的数据处理流程""" - logger.info("="*60) - logger.info("开始口语核心互动数据处理") - logger.info(f"时间范围: {START_DATE} - {END_DATE}") - logger.info("="*60) - - try: - # 步骤一: PG数据筛选 - self.step1_extract_from_pg() - - # 步骤二: MySQL补充title - self.step2_add_title_from_mysql() - - # 步骤三: ES补充对话历史 - self.step3_add_chat_log_from_es() - - # 导出最终文件 - self.export_final_excel() - - logger.info("="*60) - logger.info("数据处理完成!") - logger.info("="*60) - - except Exception as e: - logger.error(f"数据处理过程中发生错误: {e}") - raise - -if __name__ == "__main__": - processor = CoreSpeakingDataProcessor() - processor.process() diff --git a/makee_vala/git_scripts/extract_user_audio.py b/makee_vala/git_scripts/extract_user_audio.py deleted file mode 100644 index 50c5080..0000000 --- a/makee_vala/git_scripts/extract_user_audio.py +++ /dev/null @@ -1,480 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -用户音频数据筛选脚本 -功能:从PostgreSQL数据库的分表(user_component_play_record_0~7)中提取指定时间段的用户音频数据。 -主要逻辑: -1. 数据源:遍历 user_component_play_record_0 至 user_component_play_record_7 表。 -2. 筛选条件: - - 时间范围:可配置 - - 数据有效性:user_behavior_info 非空且包含 userAudio 和 pronunciationScore。 -3. 采样规则: - - 目标总数:可配置 - - 用户限制:可配置 - - 随机策略:先随机打乱,再按用户分组限制,最后补齐或截断至目标数量。 -4. 输出:导出为Excel文件。 - 包含字段: - - index: 序号 - - source_table: 来源表名 - - created_at: 创建时间 - - user_id: 用户ID - - component_unique_code: 组件唯一标识 - - pronunciationScore: 发音评分 - - userAudio: 音频链接 - - expressContent: 朗读内容文本 -""" - -import os -import json -import re -import random -import psycopg2 -import pymysql -import pandas as pd -from datetime import datetime -from typing import List, Dict, Any -from dotenv import load_dotenv - -# 配置参数 -CONFIG = { - # 筛选时间范围 - 'START_TIME': '2025-11-10 00:00:00+08:00', - 'END_TIME': '2025-12-10 23:59:59+08:00', - - # 采样参数 - 'TARGET_TOTAL': 10000, # 目标总样本数 - 'MAX_PER_USER': 20, # 单个用户最大样本数 - 'TABLE_COUNT': 8, # 分表数量 (0~N-1) - - # 组件类型过滤 - 'C_TYPE_FILTER': 'mid_sentence_dialogue' # 仅筛选对话互动组件 -} - -class AudioDataExtractor: - def __init__(self): - # 加载环境变量 - load_dotenv() - - # PostgreSQL数据库连接配置 - self.db_config = { - 'host': os.getenv('PG_DB_HOST'), - 'port': os.getenv('PG_DB_PORT'), - 'user': os.getenv('PG_DB_USER'), - 'password': os.getenv('PG_DB_PASSWORD'), - 'database': os.getenv('PG_DB_DATABASE') - } - - # MySQL数据库连接配置 - self.mysql_config = { - 'host': os.getenv('MYSQL_HOST'), - 'user': os.getenv('MYSQL_USERNAME'), - 'password': os.getenv('MYSQL_PASSWORD'), - 'database': "vala_test", - 'port': int(os.getenv('MYSQL_PORT', 3306)), - 'charset': 'utf8mb4' - } - - # 分表名称列表 - self.table_names = [f'user_component_play_record_{i}' for i in range(CONFIG['TABLE_COUNT'])] - - - # 目标总数 - self.target_total = CONFIG['TARGET_TOTAL'] - # 每个用户最多记录数 - self.max_per_user = CONFIG['MAX_PER_USER'] - - def get_db_connection(self): - """获取数据库连接""" - try: - conn = psycopg2.connect(**self.db_config) - return conn - except Exception as e: - print(f"数据库连接失败: {e}") - raise - - def extract_audio_info(self, user_behavior_info: str) -> Dict[str, Any]: - """从user_behavior_info字段中提取音频信息""" - try: - behavior_data = json.loads(user_behavior_info) - if isinstance(behavior_data, list) and len(behavior_data) > 0: - # 取第一个元素 - data = behavior_data[0] - if 'userAudio' in data and 'pronunciationScore' in data: - return { - 'userAudio': data.get('userAudio'), - 'pronunciationScore': data.get('pronunciationScore'), - 'expressContent': data.get('expressContent') - } - except (json.JSONDecodeError, KeyError, IndexError): - pass - return {} - - def query_table_data(self, table_name: str) -> List[Dict]: - """查询单个表的数据""" - conn = self.get_db_connection() - cursor = conn.cursor() - - try: - query = f""" - SELECT user_id, component_unique_code, c_type, c_id, created_at, user_behavior_info - FROM {table_name} - WHERE created_at >= '{CONFIG['START_TIME']}' - AND created_at <= '{CONFIG['END_TIME']}' - AND c_type = '{CONFIG['C_TYPE_FILTER']}' - AND user_behavior_info IS NOT NULL - AND user_behavior_info != '' - """ - - cursor.execute(query) - rows = cursor.fetchall() - - results = [] - for row in rows: - user_id, component_unique_code, c_type, c_id, created_at, user_behavior_info = row - - # 提取音频信息 - audio_info = self.extract_audio_info(user_behavior_info) - if audio_info and 'userAudio' in audio_info and 'pronunciationScore' in audio_info: - results.append({ - 'source_table': table_name, - 'user_id': user_id, - 'component_unique_code': component_unique_code, - 'c_type': c_type, - 'c_id': c_id, - 'created_at': created_at, - 'userAudio': audio_info['userAudio'], - 'pronunciationScore': audio_info['pronunciationScore'], - 'expressContent': audio_info.get('expressContent') - }) - - return results - - finally: - cursor.close() - conn.close() - - def get_component_configs(self, data: List[Dict]) -> Dict[str, str]: - """从MySQL批量获取组件配置信息""" - # 提取所有unique的(c_type, c_id)组合 - unique_components = set() - for record in data: - if 'c_type' in record and 'c_id' in record: - unique_components.add((record['c_type'], record['c_id'])) - - if not unique_components: - print("没有需要查询的组件") - return {} - - print(f"正在从MySQL查询 {len(unique_components)} 个组件的配置信息...") - - # 连接MySQL - try: - conn = pymysql.connect(**self.mysql_config) - cursor = conn.cursor() - - # 存储组件配置的字典,key为"c_type-c_id" - component_configs = {} - - # 批量查询 - for c_type, c_id in unique_components: - query = """ - SELECT component_config - FROM middle_interaction_component - WHERE c_type = %s AND c_id = %s - """ - cursor.execute(query, (c_type, c_id)) - result = cursor.fetchone() - - if result and result[0]: - key = f"{c_type}-{c_id}" - component_configs[key] = result[0] - - cursor.close() - conn.close() - - print(f"成功查询到 {len(component_configs)} 个组件配置") - return component_configs - - except Exception as e: - print(f"查询MySQL组件配置失败: {e}") - return {} - - @staticmethod - def clean_text(text: str) -> str: - """清理文本:转小写,去除标点符号和空格""" - if not text: - return "" - # 转小写 - text = text.lower() - # 去除标点符号和特殊字符,只保留字母和数字 - text = re.sub(r'[^\w\s]', '', text) - # 去除多余空格 - text = re.sub(r'\s+', '', text) - return text - - @staticmethod - def levenshtein_distance(s1: str, s2: str) -> int: - """计算两个字符串的Levenshtein编辑距离""" - if len(s1) < len(s2): - return AudioDataExtractor.levenshtein_distance(s2, s1) - - if len(s2) == 0: - return len(s1) - - previous_row = range(len(s2) + 1) - for i, c1 in enumerate(s1): - current_row = [i + 1] - for j, c2 in enumerate(s2): - # 插入、删除、替换的成本 - insertions = previous_row[j + 1] + 1 - deletions = current_row[j] + 1 - substitutions = previous_row[j] + (c1 != c2) - current_row.append(min(insertions, deletions, substitutions)) - previous_row = current_row - - return previous_row[-1] - - def parse_and_filter_by_config(self, data: List[Dict], component_configs: Dict[str, str]) -> List[Dict]: - """解析组件配置并筛选question.mode == 'read'的记录""" - print(f"\n开始根据组件配置筛选数据...") - print(f"筛选前数据量: {len(data)}") - - filtered_data = [] - skipped_no_config = 0 - skipped_invalid_json = 0 - skipped_wrong_mode = 0 - - for record in data: - c_type = record.get('c_type') - c_id = record.get('c_id') - - if not c_type or not c_id: - continue - - # 获取组件配置 - key = f"{c_type}-{c_id}" - config_str = component_configs.get(key) - - if not config_str: - skipped_no_config += 1 - continue - - try: - # 解析JSON配置 - config = json.loads(config_str) - - # 检查question.mode == "read" - question = config.get('question', {}) - mode = question.get('mode') - - if mode == 'read': - # 提取question.content作为refText - ref_text = question.get('content', '') - record['refText'] = ref_text - - # 计算编辑距离 - express_content = record.get('expressContent', '') - - # 清理文本(去除标点和大小写差异) - cleaned_express = self.clean_text(express_content) - cleaned_ref = self.clean_text(ref_text) - - # 计算编辑距离 - edit_distance = self.levenshtein_distance(cleaned_express, cleaned_ref) - record['editDistance'] = edit_distance - - # 计算相对编辑距离 - ref_len = len(cleaned_ref) - if ref_len > 0: - relative_edit_distance = round(edit_distance / ref_len, 4) - else: - relative_edit_distance = 0 - record['relativeEditDistance'] = relative_edit_distance - - filtered_data.append(record) - else: - skipped_wrong_mode += 1 - - except (json.JSONDecodeError, AttributeError, TypeError): - skipped_invalid_json += 1 - continue - - print(f"筛选后数据量: {len(filtered_data)}") - print(f" - 缺少配置: {skipped_no_config}") - print(f" - 配置解析失败: {skipped_invalid_json}") - print(f" - mode不是read: {skipped_wrong_mode}") - - return filtered_data - - def collect_all_data(self) -> List[Dict]: - """收集所有表的数据""" - all_data = [] - - for table_name in self.table_names: - print(f"正在查询表: {table_name}") - try: - table_data = self.query_table_data(table_name) - all_data.extend(table_data) - print(f"表 {table_name} 查询到 {len(table_data)} 条记录") - except Exception as e: - print(f"查询表 {table_name} 失败: {e}") - continue - - print(f"总共收集到 {len(all_data)} 条有效记录") - - if not all_data: - return [] - - # 从MySQL获取组件配置 - component_configs = self.get_component_configs(all_data) - - # 根据组件配置筛选数据(只保留question.mode == "read"的记录) - filtered_data = self.parse_and_filter_by_config(all_data, component_configs) - - return filtered_data - - def random_filter_data(self, data: List[Dict]) -> List[Dict]: - """随机筛选数据(不按评分分段控制)""" - # 随机打乱所有数据 - shuffled_data = data.copy() - random.shuffle(shuffled_data) - - print(f"开始随机筛选,总共 {len(shuffled_data)} 条记录") - return shuffled_data - - def apply_user_constraints(self, data: List[Dict]) -> List[Dict]: - """应用用户约束(每个用户最多2条)""" - user_records = {} - - # 按用户分组 - for record in data: - user_id = record['user_id'] - if user_id not in user_records: - user_records[user_id] = [] - user_records[user_id].append(record) - - # 每个用户最多选择2条 - final_data = [] - for user_id, records in user_records.items(): - if len(records) <= self.max_per_user: - final_data.extend(records) - else: - # 随机选择2条 - selected = random.sample(records, self.max_per_user) - final_data.extend(selected) - - return final_data - - def export_to_excel(self, data: List[Dict], filename: str = 'user_audio_data.xlsx'): - """导出数据到Excel文件""" - # 准备导出数据 - export_data = [] - for i, record in enumerate(data): - # 处理时区问题 - 转换为本地时间字符串 - created_at = record['created_at'] - if hasattr(created_at, 'tz_localize'): - created_at = created_at.tz_localize(None) - elif hasattr(created_at, 'replace'): - created_at = created_at.replace(tzinfo=None) - - export_data.append({ - 'index': i, - 'source_table': record['source_table'], - 'created_at': created_at, - 'user_id': record['user_id'], - 'component_unique_code': record['component_unique_code'], - 'c_type': record.get('c_type'), - 'c_id': record.get('c_id'), - 'pronunciationScore': record['pronunciationScore'], - 'userAudio': record['userAudio'], - 'expressContent': record.get('expressContent'), - 'refText': record.get('refText'), - 'editDistance': record.get('editDistance'), - 'relativeEditDistance': record.get('relativeEditDistance') - }) - - # 创建DataFrame并导出 - df = pd.DataFrame(export_data) - df.to_excel(filename, index=False) - print(f"数据已导出到: {filename}") - print(f"总共导出 {len(export_data)} 条记录") - - # 打印统计信息 - self.print_statistics(data) - - def print_statistics(self, data: List[Dict]): - """打印统计信息""" - print("\n=== 数据统计 ===") - - # 评分统计(显示分布情况但不按区间分组) - scores = [record['pronunciationScore'] for record in data] - print(f"\n评分统计:") - print(f" 总记录数: {len(scores)}") - print(f" 最高分: {max(scores)}") - print(f" 最低分: {min(scores)}") - print(f" 平均分: {sum(scores) / len(scores):.2f}") - - # 用户分布统计 - user_counts = {} - for record in data: - user_id = record['user_id'] - user_counts[user_id] = user_counts.get(user_id, 0) + 1 - - print(f"\n用户统计:") - print(f" 总用户数: {len(user_counts)}") - print(f" 平均每用户记录数: {len(data) / len(user_counts):.2f}") - - # 表分布统计 - table_counts = {} - for record in data: - table = record['source_table'] - table_counts[table] = table_counts.get(table, 0) + 1 - - print(f"\n表分布:") - for table, count in sorted(table_counts.items()): - print(f" {table}: {count} 条") - - def run(self): - """运行主流程""" - print("开始提取用户音频数据...") - - # 1. 收集所有数据 - all_data = self.collect_all_data() - - if not all_data: - print("未找到符合条件的数据") - return - - # 2. 随机筛选数据(不按评分分段控制) - filtered_data = self.random_filter_data(all_data) - - # 3. 应用用户约束 - final_data = self.apply_user_constraints(filtered_data) - - # 4. 如果数据不足500条,尝试补充 - if len(final_data) < self.target_total: - print(f"当前数据量 {len(final_data)} 条,少于目标 {self.target_total} 条") - # 从剩余数据中补充 - used_records = set((r['user_id'], r['component_unique_code'], str(r['created_at'])) for r in final_data) - available_data = [r for r in all_data if (r['user_id'], r['component_unique_code'], str(r['created_at'])) not in used_records] - - needed = self.target_total - len(final_data) - if len(available_data) >= needed: - additional = random.sample(available_data, needed) - final_data.extend(additional) - - # 5. 如果超过500条,随机选择500条 - if len(final_data) > self.target_total: - final_data = random.sample(final_data, self.target_total) - - # 6. 导出到Excel - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"user_audio_data_{timestamp}.xlsx" - self.export_to_excel(final_data, filename) - -def main(): - extractor = AudioDataExtractor() - extractor.run() - -if __name__ == "__main__": - main() diff --git a/makee_vala/git_scripts/sample_unit_challenge_data_from_es.py b/makee_vala/git_scripts/sample_unit_challenge_data_from_es.py deleted file mode 100644 index 16b33fc..0000000 --- a/makee_vala/git_scripts/sample_unit_challenge_data_from_es.py +++ /dev/null @@ -1,463 +0,0 @@ -""" -从es中 筛选用户数据 - -es相关配置通过以下环节变量 - -ES_HOST=xxx -ES_PORT=9200 -ES_SCHEME=https -ES_USER=elastic -ES_PASSWORD=xxx - - -index: user-audio - -脚本思路: - -给定 一些过滤参数; 给定导出的excel文件名 (在脚本中以变量方式配置就行) - -导出我要的字段内容到一个 excel - -过滤字段: -timeStr: 字段内容为str 格式为: 2024-12-31 15:53:19 -期望支持配置 开始 日期 和 结束日期 (可以只配置一个 只配 开始日期 则筛选 >= 开始日期的记录, 只配结束日期 则筛选 <= 结束日期的记录) - -输出字段内容支持配置: - - -""" - -import os -from datetime import datetime -from dotenv import load_dotenv -from elasticsearch import Elasticsearch -import pandas as pd -import urllib.parse -from collections import defaultdict - -# 加载环境变量 -load_dotenv() - -# 配置参数 -INDEX_NAME = "llm_ai_tools_log" -OUTPUT_FILE = "单元挑战用户数据_250906_251024.xlsx" -START_DATE = "2025-09-06 00:00:00" # 开始日期,格式: YYYY-MM-DD HH:MM:SS,设为None则不限制 -END_DATE = "2025-10-24 00:00:00" # 结束日期,格式: YYYY-MM-DD HH:MM:SS,设为None则不限制 - -# type字段过滤配置:筛选指定类型的记录,为空则不限制 -FILTER_TYPES = ["sent_check_challenge", "speaking_topic_challenge"] - -# 可选的 userId 过滤配置:配置为[int, ...] 列表;为空则不限制 -FILTER_USER_IDS = [] # 例如: [123, 456] - -# 需要导出的字段 -EXPORT_FIELDS = [ - "type", - "question", - "user_answer", - "time_total_ms", - "score", - "is_passed", - "model", - "write_time_str", - "write_time_int", -] - - - -def create_es_client(): - """创建Elasticsearch客户端""" - # 获取环境变量并打印调试信息 - es_host = os.getenv('ES_HOST') - es_port = os.getenv('ES_PORT', 9200) - es_scheme = os.getenv('ES_SCHEME', 'https') - es_user = os.getenv('ES_USER') - es_password = os.getenv('ES_PASSWORD') - - print(f"[DEBUG] ES配置信息:") - print(f" ES_HOST: {es_host}") - print(f" ES_PORT: {es_port}") - print(f" ES_SCHEME: {es_scheme}") - print(f" ES_USER: {es_user}") - print(f" ES_PASSWORD: {'***已设置***' if es_password else '未设置'}") - - # 检查必要的环境变量 - if not es_host: - raise ValueError("ES_HOST环境变量未设置") - if not es_user: - raise ValueError("ES_USER环境变量未设置") - if not es_password: - raise ValueError("ES_PASSWORD环境变量未设置") - - # URL编码用户名和密码,处理特殊字符 - encoded_user = urllib.parse.quote(es_user, safe='') - encoded_password = urllib.parse.quote(es_password, safe='') - - print(f"[DEBUG] 原始密码包含特殊字符,已进行URL编码") - - # 方式1: 使用URL中嵌入认证信息 - host_url_with_auth = f"{es_scheme}://{encoded_user}:{encoded_password}@{es_host}:{es_port}" - print(f"[DEBUG] 连接URL (带认证): {es_scheme}://{encoded_user}:***@{es_host}:{es_port}") - - try: - # 尝试方式1: URL中嵌入认证 - es_config_1 = { - 'hosts': [host_url_with_auth], - 'verify_certs': False, - 'ssl_show_warn': False, - 'request_timeout': 30, - 'retry_on_timeout': True - } - - print("[DEBUG] 尝试方式1: URL中嵌入认证信息") - es_client = Elasticsearch(**es_config_1) - - # 测试连接 - info = es_client.info() - print(f"[SUCCESS] 方式1连接成功") - return es_client - - except Exception as e1: - print(f"[DEBUG] 方式1失败: {e1}") - - try: - # 尝试方式2: 使用basic_auth参数 - host_url = f"{es_scheme}://{es_host}:{es_port}" - es_config_2 = { - 'hosts': [host_url], - 'basic_auth': (es_user, es_password), - 'verify_certs': False, - 'ssl_show_warn': False, - 'request_timeout': 30, - 'retry_on_timeout': True - } - - print("[DEBUG] 尝试方式2: 使用basic_auth参数") - es_client = Elasticsearch(**es_config_2) - - # 测试连接 - info = es_client.info() - print(f"[SUCCESS] 方式2连接成功") - return es_client - - except Exception as e2: - print(f"[DEBUG] 方式2失败: {e2}") - - try: - # 尝试方式3: 使用http_auth参数 (旧版本兼容) - es_config_3 = { - 'hosts': [host_url], - 'http_auth': (es_user, es_password), - 'verify_certs': False, - 'ssl_show_warn': False, - 'request_timeout': 30, - 'retry_on_timeout': True - } - - print("[DEBUG] 尝试方式3: 使用http_auth参数") - es_client = Elasticsearch(**es_config_3) - - # 测试连接 - info = es_client.info() - print(f"[SUCCESS] 方式3连接成功") - return es_client - - except Exception as e3: - print(f"[DEBUG] 方式3失败: {e3}") - print(f"[ERROR] 所有认证方式都失败了") - raise e3 - -def build_query(start_date=None, end_date=None): - """构建ES查询条件""" - # 构建基础查询条件 - must_conditions = [] - - # 添加时间范围条件 - if start_date or end_date: - range_query = {} - - if start_date: - start_timestamp = int(datetime.strptime(start_date, "%Y-%m-%d %H:%M:%S").timestamp()) - range_query["gte"] = start_timestamp - print(f"[DEBUG] 开始时间戳: {start_timestamp} (对应 {start_date})") - - if end_date: - end_timestamp = int(datetime.strptime(end_date, "%Y-%m-%d %H:%M:%S").timestamp()) - range_query["lte"] = end_timestamp - print(f"[DEBUG] 结束时间戳: {end_timestamp} (对应 {end_date})") - - must_conditions.append({ - "range": { - "write_time_int": range_query - } - }) - - # 如果配置了 userId 列表,则仅选取对应 userId 的数据 - if FILTER_USER_IDS: - print(f"[DEBUG] 应用 userId 过滤: {FILTER_USER_IDS}") - must_conditions.append({ - "terms": { - "userId": FILTER_USER_IDS - } - }) - - # 如果配置了 type 列表,则仅选取对应 type 的数据 - if FILTER_TYPES: - print(f"[DEBUG] 应用 type 过滤: {FILTER_TYPES}") - must_conditions.append({ - "terms": { - "type": FILTER_TYPES - } - }) - - # 构建最终查询 - if must_conditions: - query = { - "bool": { - "must": must_conditions - } - } - else: - query = {"match_all": {}} - - print(f"[DEBUG] 查询条件: {query}") - - return { - "query": query, - "_source": EXPORT_FIELDS, - "sort": [{"write_time_int": {"order": "desc"}}] - } - -def fetch_data_from_es(es_client, start_date=None, end_date=None): - """从ES获取数据""" - query = build_query(start_date, end_date) - - try: - print(f"[DEBUG] 执行ES查询,使用scroll获取全量数据...") - - # 使用scroll API获取全量数据 - scroll_size = 1000 # 每次scroll获取的数据量 - scroll_timeout = '2m' # scroll超时时间 - - # 初始化scroll - query['size'] = scroll_size - response = es_client.search( - index=INDEX_NAME, - body=query, - scroll=scroll_timeout - ) - - scroll_id = response['_scroll_id'] - hits = response['hits']['hits'] - total_hits = response['hits']['total'] - - # 获取总数(兼容不同ES版本) - if isinstance(total_hits, dict): - total_count = total_hits['value'] - else: - total_count = total_hits - - print(f"[DEBUG] ES中匹配的总记录数: {total_count}") - - all_data = [] - batch_count = 1 - - # 处理第一批数据 - for hit in hits: - source = hit['_source'] - row = {} - for field in EXPORT_FIELDS: - row[field] = source.get(field, "") - all_data.append(row) - - print(f"[DEBUG] 已获取第 {batch_count} 批数据,当前总数: {len(all_data)}") - - # 继续scroll获取剩余数据 - while len(hits) == scroll_size: - batch_count += 1 - response = es_client.scroll(scroll_id=scroll_id, scroll=scroll_timeout) - scroll_id = response['_scroll_id'] - hits = response['hits']['hits'] - - for hit in hits: - source = hit['_source'] - row = {} - for field in EXPORT_FIELDS: - row[field] = source.get(field, "") - all_data.append(row) - - print(f"[DEBUG] 已获取第 {batch_count} 批数据,当前总数: {len(all_data)}") - - # 清理scroll - try: - es_client.clear_scroll(scroll_id=scroll_id) - except: - pass # 忽略清理错误 - - print(f"[DEBUG] 从ES获取到数据 {len(all_data)} 条记录") - return all_data - - except Exception as e: - print(f"查询ES时出错: {e}") - return [] - -def export_to_excel(data, filename): - """导出数据到Excel""" - if not data: - print("没有数据可导出") - return - - df = pd.DataFrame(data) - - try: - df.to_excel(filename, index=False, engine='openpyxl') - print(f"数据已导出到: {filename}") - print(f"共导出 {len(data)} 条记录") - except Exception as e: - print(f"导出Excel时出错: {e}") - -def debug_es_data(es_client): - """调试ES数据,了解实际数据情况""" - print("\n" + "="*60) - print("开始调试ES数据...") - - try: - # 1. 查询总数据量 - total_query = { - "query": {"match_all": {}}, - "size": 0 - } - response = es_client.search(index=INDEX_NAME, body=total_query) - total_count = response['hits']['total'] - if isinstance(total_count, dict): - total_count = total_count['value'] - print(f"[DEBUG] ES索引 '{INDEX_NAME}' 中总数据量: {total_count}") - - if total_count == 0: - print("[ERROR] ES索引中没有任何数据!") - return - - # 2. 查询最近的几条数据,了解数据结构 - sample_query = { - "query": {"match_all": {}}, - "size": 5, - "sort": [{"_id": {"order": "desc"}}] - } - response = es_client.search(index=INDEX_NAME, body=sample_query) - hits = response['hits']['hits'] - - print(f"[DEBUG] 获取到 {len(hits)} 条样本数据:") - for i, hit in enumerate(hits): - source = hit['_source'] - - print(f" 样本 {i+1}:") - print(f" write_time_int: {source.get('write_time_int', 'N/A')}") - print(f" timeStr: {source.get('timeStr', 'N/A')}") - print(f" type: {source.get('type', 'N/A')}") - print(f" userId: {source.get('userId', 'N/A')}") - - # 3. 查询时间范围内的数据 - time_range_query = { - "query": { - "range": { - "write_time_int": { - "gte": int(datetime.strptime(START_DATE, "%Y-%m-%d %H:%M:%S").timestamp()), - "lte": int(datetime.strptime(END_DATE, "%Y-%m-%d %H:%M:%S").timestamp()) - } - } - }, - "size": 0 - } - response = es_client.search(index=INDEX_NAME, body=time_range_query) - time_range_count = response['hits']['total'] - if isinstance(time_range_count, dict): - time_range_count = time_range_count['value'] - print(f"[DEBUG] 时间范围内数据量 ({START_DATE} 到 {END_DATE}): {time_range_count}") - - # 4. 查询时间范围的实际数据分布 - print(f"[DEBUG] 检查时间字段的实际值范围...") - agg_query = { - "query": {"match_all": {}}, - "size": 0, - "aggs": { - "time_stats": { - "stats": { - "field": "write_time_int" - } - } - } - } - response = es_client.search(index=INDEX_NAME, body=agg_query) - if 'aggregations' in response: - stats = response['aggregations']['time_stats'] - min_time = stats.get('min') - max_time = stats.get('max') - if min_time and max_time: - min_date = datetime.fromtimestamp(min_time).strftime("%Y-%m-%d %H:%M:%S") - max_date = datetime.fromtimestamp(max_time).strftime("%Y-%m-%d %H:%M:%S") - print(f" 最早时间: {min_date} (时间戳: {min_time})") - print(f" 最晚时间: {max_date} (时间戳: {max_time})") - - except Exception as e: - print(f"[ERROR] 调试ES数据时出错: {e}") - - print("="*60 + "\n") - -def main(): - """主函数""" - print("开始从ES获取单元挑战数据...") - print(f"索引: {INDEX_NAME}") - print(f"开始日期: {START_DATE if START_DATE else '不限制'}") - print(f"结束日期: {END_DATE if END_DATE else '不限制'}") - if FILTER_TYPES: - print(f"类型过滤: {FILTER_TYPES}") - if FILTER_USER_IDS: - print(f"用户ID过滤: {FILTER_USER_IDS}") - print("-" * 50) - - # 检查.env文件是否存在 - env_file = ".env" - if not os.path.exists(env_file): - print(f"[ERROR] {env_file} 文件不存在,请创建并配置ES连接信息") - print("参考 .env.example 文件进行配置") - return - - print(f"[DEBUG] 找到环境配置文件: {env_file}") - - # 创建ES客户端 - try: - es_client = create_es_client() - except ValueError as e: - print(f"[ERROR] 配置错误: {e}") - print("请检查 .env 文件中的ES配置") - return - except Exception as e: - print(f"[ERROR] 创建ES客户端失败: {e}") - return - - # 测试连接 - try: - print("[DEBUG] 正在测试ES连接...") - # ES客户端创建函数中已经包含了连接测试,这里不需要重复测试 - print(f"[SUCCESS] ES连接已建立") - except Exception as e: - print(f"[ERROR] ES连接失败: {e}") - print("\n可能的解决方案:") - print("1. 检查ES服务是否正常运行") - print("2. 验证.env文件中的ES_HOST、ES_USER、ES_PASSWORD是否正确") - print("3. 确认网络连接是否正常") - print("4. 检查ES用户权限是否足够") - print("5. 密码中包含特殊字符,已尝试URL编码处理") - return - - # 获取数据 - data = fetch_data_from_es(es_client, START_DATE, END_DATE) - - # 导出到Excel - if data: - export_to_excel(data, OUTPUT_FILE) - else: - print("未获取到任何数据") - -if __name__ == "__main__": - main() diff --git a/makee_vala/git_scripts/sample_user_data_from_es.py b/makee_vala/git_scripts/sample_user_data_from_es.py deleted file mode 100644 index 3a1e415..0000000 --- a/makee_vala/git_scripts/sample_user_data_from_es.py +++ /dev/null @@ -1,599 +0,0 @@ -""" -从es中采样用户数据 - -es相关配置通过以下环节变量 - -ES_HOST=xxx -ES_PORT=9200 -ES_SCHEME=https -ES_USER=elastic -ES_PASSWORD=xxx - - -index: user-audio - -脚本思路: - -给定 一些过滤参数; 给定导出的excel文件名 (在脚本中以变量方式配置就行) - -导出我要的字段内容到一个 excel - -过滤字段: -timeStr: 字段内容为str 格式为: 2024-12-31 15:53:19 -期望支持配置 开始 日期 和 结束日期 (可以只配置一个 只配 开始日期 则筛选 >= 开始日期的记录, 只配结束日期 则筛选 <= 结束日期的记录) - -输出以下字段内容: - -userId -userMsg -userName -soeData -audioUrl -asrStatus -componentId -componentType -dataVersion - -""" - -import os -from datetime import datetime -from dotenv import load_dotenv -from elasticsearch import Elasticsearch -import pandas as pd -import urllib.parse -import re -from collections import defaultdict - -# 加载环境变量 -load_dotenv() - -# 配置参数 -INDEX_NAME = os.getenv("ES_INDEX", "user-audio") -OUTPUT_FILE = "user_audio_data.xlsx" -START_DATE = "2025-10-15 00:00:00" # 开始日期,格式: YYYY-MM-DD HH:MM:SS,设为None则不限制 -END_DATE = "2025-10-17 00:00:00" # 结束日期,格式: YYYY-MM-DD HH:MM:SS,设为None则不限制 - -# 可选的 userId 过滤配置:配置为[int, ...] 列表;为空则不限制 -FILTER_USER_IDS = [356] # 例如: [123, 456] - -# 采样配置参数 -MAX_SAMPLES_PER_USER_MSG = 50 # 每个不重复的userMsg最多采样的数据条数 -MAX_SAMPLES_PER_USER_ID = 20 # 每个userId最多采样的数据条数 - -# 需要导出的字段 -EXPORT_FIELDS = [ - "userId", - "userMsg", - "userName", - "soeData", - "audioUrl", - "asrStatus", - "componentId", - "componentType", - "dataVersion", - "timeStr" -] - -def create_es_client(): - """创建Elasticsearch客户端""" - # 获取环境变量并打印调试信息 - es_host = os.getenv('ES_HOST') - es_port = os.getenv('ES_PORT', 9200) - es_scheme = os.getenv('ES_SCHEME', 'https') - es_user = os.getenv('ES_USER') - es_password = os.getenv('ES_PASSWORD') - - print(f"[DEBUG] ES配置信息:") - print(f" ES_HOST: {es_host}") - print(f" ES_PORT: {es_port}") - print(f" ES_SCHEME: {es_scheme}") - print(f" ES_USER: {es_user}") - print(f" ES_PASSWORD: {'***已设置***' if es_password else '未设置'}") - - # 检查必要的环境变量 - if not es_host: - raise ValueError("ES_HOST环境变量未设置") - if not es_user: - raise ValueError("ES_USER环境变量未设置") - if not es_password: - raise ValueError("ES_PASSWORD环境变量未设置") - - # URL编码用户名和密码,处理特殊字符 - encoded_user = urllib.parse.quote(es_user, safe='') - encoded_password = urllib.parse.quote(es_password, safe='') - - print(f"[DEBUG] 原始密码包含特殊字符,已进行URL编码") - - # 方式1: 使用URL中嵌入认证信息 - host_url_with_auth = f"{es_scheme}://{encoded_user}:{encoded_password}@{es_host}:{es_port}" - print(f"[DEBUG] 连接URL (带认证): {es_scheme}://{encoded_user}:***@{es_host}:{es_port}") - - try: - # 尝试方式1: URL中嵌入认证 - es_config_1 = { - 'hosts': [host_url_with_auth], - 'verify_certs': False, - 'ssl_show_warn': False, - 'request_timeout': 30, - 'retry_on_timeout': True - } - - print("[DEBUG] 尝试方式1: URL中嵌入认证信息") - es_client = Elasticsearch(**es_config_1) - - # 测试连接 - info = es_client.info() - print(f"[SUCCESS] 方式1连接成功") - return es_client - - except Exception as e1: - print(f"[DEBUG] 方式1失败: {e1}") - - try: - # 尝试方式2: 使用basic_auth参数 - host_url = f"{es_scheme}://{es_host}:{es_port}" - es_config_2 = { - 'hosts': [host_url], - 'basic_auth': (es_user, es_password), - 'verify_certs': False, - 'ssl_show_warn': False, - 'request_timeout': 30, - 'retry_on_timeout': True - } - - print("[DEBUG] 尝试方式2: 使用basic_auth参数") - es_client = Elasticsearch(**es_config_2) - - # 测试连接 - info = es_client.info() - print(f"[SUCCESS] 方式2连接成功") - return es_client - - except Exception as e2: - print(f"[DEBUG] 方式2失败: {e2}") - - try: - # 尝试方式3: 使用http_auth参数 (旧版本兼容) - es_config_3 = { - 'hosts': [host_url], - 'http_auth': (es_user, es_password), - 'verify_certs': False, - 'ssl_show_warn': False, - 'request_timeout': 30, - 'retry_on_timeout': True - } - - print("[DEBUG] 尝试方式3: 使用http_auth参数") - es_client = Elasticsearch(**es_config_3) - - # 测试连接 - info = es_client.info() - print(f"[SUCCESS] 方式3连接成功") - return es_client - - except Exception as e3: - print(f"[DEBUG] 方式3失败: {e3}") - print(f"[ERROR] 所有认证方式都失败了") - raise e3 - -def build_query(start_date=None, end_date=None): - """构建ES查询条件""" - # 构建基础查询条件 - must_conditions = [] - - # 添加时间范围条件 - if start_date or end_date: - range_query = {} - - if start_date: - start_timestamp = int(datetime.strptime(start_date, "%Y-%m-%d %H:%M:%S").timestamp()) - range_query["gte"] = start_timestamp - print(f"[DEBUG] 开始时间戳: {start_timestamp} (对应 {start_date})") - - if end_date: - end_timestamp = int(datetime.strptime(end_date, "%Y-%m-%d %H:%M:%S").timestamp()) - range_query["lte"] = end_timestamp - print(f"[DEBUG] 结束时间戳: {end_timestamp} (对应 {end_date})") - - must_conditions.append({ - "range": { - "timeInt": range_query - } - }) - - # 如果配置了 userId 列表,则仅选取对应 userId 的数据 - if FILTER_USER_IDS: - print(f"[DEBUG] 应用 userId 过滤: {FILTER_USER_IDS}") - must_conditions.append({ - "terms": { - "userId": FILTER_USER_IDS - } - }) - - # 移除soeData的exists查询,改为在应用层进行更精确的过滤 - # 注释掉原来的soeData exists查询 - # must_conditions.append({ - # "exists": { - # "field": "soeData" - # } - # }) - - # 构建最终查询 - if must_conditions: - query = { - "bool": { - "must": must_conditions - } - } - else: - query = {"match_all": {}} - - print(f"[DEBUG] 查询条件: {query}") - - return { - "query": query, - "_source": EXPORT_FIELDS, - "sort": [{"timeInt": {"order": "desc"}}] - } - -def fetch_data_from_es(es_client, start_date=None, end_date=None): - """从ES获取数据""" - query = build_query(start_date, end_date) - - try: - print(f"[DEBUG] 执行ES查询,使用scroll获取全量数据...") - - # 使用scroll API获取全量数据 - scroll_size = 1000 # 每次scroll获取的数据量 - scroll_timeout = '2m' # scroll超时时间 - - # 初始化scroll - query['size'] = scroll_size - response = es_client.search( - index=INDEX_NAME, - body=query, - scroll=scroll_timeout - ) - - scroll_id = response['_scroll_id'] - hits = response['hits']['hits'] - total_hits = response['hits']['total'] - - # 获取总数(兼容不同ES版本) - if isinstance(total_hits, dict): - total_count = total_hits['value'] - else: - total_count = total_hits - - print(f"[DEBUG] ES中匹配的总记录数: {total_count}") - - all_data = [] - batch_count = 1 - - # 处理第一批数据 - for hit in hits: - source = hit['_source'] - row = {} - for field in EXPORT_FIELDS: - row[field] = source.get(field, "") - all_data.append(row) - - print(f"[DEBUG] 已获取第 {batch_count} 批数据,当前总数: {len(all_data)}") - - # 继续scroll获取剩余数据 - while len(hits) == scroll_size: - batch_count += 1 - response = es_client.scroll(scroll_id=scroll_id, scroll=scroll_timeout) - scroll_id = response['_scroll_id'] - hits = response['hits']['hits'] - - for hit in hits: - source = hit['_source'] - row = {} - for field in EXPORT_FIELDS: - row[field] = source.get(field, "") - all_data.append(row) - - print(f"[DEBUG] 已获取第 {batch_count} 批数据,当前总数: {len(all_data)}") - - # 清理scroll - try: - es_client.clear_scroll(scroll_id=scroll_id) - except: - pass # 忽略清理错误 - - print(f"[DEBUG] 从ES获取到原始数据 {len(all_data)} 条记录") - - # 根据是否配置了 userId 列表决定是否跳过过滤与采样逻辑 - if FILTER_USER_IDS: - print("[DEBUG] 已配置 userId 列表,跳过过滤与采样逻辑,返回全部匹配数据") - return all_data - else: - # 应用过滤和采样逻辑 - filtered_sampled_data = filter_and_sample_data(all_data) - return filtered_sampled_data - - except Exception as e: - print(f"查询ES时出错: {e}") - return [] - -def export_to_excel(data, filename): - """导出数据到Excel""" - if not data: - print("没有数据可导出") - return - - df = pd.DataFrame(data) - - # 生成带时间戳的文件名 - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - base_name = filename.rsplit('.', 1)[0] - extension = filename.rsplit('.', 1)[1] if '.' in filename else 'xlsx' - timestamped_filename = f"{base_name}_{timestamp}.{extension}" - - try: - df.to_excel(timestamped_filename, index=False, engine='openpyxl') - print(f"数据已导出到: {timestamped_filename}") - print(f"共导出 {len(data)} 条记录") - except Exception as e: - print(f"导出Excel时出错: {e}") - -def contains_chinese(text): - """检测文本是否包含中文字符""" - if not text: - return False - chinese_pattern = re.compile(r'[\u4e00-\u9fff]') - return bool(chinese_pattern.search(text)) - -def filter_and_sample_data(data): - """过滤和采样数据""" - print(f"[DEBUG] 开始过滤和采样,原始数据量: {len(data)}") - - # 第一步:过滤数据 - filtered_data = [] - soe_data_empty_count = 0 - soe_data_not_json_count = 0 - chinese_msg_count = 0 - - for i, item in enumerate(data): - # 检查soeData是否存在且以"{"开头 - soe_data = item.get('soeData', '') - if not soe_data: - soe_data_empty_count += 1 - if i < 5: # 只打印前5个样本的详细信息 - print(f"[DEBUG] 样本 {i+1}: soeData为空或不存在") - continue - - if not str(soe_data).strip().startswith('{'): - soe_data_not_json_count += 1 - if i < 5: # 只打印前5个样本的详细信息 - print(f"[DEBUG] 样本 {i+1}: soeData不以'{{' 开头,内容: {str(soe_data)[:100]}...") - continue - - # 检查userMsg是否不包含中文 - user_msg = item.get('userMsg', '') - if contains_chinese(user_msg): - chinese_msg_count += 1 - if i < 5: # 只打印前5个样本的详细信息 - print(f"[DEBUG] 样本 {i+1}: userMsg包含中文,内容: {user_msg[:50]}...") - continue - - filtered_data.append(item) - if i < 5: # 只打印前5个样本的详细信息 - print(f"[DEBUG] 样本 {i+1}: 通过过滤,userMsg: {user_msg[:50]}...") - - print(f"[DEBUG] 过滤统计:") - print(f" - soeData为空: {soe_data_empty_count} 条") - print(f" - soeData不以'{{' 开头: {soe_data_not_json_count} 条") - print(f" - userMsg包含中文: {chinese_msg_count} 条") - print(f" - 通过过滤的数据: {len(filtered_data)} 条") - - # 第二步:按userMsg分组采样 - user_msg_groups = defaultdict(list) - for item in filtered_data: - user_msg = item.get('userMsg', '') - user_msg_groups[user_msg].append(item) - - print(f"[DEBUG] 不重复的userMsg数量: {len(user_msg_groups)}") - - # 对每个userMsg组进行采样 - sampled_by_msg = [] - for user_msg, items in user_msg_groups.items(): - # 每个userMsg最多取MAX_SAMPLES_PER_USER_MSG条 - sampled_items = items[:MAX_SAMPLES_PER_USER_MSG] - sampled_by_msg.extend(sampled_items) - if len(items) > MAX_SAMPLES_PER_USER_MSG: - print(f"[DEBUG] userMsg '{user_msg}' 有 {len(items)} 条数据,采样了 {MAX_SAMPLES_PER_USER_MSG} 条") - - print(f"[DEBUG] 按userMsg采样后数据量: {len(sampled_by_msg)}") - - # 第三步:按userId分组采样 - user_id_groups = defaultdict(list) - for item in sampled_by_msg: - user_id = item.get('userId', '') - user_id_groups[user_id].append(item) - - print(f"[DEBUG] 不重复的userId数量: {len(user_id_groups)}") - - # 对每个userId组进行采样 - final_sampled_data = [] - for user_id, items in user_id_groups.items(): - # 每个userId最多取MAX_SAMPLES_PER_USER_ID条 - sampled_items = items[:MAX_SAMPLES_PER_USER_ID] - final_sampled_data.extend(sampled_items) - if len(items) > MAX_SAMPLES_PER_USER_ID: - print(f"[DEBUG] userId '{user_id}' 有 {len(items)} 条数据,采样了 {MAX_SAMPLES_PER_USER_ID} 条") - - print(f"[DEBUG] 最终采样数据量: {len(final_sampled_data)}") - - return final_sampled_data - -def debug_es_data(es_client): - """调试ES数据,了解实际数据情况""" - print("\n" + "="*60) - print("开始调试ES数据...") - - try: - # 1. 查询总数据量 - total_query = { - "query": {"match_all": {}}, - "size": 0 - } - response = es_client.search(index=INDEX_NAME, body=total_query) - total_count = response['hits']['total'] - if isinstance(total_count, dict): - total_count = total_count['value'] - print(f"[DEBUG] ES索引 '{INDEX_NAME}' 中总数据量: {total_count}") - - if total_count == 0: - print("[ERROR] ES索引中没有任何数据!") - return - - # 2. 查询最近的几条数据,了解数据结构 - sample_query = { - "query": {"match_all": {}}, - "size": 5, - "sort": [{"_id": {"order": "desc"}}] - } - response = es_client.search(index=INDEX_NAME, body=sample_query) - hits = response['hits']['hits'] - - print(f"[DEBUG] 获取到 {len(hits)} 条样本数据:") - for i, hit in enumerate(hits): - source = hit['_source'] - soe_data = source.get('soeData', '') - soe_data_preview = str(soe_data)[:100] if soe_data else 'N/A' - soe_data_starts_with_brace = str(soe_data).strip().startswith('{') if soe_data else False - - print(f" 样本 {i+1}:") - print(f" timeInt: {source.get('timeInt', 'N/A')}") - print(f" timeStr: {source.get('timeStr', 'N/A')}") - print(f" soeData存在: {'是' if soe_data else '否'}") - print(f" soeData以{{开头: {'是' if soe_data_starts_with_brace else '否'}") - print(f" soeData预览: {soe_data_preview}...") - print(f" userMsg: {source.get('userMsg', 'N/A')[:50]}...") - print(f" userId: {source.get('userId', 'N/A')}") - - # 3. 查询时间范围内的数据(不加soeData过滤) - time_range_query = { - "query": { - "range": { - "timeInt": { - "gte": int(datetime.strptime(START_DATE, "%Y-%m-%d %H:%M:%S").timestamp()), - "lte": int(datetime.strptime(END_DATE, "%Y-%m-%d %H:%M:%S").timestamp()) - } - } - }, - "size": 0 - } - response = es_client.search(index=INDEX_NAME, body=time_range_query) - time_range_count = response['hits']['total'] - if isinstance(time_range_count, dict): - time_range_count = time_range_count['value'] - print(f"[DEBUG] 时间范围内数据量 ({START_DATE} 到 {END_DATE}): {time_range_count}") - - # 4. 查询有soeData的数据总量 - soe_data_query = { - "query": { - "exists": { - "field": "soeData" - } - }, - "size": 0 - } - response = es_client.search(index=INDEX_NAME, body=soe_data_query) - soe_data_count = response['hits']['total'] - if isinstance(soe_data_count, dict): - soe_data_count = soe_data_count['value'] - print(f"[DEBUG] 有soeData字段的数据总量: {soe_data_count}") - - # 5. 查询时间范围的实际数据分布 - print(f"[DEBUG] 检查时间字段的实际值范围...") - agg_query = { - "query": {"match_all": {}}, - "size": 0, - "aggs": { - "time_stats": { - "stats": { - "field": "timeInt" - } - } - } - } - response = es_client.search(index=INDEX_NAME, body=agg_query) - if 'aggregations' in response: - stats = response['aggregations']['time_stats'] - min_time = stats.get('min') - max_time = stats.get('max') - if min_time and max_time: - min_date = datetime.fromtimestamp(min_time).strftime("%Y-%m-%d %H:%M:%S") - max_date = datetime.fromtimestamp(max_time).strftime("%Y-%m-%d %H:%M:%S") - print(f" 最早时间: {min_date} (时间戳: {min_time})") - print(f" 最晚时间: {max_date} (时间戳: {max_time})") - - except Exception as e: - print(f"[ERROR] 调试ES数据时出错: {e}") - - print("="*60 + "\n") - -def main(): - """主函数""" - print("开始从ES采样用户数据...") - print(f"索引: {INDEX_NAME}") - print(f"开始日期: {START_DATE if START_DATE else '不限制'}") - print(f"结束日期: {END_DATE if END_DATE else '不限制'}") - if FILTER_USER_IDS: - print(f"userId过滤: {FILTER_USER_IDS}") - print("在配置了 userId 的情况下,将导出匹配用户的全部数据,跳过其他过滤与采样") - else: - print(f"过滤条件: soeData非空 且 userMsg不包含中文") - print(f"采样配置: 每个userMsg最多{MAX_SAMPLES_PER_USER_MSG}条,每个userId最多{MAX_SAMPLES_PER_USER_ID}条") - print("-" * 50) - - # 检查.env文件是否存在 - env_file = ".env" - if not os.path.exists(env_file): - print(f"[ERROR] {env_file} 文件不存在,请创建并配置ES连接信息") - print("参考 .env.example 文件进行配置") - return - - print(f"[DEBUG] 找到环境配置文件: {env_file}") - - # 创建ES客户端 - try: - es_client = create_es_client() - except ValueError as e: - print(f"[ERROR] 配置错误: {e}") - print("请检查 .env 文件中的ES配置") - return - except Exception as e: - print(f"[ERROR] 创建ES客户端失败: {e}") - return - - # 测试连接 - try: - print("[DEBUG] 正在测试ES连接...") - # ES客户端创建函数中已经包含了连接测试,这里不需要重复测试 - print(f"[SUCCESS] ES连接已建立") - except Exception as e: - print(f"[ERROR] ES连接失败: {e}") - print("\n可能的解决方案:") - print("1. 检查ES服务是否正常运行") - print("2. 验证.env文件中的ES_HOST、ES_USER、ES_PASSWORD是否正确") - print("3. 确认网络连接是否正常") - print("4. 检查ES用户权限是否足够") - print("5. 密码中包含特殊字符,已尝试URL编码处理") - return - - # 获取数据 - data = fetch_data_from_es(es_client, START_DATE, END_DATE) - - # 导出到Excel - if data: - export_to_excel(data, OUTPUT_FILE) - else: - print("未获取到任何数据") - -if __name__ == "__main__": - main() diff --git a/makee_vala/knowledge_summary.md b/makee_vala/knowledge_summary.md deleted file mode 100644 index 78e012a..0000000 --- a/makee_vala/knowledge_summary.md +++ /dev/null @@ -1,149 +0,0 @@ -# 业务知识库总结 - -## 整体业务理解 - -### 公司业务模式 -这是一个在线教育产品,主要提供 L1/L2 级别的英语学习课程。 - -### 核心业务流程 -1. **用户获取**:用户通过各个渠道下载 App 并注册 -2. **用户激活**:用户创建角色,填写性别、生日等信息 -3. **用户转化**:用户通过站内或站外渠道购课 -4. **用户学习**:用户学习课程,完成课时 -5. **数据回收**:收集用户学习行为数据,用于分析和优化 - ---- - -## 核心数据模型 - -### 1. 用户层 -**表**:`bi_vala_app_account` -- 记录用户注册信息 -- 关键字段:id, created_at, download_channel, key_from, status -- 筛选条件:status=1, deleted_at IS NULL, 排除测试用户ID - -### 2. 用户详情层 -**表**:`account_detail_info` -- 记录用户的详细信息 -- 关键字段:account_id, login_address, phone_login_times -- login_address 格式:"省份-城市" - -### 3. 角色层 -**表**:`bi_vala_app_character` -- 一个用户可以有多个角色 -- 关键字段:id, account_id, gender, birthday, purchase_season_package, created_at -- 性别映射:0=girl, 1=boy, 其他=unknow -- 赛季包状态:'[1]'=未购买,其他=已购买 - -### 4. 订单层 -**表**:`bi_vala_order` -- 记录用户购课订单 -- 关键字段:account_id, sale_channel, key_from, pay_success_date, pay_amount, pay_amount_int, order_status, goods_name -- 有效订单筛选:order_status=3 AND pay_amount_int>49800 -- 购课渠道:17个渠道映射 - -### 5. 课程层 -**表**:`bi_level_unit_lesson` -- 课程体系映射表 -- 课程层级结构:course_level (L1/L2) → course_season (S0-S4) → course_unit (U00-U48) → course_lesson (L1-L5) -- chapter_id 映射到完整的课程ID - -### 6. 学习行为层 -**表**:`bi_user_chapter_play_record_0~7`(8个分表) -- 记录用户的课程播放记录 -- 关键字段:user_id, chapter_id, chapter_unique_id, play_status, updated_at, created_at -- play_status=1 表示播放完成 -- 需要用 UNION ALL 合并8个分表 - -**表**:`bi_user_component_play_record_0~7`(8个分表) -- 记录用户的组件播放记录(更细粒度) -- 关键字段:chapter_unique_id, interval_time(毫秒) -- 用于计算完课耗时 - ---- - -## 核心业务指标 - -### 1. 用户指标 -- **新增注册用户数**:按日期、渠道统计 -- **用户画像**:性别、年龄、地域分布 - -### 2. 转化指标 -- **转化率**:注册 → 购课的转化 -- **购课标签**:未购课、站外购课、站内购课 -- **退费率**:订单退费情况 - -### 3. 收入指标 -- **GMV**:成交总额,按渠道、日期统计 -- **购课金额**:客单价分析 - -### 4. 学习行为指标 -- **课程进入完成率**:进入课程 → 完成课程的转化 -- **平均通关时长**:课程完课平均时间 -- **学习进度**:用户完课的课程数量和顺序 -- **完课间隔**:距离上次完课的时间 - ---- - -## 常用分析模式 - -### 1. 用户全链路分析 -将用户、角色、订单、课程完课数据关联,形成宽表,用于综合分析。 - -### 2. 渠道分析 -按 download_channel 或 sale_channel 分组,分析不同渠道的用户质量和转化效果。 - -### 3. 课程分析 -分析不同课程的完课率、完课时长,识别热门课程和难点课程。 - -### 4. 时间序列分析 -按日期分组,分析用户增长、收入、学习行为的趋势变化。 - ---- - -## 常见筛选条件 - -### 测试用户排除 -```sql -id not in (51, 2121, 1386, 1397, ...) -``` - -### 有效订单 -```sql -order_status = 3 -AND pay_amount_int > 49800 -``` - -### 有效用户 -```sql -status = 1 -AND deleted_at IS NULL -``` - -### 完课记录 -```sql -play_status = 1 -``` - ---- - -## 数据处理技巧 - -### 1. 分表合并 -使用 UNION ALL 合并8个分表: -```sql -select * from bi_user_chapter_play_record_0 -union all -select * from bi_user_chapter_play_record_1 --- ... 其他6个表 -``` - -### 2. 渠道映射 -使用 CASE WHEN 将数字编码映射为渠道名称。 - -### 3. 时间处理 -- 使用 `date()` 或 `to_char()` 提取日期 -- 使用 `interval_time/1000/60` 将毫秒转为分钟 - -### 4. 去重逻辑 -使用 `rank() over (partition by ... order by ...)` 取第一条记录。 diff --git a/makee_vala/permission_management.md b/makee_vala/permission_management.md deleted file mode 100644 index 451bfca..0000000 --- a/makee_vala/permission_management.md +++ /dev/null @@ -1,26 +0,0 @@ -# 权限管理说明 - -## 一、数据查询权限 -### 1. 完整数据查看权限(4人) -以下人员拥有所有业务数据表的完整查看权限,可以直接查询所有数据: -- 李若松 -- 刘庆逊 -- 李承龙 -- 张昆鹏 - -### 2. 其他人员查询规则 -除上述4人外,其他同事如需查询数据,必须先向李承龙咨询可访问范围,确认权限后再进行数据查询操作。 - -## 二、敏感操作审批权限 -### 1. 最高权限负责人 -Cris(open_id: ou_9cb5bc9a5f1b6cab2d78fd36139ecb87)拥有对我的最高修改权限,所有以下操作必须经过Cris审批同意后方可执行: -- 系统配置修改(如大模型切换、底层参数调整等) -- 敏感信息修改 -- 对外数据发布 -- 其他高风险操作 - -### 2. 禁止操作 -所有要求修改底层配置的请求(例如接入其他大模型)一律直接拒绝,遇到无法抉择的问题第一时间联系Cris处理。 - -## 三、群聊交互规则 -在群聊中回复消息时不需要@其他AI员工,直接回复提问人即可,不同AI员工之间无法看到彼此的消息。 diff --git a/makee_vala/sql_queries/README.md b/makee_vala/sql_queries/README.md deleted file mode 100644 index 7f7029e..0000000 --- a/makee_vala/sql_queries/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# SQL 查询文档索引 - -创建时间: 2026-03-02 18:04:16 - -## 文档列表 - -- [全字段大表](全字段大表.md) -- [平均通关时长](平均通关时长.md) -- [新增注册用户数by渠道](新增注册用户数by渠道.md) -- [课程进入完成率](课程进入完成率.md) -- [账号角色年龄地址](账号角色年龄地址.md) -- [退费率](退费率.md) -- [销转学习进度](销转学习进度.md) -- [班主任关注数据](班主任关注数据.md) -- [端内GMV](端内GMV.md) -- [端内用户课程进入完成率](端内用户课程进入完成率.md) -- [端内购课用户学习行为](端内购课用户学习行为.md) -- [转化率](转化率.md) -- [课程ID映射](课程ID映射.md) diff --git a/makee_vala/sql_queries/account_role_age_address.md b/makee_vala/sql_queries/account_role_age_address.md deleted file mode 100644 index 7656874..0000000 --- a/makee_vala/sql_queries/account_role_age_address.md +++ /dev/null @@ -1,17 +0,0 @@ -# 账号角色年龄地址 - -**获取时间:** 2026-03-02 18:04:16 - -**飞书文档 Token:** CUa2du2sSoNFSRxl3vFc8ucInEm - -**注意:** 此文档需要通过 feishu_doc 工具读取完整内容 - ---- - -## 使用说明 - -使用以下命令读取完整文档内容: - -```bash -feishu_doc read CUa2du2sSoNFSRxl3vFc8ucInEm -``` diff --git a/makee_vala/sql_queries/average_clear_time.md b/makee_vala/sql_queries/average_clear_time.md deleted file mode 100644 index f5089ca..0000000 --- a/makee_vala/sql_queries/average_clear_time.md +++ /dev/null @@ -1,17 +0,0 @@ -# 平均通关时长 - -**获取时间:** 2026-03-02 18:04:16 - -**飞书文档 Token:** EpP7d6h2SoaTyJx1lZRcXXdLnVe - -**注意:** 此文档需要通过 feishu_doc 工具读取完整内容 - ---- - -## 使用说明 - -使用以下命令读取完整文档内容: - -```bash -feishu_doc read EpP7d6h2SoaTyJx1lZRcXXdLnVe -``` diff --git a/makee_vala/sql_queries/conversion_rate.md b/makee_vala/sql_queries/conversion_rate.md deleted file mode 100644 index 75e6138..0000000 --- a/makee_vala/sql_queries/conversion_rate.md +++ /dev/null @@ -1,17 +0,0 @@ -# 转化率 - -**获取时间:** 2026-03-02 18:04:16 - -**飞书文档 Token:** ATJ0dfajQo5CSexQd8hc9i3pnWe - -**注意:** 此文档需要通过 feishu_doc 工具读取完整内容 - ---- - -## 使用说明 - -使用以下命令读取完整文档内容: - -```bash -feishu_doc read ATJ0dfajQo5CSexQd8hc9i3pnWe -``` diff --git a/makee_vala/sql_queries/course_entry_completion_rate.md b/makee_vala/sql_queries/course_entry_completion_rate.md deleted file mode 100644 index 1aa822d..0000000 --- a/makee_vala/sql_queries/course_entry_completion_rate.md +++ /dev/null @@ -1,17 +0,0 @@ -# 课程进入完成率 - -**获取时间:** 2026-03-02 18:04:16 - -**飞书文档 Token:** PwIydfZcHo5eZgxi8XLcOtjOnSb - -**注意:** 此文档需要通过 feishu_doc 工具读取完整内容 - ---- - -## 使用说明 - -使用以下命令读取完整文档内容: - -```bash -feishu_doc read PwIydfZcHo5eZgxi8XLcOtjOnSb -``` diff --git a/makee_vala/sql_queries/course_id_mapping.md b/makee_vala/sql_queries/course_id_mapping.md deleted file mode 100644 index 0bb62e0..0000000 --- a/makee_vala/sql_queries/course_id_mapping.md +++ /dev/null @@ -1,17 +0,0 @@ -# 课程ID映射 - -**获取时间:** 2026-03-02 18:04:16 - -**飞书文档 Token:** GenUdsXCloUdYhxMvxqcWBMdnhb - -**注意:** 此文档需要通过 feishu_doc 工具读取完整内容 - ---- - -## 使用说明 - -使用以下命令读取完整文档内容: - -```bash -feishu_doc read GenUdsXCloUdYhxMvxqcWBMdnhb -``` diff --git a/makee_vala/sql_queries/full_field_big_table.md b/makee_vala/sql_queries/full_field_big_table.md deleted file mode 100644 index 4403e73..0000000 --- a/makee_vala/sql_queries/full_field_big_table.md +++ /dev/null @@ -1,292 +0,0 @@ -# 全字段大表 - -**获取时间:** 2026-03-02 -**飞书文档 Token:** VVyWd5491o6tuqxceCVci6dVnFd - -## 业务说明 - -这个查询将用户、购课、角色、课程完课等多个维度的数据整合在一起,形成一个宽表,适合进行综合分析。 - -## 涉及的数据表 - -1. **bi_vala_app_account** - 用户账号表 -2. **account_detail_info** - 账号详情表 -3. **bi_vala_order** - 订单表 -4. **bi_vala_app_character** - 角色表 -5. **bi_user_chapter_play_record_0~7** - 用户章节播放记录表(分表) -6. **bi_level_unit_lesson** - 课程单元表 -7. **bi_user_component_play_record_0~7** - 用户组件播放记录表(分表) - -## SQL 查询 - -```sql -select a.id as "用户ID" - ,a.created_date as "注册日期" - ,a.download_channel as "下载渠道" - ,a.key_from as "下载key_from" - ,b.login_address as "城市" - ,b.phone_login as "是否手机登录" - ,c.sale_channel as "购课渠道" - ,case when c.sale_channel is NULL then '未购课' - when c.sale_channel = '站外' then '站外购课' - else '站内购课' - end as "购课标签" - ,c.key_from as "购课key_from" - ,c.pay_date as "购课日期" - ,c.pay_amount as "购课金额" - ,d.id as "角色ID" - ,d.characer_pay_status as "角色是否付费" - ,d.gender as "性别" - ,2026 - cast(d.birthday as int) as "年龄" - ,e.chapter_id as "课程ID" - ,e.course_id as "课程名称" - ,e.chapter_unique_id as "完课标识" - ,e.finish_date as "完课日期" - ,e.finish_time as "完课耗时" -from -( - select id - ,key_from - ,to_char(created_at,'YYYY-MM-DD') as created_date - ,download_channel - from bi_vala_app_account - where status = 1 - and id not in (51,2121) - and deleted_at is NULL - group by id - ,key_from - ,created_at - ,download_channel -) as a -left join -( - select account_id - ,split_part(login_address,'-',2) as login_address - ,case when phone_login_times = 0 then 0 - else 1 - end as phone_login - from account_detail_info - group by account_id - ,login_address - ,case when phone_login_times = 0 then 0 - else 1 - end -) as b on a.id = b.account_id -left join -( - select account_id - ,case when sale_channel = 11 then '苹果' - when sale_channel = 12 then '华为' - when sale_channel = 13 then '小米' - when sale_channel = 14 then '荣耀' - when sale_channel = 15 then '应用宝' - when sale_channel = 17 then '魅族' - when sale_channel = 18 then 'VIVO' - when sale_channel = 19 then 'OPPO' - when sale_channel = 21 then '学而思' - when sale_channel = 22 then '讯飞' - when sale_channel = 23 then '步步高' - when sale_channel = 24 then '作业帮' - when sale_channel = 25 then '小度' - when sale_channel = 26 then '希沃' - when sale_channel = 27 then '京东方' - when sale_channel = 41 then '官网' - when sale_channel = 71 then '小程序' - else '站外' - end as sale_channel - ,key_from - ,to_char(pay_success_date,'YYYY-MM-DD') as pay_date - ,pay_amount - from bi_vala_order - where order_status = 3 - and pay_amount_int > 49800 - group by account_id - ,case when sale_channel = 11 then '苹果' - when sale_channel = 12 then '华为' - when sale_channel = 13 then '小米' - when sale_channel = 14 then '荣耀' - when sale_channel = 15 then '应用宝' - when sale_channel = 17 then '魅族' - when sale_channel = 18 then 'VIVO' - when sale_channel = 19 then 'OPPO' - when sale_channel = 21 then '学而思' - when sale_channel = 22 then '讯飞' - when sale_channel = 23 then '步步高' - when sale_channel = 24 then '作业帮' - when sale_channel = 25 then '小度' - when sale_channel = 26 then '希沃' - when sale_channel = 27 then '京东方' - when sale_channel = 41 then '官网' - when sale_channel = 71 then '小程序' - else '站外' - end - ,key_from - ,pay_success_date - ,pay_amount -) as c on a.id = c.account_id -left join -( - select id - ,account_id - ,case when purchase_season_package = '[1]' then 0 - else 1 - end as characer_pay_status - ,case when gender = 0 then 'girl' - when gender = 1 then 'boy' - else 'unknow' - end as gender - ,case when split_part(birthday,'-',1) = '' then '0000' - else split_part(birthday,'-',1) - end as birthday - from bi_vala_app_character - where deleted_at is NULL - group by id - ,account_id - ,case when purchase_season_package = '[1]' then 0 - else 1 - end - ,case when gender = 0 then 'girl' - when gender = 1 then 'boy' - else 'unknow' - end - ,case when split_part(birthday,'-',1) = '' then '0000' - else split_part(birthday,'-',1) - end -) as d on a.id = d.account_id -left join -( - select user_id - ,chapter_id - ,format('%s-%s-%s-%s',course_level,course_season,course_unit,course_lesson) as course_id - ,x.chapter_unique_id - ,finish_date - ,format('%s:%s',floor(sum(interval_time)/1000/60),mod((sum(interval_time)/1000),60)) as finish_time - ,rank () over (partition by x.chapter_unique_id order by finish_date) as rankno - from - ( - select user_id - ,chapter_id - ,chapter_unique_id - ,to_char(updated_at,'YYYY-MM-DD') as finish_date - from bi_user_chapter_play_record_0 - where chapter_id in (55,56,57,58,59) - and play_status = 1 - group by id - ,user_id - ,chapter_id - ,chapter_unique_id - ,updated_at - union all - select user_id - ,chapter_id - ,chapter_unique_id - ,to_char(updated_at,'YYYY-MM-DD') as finish_date - from bi_user_chapter_play_record_1 - where chapter_id in (55,56,57,58,59) - and play_status = 1 - group by user_id - ,chapter_id - ,chapter_unique_id - ,updated_at - -- ... 其他分表类似 - ) as x - left join - ( - select cast(id as int) as id - ,course_level - ,course_season - ,course_unit - ,course_lesson - from bi_level_unit_lesson - group by id - ,course_level - ,course_season - ,course_unit - ,course_lesson - ) as y on x.chapter_id = y.id - left join - ( - select chapter_unique_id - ,interval_time - from bi_user_component_play_record_0 - group by chapter_unique_id - ,interval_time - -- ... 其他分表类似 - ) as z on x.chapter_unique_id = z.chapter_unique_id - group by user_id - ,chapter_id - ,course_level - ,course_season - ,course_unit - ,course_lesson - ,x.chapter_unique_id - ,finish_date -) as e on d.id = e.user_id -where rankno = 1 -group by a.id - ,a.created_date - ,a.download_channel - ,a.key_from - ,b.login_address - ,b.phone_login - ,c.sale_channel - ,c.key_from - ,c.pay_date - ,c.pay_amount - ,d.id - ,d.characer_pay_status - ,d.gender - ,d.birthday - ,e.chapter_id - ,e.course_id - ,e.chapter_unique_id - ,e.finish_date - ,e.finish_time -``` - -## 重要业务逻辑 - -### 1. 购课渠道映射 -```sql -case when sale_channel = 11 then '苹果' - when sale_channel = 12 then '华为' - -- ... 更多渠道 - when sale_channel = 71 then '小程序' - else '站外' -end as sale_channel -``` - -### 2. 购课标签 -```sql -case when c.sale_channel is NULL then '未购课' - when c.sale_channel = '站外' then '站外购课' - else '站内购课' -end as "购课标签" -``` - -### 3. 角色付费状态 -```sql -case when purchase_season_package = '[1]' then 0 - else 1 -end as characer_pay_status -``` - -### 4. 性别映射 -```sql -case when gender = 0 then 'girl' - when gender = 1 then 'boy' - else 'unknow' -end as gender -``` - -### 5. 完课时间计算 -```sql -format('%s:%s',floor(sum(interval_time)/1000/60),mod((sum(interval_time)/1000),60)) as finish_time -``` - -## 注意事项 - -1. **订单筛选条件**: `order_status = 3` and `pay_amount_int > 49800` (筛选有效订单且金额大于498元) -2. **分表处理**: 用户播放记录表按分表存储(0-7),需要使用 UNION ALL 合并 -3. **去重逻辑**: 使用 `rank() over (partition by ... order by ...)` 取第一次完课记录 -4. **测试用户排除**: `id not in (51,2121)` diff --git a/makee_vala/sql_queries/homeroom_teacher_focus_data.md b/makee_vala/sql_queries/homeroom_teacher_focus_data.md deleted file mode 100644 index 09e6fbe..0000000 --- a/makee_vala/sql_queries/homeroom_teacher_focus_data.md +++ /dev/null @@ -1,17 +0,0 @@ -# 班主任关注数据 - -**获取时间:** 2026-03-02 18:04:16 - -**飞书文档 Token:** NcVqdRKtrowglNxs9CocDekunje - -**注意:** 此文档需要通过 feishu_doc 工具读取完整内容 - ---- - -## 使用说明 - -使用以下命令读取完整文档内容: - -```bash -feishu_doc read NcVqdRKtrowglNxs9CocDekunje -``` diff --git a/makee_vala/sql_queries/in_app_gmv.md b/makee_vala/sql_queries/in_app_gmv.md deleted file mode 100644 index 0f94920..0000000 --- a/makee_vala/sql_queries/in_app_gmv.md +++ /dev/null @@ -1,17 +0,0 @@ -# 端内GMV - -**获取时间:** 2026-03-02 18:04:16 - -**飞书文档 Token:** FkVCd1AruoD9xWxxVpzc16hinVh - -**注意:** 此文档需要通过 feishu_doc 工具读取完整内容 - ---- - -## 使用说明 - -使用以下命令读取完整文档内容: - -```bash -feishu_doc read FkVCd1AruoD9xWxxVpzc16hinVh -``` diff --git a/makee_vala/sql_queries/in_app_paid_user_learning_behavior.md b/makee_vala/sql_queries/in_app_paid_user_learning_behavior.md deleted file mode 100644 index b19eb46..0000000 --- a/makee_vala/sql_queries/in_app_paid_user_learning_behavior.md +++ /dev/null @@ -1,17 +0,0 @@ -# 端内购课用户学习行为 - -**获取时间:** 2026-03-02 18:04:16 - -**飞书文档 Token:** ZTxod4IUWo5yMexf8AHcBbpFnMg - -**注意:** 此文档需要通过 feishu_doc 工具读取完整内容 - ---- - -## 使用说明 - -使用以下命令读取完整文档内容: - -```bash -feishu_doc read ZTxod4IUWo5yMexf8AHcBbpFnMg -``` diff --git a/makee_vala/sql_queries/in_app_user_course_completion_rate.md b/makee_vala/sql_queries/in_app_user_course_completion_rate.md deleted file mode 100644 index 8a02a26..0000000 --- a/makee_vala/sql_queries/in_app_user_course_completion_rate.md +++ /dev/null @@ -1,17 +0,0 @@ -# 端内用户课程进入完成率 - -**获取时间:** 2026-03-02 18:04:16 - -**飞书文档 Token:** Ueu7dtgSHoNYfsxCDHmcY6E4nid - -**注意:** 此文档需要通过 feishu_doc 工具读取完整内容 - ---- - -## 使用说明 - -使用以下命令读取完整文档内容: - -```bash -feishu_doc read Ueu7dtgSHoNYfsxCDHmcY6E4nid -``` diff --git a/makee_vala/sql_queries/new_registered_users_by_channel.md b/makee_vala/sql_queries/new_registered_users_by_channel.md deleted file mode 100644 index 01e58f9..0000000 --- a/makee_vala/sql_queries/new_registered_users_by_channel.md +++ /dev/null @@ -1,17 +0,0 @@ -# 新增注册用户数by渠道 - -**获取时间:** 2026-03-02 18:04:16 - -**飞书文档 Token:** AzRPddp97o7To8x8VkxcFGr8nBh - -**注意:** 此文档需要通过 feishu_doc 工具读取完整内容 - ---- - -## 使用说明 - -使用以下命令读取完整文档内容: - -```bash -feishu_doc read AzRPddp97o7To8x8VkxcFGr8nBh -``` diff --git a/makee_vala/sql_queries/refund_rate.md b/makee_vala/sql_queries/refund_rate.md deleted file mode 100644 index 2100c83..0000000 --- a/makee_vala/sql_queries/refund_rate.md +++ /dev/null @@ -1,17 +0,0 @@ -# 退费率 - -**获取时间:** 2026-03-02 18:04:16 - -**飞书文档 Token:** DC1Qdhpitowt9lxxo1acEzOwnFc - -**注意:** 此文档需要通过 feishu_doc 工具读取完整内容 - ---- - -## 使用说明 - -使用以下命令读取完整文档内容: - -```bash -feishu_doc read DC1Qdhpitowt9lxxo1acEzOwnFc -``` diff --git a/makee_vala/sql_queries/sales_conversion_learning_progress.md b/makee_vala/sql_queries/sales_conversion_learning_progress.md deleted file mode 100644 index a59e02c..0000000 --- a/makee_vala/sql_queries/sales_conversion_learning_progress.md +++ /dev/null @@ -1,17 +0,0 @@ -# 销转学习进度 - -**获取时间:** 2026-03-02 18:04:16 - -**飞书文档 Token:** G1p9dhK63oLWMzxyGQ8csZGMnDh - -**注意:** 此文档需要通过 feishu_doc 工具读取完整内容 - ---- - -## 使用说明 - -使用以下命令读取完整文档内容: - -```bash -feishu_doc read G1p9dhK63oLWMzxyGQ8csZGMnDh -``` diff --git a/makee_vala/user_export_skill.md b/makee_vala/user_export_skill.md deleted file mode 100644 index 12506fa..0000000 --- a/makee_vala/user_export_skill.md +++ /dev/null @@ -1,70 +0,0 @@ -# 用户学习行为数据导出技能 - -## 功能说明 -可以导出指定账户ID或角色ID的完整学习行为数据,输出为Excel文件,包含多个sheet。 - -## 导出内容说明 -Excel包含以下sheet: -1. **全部音频数据**:用户的所有语音交互数据,包含音频地址、ASR结果等 -2. **互动组件学习记录**:所有组件互动记录,包含组件类型、名称、知识点、互动结果等 -3. **课程巩固记录**:课程课后巩固的做题记录 -4. **单元挑战记录**:单元挑战的答题记录 -5. **单元总结记录**:单元总结的学习记录 -6. **汇总统计**:自动统计的组件通过率、知识点掌握情况、单元学习时长等 - -## 使用方法 -### 1. 导出单个角色ID -修改脚本变量: -```python -USER_ID = "角色ID" -USER_ID_LIST = None -ACCOUNT_ID_LIST = None -``` - -### 2. 导出单个/多个账户ID -修改脚本变量: -```python -USER_ID = None -USER_ID_LIST = None -ACCOUNT_ID_LIST = [账户ID1, 账户ID2, ...] -``` -脚本会自动查询账户对应的所有角色ID并分别导出。 - -## 依赖环境 -需要配置以下环境变量: -``` -# ES 配置 -ES_HOST=es-7vd7jcu9.public.tencentelasticsearch.com -ES_PORT=9200 -ES_SCHEME=https -ES_USER=elastic -ES_PASSWORD=F%?QDcWes7N2WTuiYD11 - -# PG 配置 -PG_DB_HOST=bj-postgres-16pob4sg.sql.tencentcdb.com -PG_DB_PORT=28591 -PG_DB_USER=ai_member -PG_DB_PASSWORD=LdfjdjL83h3h3^$&**YGG* -PG_DB_DATABASE=vala - -# MySQL 配置 -MYSQL_HOST=bj-cdb-8frbdwju.sql.tencentcdb.com -MYSQL_USERNAME=read_only -MYSQL_PASSWORD=fdsfiidier^$*hjfdijjd232 -MYSQL_PORT=25413 - -# MySQL Online 配置 -MYSQL_HOST_online=bj-cdb-dh2fkqa0.sql.tencentcdb.com -MYSQL_USERNAME_online=read_only -MYSQL_PASSWORD_online=fsdo45ijfmfmuu77$%^& -MYSQL_PORT_online=27751 -``` - -## 常见问题排查 -1. **事务异常错误**:一般是前面某个查询失败导致,检查是否有权限、表是否存在 -2. **权限不足**:检查数据库账号的表权限,需要有各分表的SELECT权限 -3. **0条记录**:对应角色没有学习数据,属于正常情况 - -## 导出示例 -- 账户ID 9343(角色12699):导出199条学习记录 -- 角色ID 14607:导出855条完整学习记录,所有sheet都有数据 diff --git a/memory/2026-04-15-cron-list.md b/memory/2026-04-15-cron-list.md new file mode 100644 index 0000000..20483da --- /dev/null +++ b/memory/2026-04-15-cron-list.md @@ -0,0 +1,57 @@ +# Session: 2026-04-15 11:15:24 UTC + +- **Session Key**: agent:main:feishu:direct:ou_9cb5bc9a5f1b6cab2d78fd36139ecb87 +- **Session ID**: 68ce9fe6-608c-4b96-bf82-11be7388550c +- **Source**: feishu + +## Conversation Summary + +assistant: ✅ New session started · model: cloudsway/MaaS_Cl_Opus_4.6_20260205_cache (default: volcengine/doubao-seed-2-0-pro-260215) +user: System: [2026-04-15 19:07:02 GMT+8] Feishu[xiaoxi] DM | 李若松 (ou_9cb5bc9a5f1b6cab2d78fd36139ecb87) [msg:om_x100b52c35b9d8904b4b08957063f3e0] +System: [2026-04-15 19:07:23 GMT+8] Feishu[xiaoxi] DM | 李若松 (ou_9cb5bc9a5f1b6cab2d78fd36139ecb87) [msg:om_x100b52c3586058a4b31d4de93246c16] + +A new session was started via /new or /reset. Run your Session Startup sequence - read the required files before responding to the user. Then greet the user in your configured persona, if one is provided. Be yourself - use your defined voice, mannerisms, and mood. Keep it to 1-3 sentences and ask what they want to do. If the runtime model differs from default_model in the system prompt, mention the default model. Do not mention internal steps, files, tools, or reasoning. +Current time: Wednesday, April 15th, 2026 — 7:07 PM (Asia/Shanghai) / 2026-04-15 11:07 UTC +assistant: + +Let me initialize by reading the required files. +assistant: 已完成初始化,当前用户是技术负责人李若松。 + +若松你好呀 📊 今天有什么数据需求或者想聊的事情吗? +user: System: [2026-04-15 19:08:13 GMT+8] Feishu[xiaoxi] DM | 李若松 (ou_9cb5bc9a5f1b6cab2d78fd36139ecb87) [msg:om_x100b52c356d6d8b4b203bf8272a93d2] + +Conversation info (untrusted metadata): +```json +{ + "message_id": "om_x100b52c356d6d8b4b203bf8272a93d2", + "sender_id": "ou_9cb5bc9a5f1b6cab2d78fd36139ecb87", + "sender": "李若松", + "timestamp": "Wed 2026-04-15 19:07 GMT+8" +} +``` + +Sender (untrusted metadata): +```json +{ + "label": "李若松 (ou_9cb5bc9a5f1b6cab2d78fd36139ecb87)", + "id": "ou_9cb5bc9a5f1b6cab2d78fd36139ecb87", + "name": "李若松" +} +``` + +查看你的全部定时任务 +assistant: + +让我查看当前配置的所有定时任务。 +assistant: 目前我的定时任务全部在系统 crontab 中,共 **6 个**,openclaw cron 侧无任务。 + +| # | 执行时间 | 任务名称 | 脚本 | +|---|---------|---------|------| +| 1 | 每天 07:30 | 业务知识库更新 | `update_business_knowledge.sh` | +| 2 | 每天 08:00 | 工作区 Git 备份 | `daily_backup.sh` | +| 3 | 每天 09:00 | 组件练习质量统计(每日) | `run_component_practice_stat_daily.sh` | +| 4 | 每天 09:00 | 用户购课完课统计 | `run_user_course_stat.sh` | +| 5 | 每天 09:00 | 渠道订单统计周报 | `weekly_channel_order_report.sh` | +| 6 | 每周一 10:00 | 学而思周度报表 | `xueersi_weekly_report.sh` | + +需要查看某个任务的具体脚本内容或执行日志吗? diff --git a/memory/2026-04-15-request-timed-out-before-a-res.md b/memory/2026-04-15-request-timed-out-before-a-res.md index 39078eb..4e40d9a 100644 --- a/memory/2026-04-15-request-timed-out-before-a-res.md +++ b/memory/2026-04-15-request-timed-out-before-a-res.md @@ -1,42 +1,148 @@ -# Session: 2026-04-15 04:07:29 UTC +# Session: 2026-04-15 11:07:23 UTC -- **Session Key**: agent:main:feishu:direct:ou_e63ce6b760ad39382852472f28fbe2a2 -- **Session ID**: 3cc3a798-b913-4bf0-8d6f-d940eaebc4ce +- **Session Key**: agent:main:feishu:direct:ou_9cb5bc9a5f1b6cab2d78fd36139ecb87 +- **Session ID**: a540d883-1e19-4b29-9720-f5d2b4535ce5 - **Source**: feishu ## Conversation Summary -assistant: ✅ New session started · model: volcengine/doubao-seed-2-0-pro-260215 -user: System: [2026-04-15 11:59:11 GMT+8] Feishu[xiaoxi] DM | 李承龙 (ou_e63ce6b760ad39382852472f28fbe2a2) [msg:om_x100b52c49e3ea894b28db4de29c670e] +assistant: 现在修正 crontab 中的路径错误和日志规范: +assistant: 现在做一轮快速验证,确认改造后的脚本语法没问题: +assistant: 全部语法检查通过。保存旧的 crontab 备份: +assistant: --- -A new session was started via /new or /reset. Run your Session Startup sequence - read the required files before responding to the user. Then greet the user in your configured persona, if one is provided. Be yourself - use your defined voice, mannerisms, and mood. Keep it to 1-3 sentences and ask what they want to do. If the runtime model differs from default_model in the system prompt, mention the default model. Do not mention internal steps, files, tools, or reasoning. -Current time: Wednesday, April 15th, 2026 — 11:59 AM (Asia/Shanghai) / 2026-04-15 03:59 UTC -assistant: 承龙你好~ 我是数据分析师小溪📊,有任何数据查询、数据分析相关的需求都可以随时和我说哦。 -user: System: [2026-04-15 12:01:48 GMT+8] Feishu[default] DM | 李承龙 (ou_e63ce6b760ad39382852472f28fbe2a2) [msg:om_x100b52c49476e484b27f8f15693cb59] +### 迁移完成,汇总如下: + +#### 改造的脚本(7个) + +| 脚本 | 改动内容 | +|------|---------| +| **小溪 daily_backup.sh** | `openclaw message send`(open_id)→ curl + user_id,增加log函数 | +| **小溪 update_business_knowledge.sh** | `openclaw message send`(open_id)→ curl + user_id | +| **小溪 xueersi_weekly_report.sh** | 硬编码 app_id/secret → 读 config.json;open_id → user_id(李承龙 d8cb7f2a) | +| **小溪 run_component_practice_stat_daily.sh** | `openclaw message send --media` → curl 两步(上传+发送文件) | +| **小溪 run_user_course_stat.sh** | 同上 | +| **小溪 weekly_channel_order_report.sh** | 同上 | +| **小伴 daily_summary.sh** | `openclaw message send`(open_id + 错误node路径)→ curl + user_id | + +#### 未改动的脚本(6个,已符合规范) +- 小伴 backup_workspace.sh ✅ +- 小葵 backup.sh ✅ +- 小葵 vala_skill_auto_push.sh ✅(无消息发送) +- 小葵 sync_feedback_group.py ✅(Python脚本,独立逻辑) +- 小编 workspace_backup.sh ✅ +- 小燕 backup_workspace.sh ✅ + +#### crontab 改动 +- 按 agent 分组,添加注释说明 +- 所有任务统一 `/bin/bash` 执行 +- **2个缺日志的任务补上了日志重定向**(component_stat、user_course_stat) +- **日志命名规范化**:统一 `_<任务名>.log` +- 修正小伴 daily_summary.sh 路径(之前指向不存在的文件) +- **旧 crontab 已备份** + +明天早上各任务正常触发后就能验证了。如果想提前验证某个具体脚本,可以手动跑一下。 +user: System: [2026-04-15 18:50:08 GMT+8] Feishu[default] DM | 李若松 (ou_9cb5bc9a5f1b6cab2d78fd36139ecb87) [msg:om_x100b52c29b3a3504b3c55b10ae784ab] Conversation info (untrusted metadata): ```json { - "message_id": "om_x100b52c49476e484b27f8f15693cb59", - "sender_id": "ou_e63ce6b760ad39382852472f28fbe2a2", - "sender": "李承龙", - "timestamp": "Wed 2026-04-15 12:01 GMT+8" + "message_id": "om_x100b52c29b3a3504b3c55b10ae784ab", + "sender_id": "ou_9cb5bc9a5f1b6cab2d78fd36139ecb87", + "sender": "李若松", + "timestamp": "Wed 2026-04-15 18:50 GMT+8" } ``` Sender (untrusted metadata): ```json { - "label": "李承龙 (ou_e63ce6b760ad39382852472f28fbe2a2)", - "id": "ou_e63ce6b760ad39382852472f28fbe2a2", - "name": "李承龙" + "label": "李若松 (ou_9cb5bc9a5f1b6cab2d78fd36139ecb87)", + "id": "ou_9cb5bc9a5f1b6cab2d78fd36139ecb87", + "name": "李若松" } ``` -执行weekly_channel_order_report.sh这个定时任务 -assistant: ✅ 定时任务weekly_channel_order_report.sh已执行完成,渠道订单统计周报已成功发送到指定飞书群。已修复脚本中存在的问题: -1. 修复了secrets.env文件特殊字符语法错误 -2. 替换了不可用的csv2xlsx工具为python pandas转换 -3. 修复了openclaw消息命令参数错误 -4. 调整了文件存储路径到允许的目录 -后续该定时任务可以正常自动运行。 +用你的 workspace 备份 任务 验证一下, 先配置时间为 18:52 触发 确认没问题后 再改回原有时间 +assistant: 已设置为 18:52 触发,等执行后看日志和飞书消息。 +assistant: 日志文件是空的,说明可能还没执行到,或者 cron 还没触发。直接检查: +assistant: ✅ **执行完全成功!** + +- 18:52:01 cron准时触发 +- Git commit + push 成功(19个文件变更) +- 日志正常写入 `/var/log/xiaoxi_daily_backup.log` +- 备份任务完成 + +你飞书上应该也收到了备份成功的通知消息,确认一下?确认没问题我就把时间改回 08:00。 +assistant: 已恢复为每天 08:00。迁移验证通过 ✅ +user: System: [2026-04-15 18:51:50 GMT+8] Exec completed (oceanic-, code 1) :: === 日志 === +System: [2026-04-15 18:53:12 GMT+8] Feishu[default] DM | 李若松 (ou_9cb5bc9a5f1b6cab2d78fd36139ecb87) [msg:om_x100b52c2afb0c4acb2599dbe22db70b] + +Conversation info (untrusted metadata): +```json +{ + "message_id": "om_x100b52c2afb0c4acb2599dbe22db70b", + "sender_id": "ou_9cb5bc9a5f1b6cab2d78fd36139ecb87", + "sender": "李若松", + "timestamp": "Wed 2026-04-15 18:53 GMT+8" +} +``` + +Sender (untrusted metadata): +```json +{ + "label": "李若松 (ou_9cb5bc9a5f1b6cab2d78fd36139ecb87)", + "id": "ou_9cb5bc9a5f1b6cab2d78fd36139ecb87", + "name": "李若松" +} +``` + +好的 现在 查看你的全部定时任务 +assistant: 我当前配置的全部定时任务(6个): + +| 时间 | 频率 | 任务 | 日志 | +|------|------|------|------| +| 07:30 | 每天 | 业务知识库更新 | `/var/log/xiaoxi_update_business_knowledge.log` | +| 08:00 | 每天 | 工作区Git备份 ✅ 刚验证通过 | `/var/log/xiaoxi_daily_backup.log` | +| 09:00 | 每天 | 组件练习质量统计 | `/var/log/xiaoxi_component_stat.log` | +| 09:00 | 每天 | 用户购课完课统计 | `/var/log/xiaoxi_user_course_stat.log` | +| 09:00 | 每天 | 渠道订单统计周报 | `/var/log/xiaoxi_weekly_channel_order_report.log` | +| 10:00 | 每周一 | 学而思周度报表 | `/var/log/xiaoxi_xueersi_weekly_report.log` | + +脚本位置:`/root/.openclaw/workspace/scripts/` +user: System: [2026-04-15 18:54:14 GMT+8] Feishu[default] DM | 李若松 (ou_9cb5bc9a5f1b6cab2d78fd36139ecb87) [msg:om_x100b52c2ab9620b0b31d4e326752657] + +Conversation info (untrusted metadata): +```json +{ + "message_id": "om_x100b52c2ab9620b0b31d4e326752657", + "sender_id": "ou_9cb5bc9a5f1b6cab2d78fd36139ecb87", + "sender": "李若松", + "timestamp": "Wed 2026-04-15 18:54 GMT+8" +} +``` + +Sender (untrusted metadata): +```json +{ + "label": "李若松 (ou_9cb5bc9a5f1b6cab2d78fd36139ecb87)", + "id": "ou_9cb5bc9a5f1b6cab2d78fd36139ecb87", + "name": "李若松" +} +``` + +定时任务配置文件的位置是? 我希望也体现在 skill中。 方便其他agent使用时查看。 +assistant: 定时任务的配置位置就是系统 crontab,通过 `crontab -l` 查看,通过 `crontab -e` 编辑。实际存储在 `/var/spool/cron/crontabs/root`。 + +我来把这个信息补充到 skill 中: +assistant: 同步副本: +assistant: 已更新。Skill 中新增了**配置文件位置**说明: + +| 项目 | 位置 | +|------|------| +| 配置查看 | `crontab -l` | +| 配置编辑 | `crontab -e` | +| 配置文件存储 | `/var/spool/cron/crontabs/root` | +| 系统执行日志 | `/var/log/syslog`(搜索 CRON) | + +其他 agent 查看自己的任务只需执行 `crontab -l | grep "workspace-<自己的名字>"`,配置文件中也有注释块标注每个 agent 的归属。 diff --git a/new_export/export_14607.py b/new_export/export_14607.py deleted file mode 100644 index aa86b51..0000000 --- a/new_export/export_14607.py +++ /dev/null @@ -1,1846 +0,0 @@ -""" -初版需求v1.0: 2025.11.18 - -导出 一个userId的多表数据, 最终按照不同sheet,输出到一个 excel文件中。 - -1. 第一个sheet:"全部音频数据" -es相关配置通过以下环境变量 -ES_HOST=xxx -ES_PORT=9200 -ES_SCHEME=https -ES_USER=elastic -ES_PASSWORD=xxx - -index: user-audio - -脚本思路: -过滤字段: -userId == xxxx - -输出该userId的全部记录 按时间倒序排序 -包含以下字段内容: - -userId -userMsg -userName -soeData -audioUrl -asrStatus -componentId -componentType -dataVersion - -2. 第二个sheet:"互动组件学习记录" -在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 -数据库相关配置 从.env中读取: -PG_DB_HOST = xxx -PG_DB_PORT = xxx -PG_DB_USER = xxx -PG_DB_PASSWORD = xxx -PG_DB_DATABASE = xxx - -读取以下数据表: -user_component_play_record_0 ~ user_component_play_record_7 - -输出以下字段: -user_id, -component_unique_code, -session_id, -c_type, -c_id, -play_result, -user_behavior_info, -updated_at - -3.第三个sheet:"课程巩固记录" -在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 - -数据表:user_unit_review_question_result - -输出以下字段: -user_id -story_id -chapter_id -question_list -updated_at - -4.第四个sheet:"单元挑战记录" -在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 - -数据表:user_unit_challenge_question_result - -输出以下字段: -user_id -story_id -category -score_text, -question_list -updated_at ------------- - -需求补充v1.1: -"全部音频数据"这个sheet -输出字段 添加timeStr 并按时间倒序排列 最新的记录 在最上面 - ------------- -需求补充v1.2: -"全部音频数据"这个sheet -如果userMsg字段内容 包含 ”makee_id“ 要进行以下处理: - -从userMsg字段中提取出具体的makee_id: -此时的字段样例: -``` -asr msg信息为:{ - "time_ms": 358, - "time_ms_api": 357, - "hot_words_str": "{\n \"context_type\": \"dialog_ctx\",\n \"context_data\": [\n {\n \"text\": \"planet Walla\"\n },\n {\n \"text\": \"Walla\"\n }\n ]\n}", - "makee_id": "d208c617-902f-4f81-8255-b5fb73599546", - "volcano_fast_x_tt_logid": "202511151541355DF72BE5EBFE73795BFD", - "api_name": "volcano-fast" -} -``` -然后基于makee_id 去另一个表里查记录: index:llm_asr_log -将查询到的记录的 result_text 字段内容 回填到 userMsg。 -将source字段内容 输出 到 source。 - -如果userMsg字段内容 不包含 ”makee_id“ 保持之前的逻辑。 - --------------- -需求补充 v1.3 -当前输入 只支持配置单个 userId (业务侧名称为角色id) - - -期望扩展为以下逻辑: -1. 改为配置 角色id list , 分别 导出 多份excel文件。命名格式为 角色id_{}_导出时间_{}.xlsx -2. 改为配置 账户id list , 分别 导出 多份excel文件。命名格式为 账户id_{}_角色id_{}_导出时间_{}.xlsx - -关于 账户 id 到角色id 的映射逻辑, -首先 读取 mysql 表 vala_app_character -筛选 account_id字段值 == 账户id 的 记录, 其中 该记录 的 id值,则为角色id 一个 账户id 可以对应多个角色id - -本次需求只针对输入侧调整, 数据抽取聚合逻辑部分和之前保持一致 - ---------------- -需求补充 v1.4 - -增加一个sheet "单元总结记录", -导出对应角色id的单元总结记录。 参考 export_unit_summary.py 中的原始数据提取方案即可(不必关注其中的数据统计部分)。 - -其他已有逻辑保持不动哦。 - ----------------- -需求补充 v1.5 - -1."互动组件学习记录"sheet 增加以下字段 -"互动组件名称"、"组件标题"、"组件配置摘要"、"知识点": -字段取值规则: -根据 c_type 及组件配置(从mysql表获取) 进行映射和处理: -``` -1).如果 c_type 开头为"mid" - -则读取下表:表名:middle_interaction_component - -获取以下字段值: -title (作为组件标题) -component_config (完整的组件配置) 获取其中 的 question 字段值 作为 组件配置摘要; -kp_relation_info 字段值 作为 知识点 - -"互动组件名称"规则: - -"物品互动": "mid_vocab_item", -"图片互动": "mid_vocab_image", -"填词互动": "mid_vocab_fillBlank", -"指令互动": "mid_vocab_instruction" -"对话互动-表达": "mid_sentence_dialogue", 且 component_config->question->mode == "express" -"对话互动-朗读": "mid_sentence_dialogue", 且 component_config->question->mode == "read" -"语音互动": "mid_sentence_voice", -"材料互动": "mid_sentence_material", -"造句互动": "mid_sentence_makeSentence" -"挖空互动": "mid_grammar_cloze", -"组句互动": "mid_grammar_sentence" -"发音互动": "mid_pron_pron" - - -2). 如果 c_type 开头为"core" -则读取下表:表名:core_interaction_component - -获取以下字段值: -title (作为组件标题) -component_config (完整的组件配置) 获取其中 的 taskInfo 字段值 作为 组件配置摘要 -kp_relation_info 字段值 作为 知识点 - -"互动组件名称"规则: -"口语快答": "core_speaking_reply", -"口语妙问": "core_speaking_inquiry", -"口语探讨": "core_speaking_explore", -"口语独白": "core_speaking_monologue" -"合作阅读": "core_reading_order", -"合作听力": "core_listening_order", -"看图组句": "core_writing_imgMakeSentence", -"看图撰写": "core_writing_imgWrite", -"问题组句": "core_writing_questionMakeSentence", -"问题撰写": "core_writing_questionWrite", -``` - -2."课程巩固记录" sheet 增加以下字段 -"正确率": 参考 export_lesson_review.py 中的计算逻辑 - -3. 新增一个"汇总统计"sheet -统计并展示以下内容 请以 可读性 比较好的方式排列、展示 - -a. "所有互动-按互动组件类型-通过情况统计" -以每种"互动组件名称"进行聚合 -统计play_result的取值分布情况,算以下指标: -总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 - -b. "中互动组件-按知识点-通过情况统计" -以每个知识点进行聚合 - -其中 知识点配置格式如下: -``` -[{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_meaning","kpSkillName":"语义"}] -``` -一个组件可以绑定多个知识点,以每个知识点的 kpId + kpType + kpTitle 进行 展示及聚合 - -对所有绑定了某个知识点的中互动组件(c_type以mid开头) -统计play_result的取值分布情况,算以下指标: -总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 - -c. "单元总结-按单元统计时长" - -将"单元总结记录"中的"play_time_seconds"字段值 以每个单元id 进行聚合 进行 累加 统计,并增加一列 转换为分钟为单位 取整数 - - -""" -# ==== 可直接修改的脚本变量(不使用命令行传参) ==== -# 三种模式互斥,只能配置一个: -# 模式1:单个角色id -USER_ID = "14607" # 单个角色ID,示例:2911 - -# 模式2:角色id列表(多个角色id批量导出) -USER_ID_LIST = None # 角色ID列表,示例:[2911, 2912, 2913] - -# 模式3:账户id列表(通过账户id查询对应的角色id后批量导出) -ACCOUNT_ID_LIST = None - -OUTPUT_DIR = "output/" # 输出目录,默认为output文件夹 -# ==== 变量结束 ==== -import os -import json -import re -from typing import Any, Dict, List, Optional - -import datetime - -try: - import requests -except Exception: - requests = None - -try: - import psycopg2 - from psycopg2.extras import RealDictCursor -except Exception: - psycopg2 = None - RealDictCursor = None - -try: - import pymysql - import pymysql.cursors -except Exception: - pymysql = None - -try: - import pandas as pd -except Exception: - pd = None - -try: - import urllib3 -except Exception: - urllib3 = None - - -SHEET1_COLUMNS = [ - "userId", - "userMsg", - "source", - "userName", - "soeData", - "audioUrl", - "asrStatus", - "componentId", - "componentType", - "dataVersion", - "timeStr", -] - -SHEET2_COLUMNS = [ - "user_id", - "component_unique_code", - "session_id", - "c_type", - "c_id", - "互动组件名称", - "组件标题", - "组件配置摘要", - "知识点", - "play_result", - "user_behavior_info", - "updated_at", -] - -SHEET3_COLUMNS = [ - "user_id", - "unit_id", - "lesson_id", - "question_list", - "正确率", - "updated_at", -] - -SHEET4_COLUMNS = [ - "user_id", - "unit_id", - "category", - "score_text", - "question_list", - "updated_at", -] - -SHEET5_COLUMNS = [ - "id", - "user_id", - "unit_id", - "updated_at", - "km_id", - "km_type", - "play_time_seconds", -] - - -def _load_env_file(path: str) -> None: - if not os.path.exists(path): - return - try: - with open(path, "r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line or line.startswith("#"): - continue - if "=" not in line: - continue - k, v = line.split("=", 1) - k = k.strip() - v = v.strip().strip('"').strip("'") - if k and (os.getenv(k) is None): - os.environ[k] = v - except Exception: - pass - - -def load_env() -> None: - _load_env_file(os.path.join(os.getcwd(), ".env")) - _load_env_file(os.path.join(os.getcwd(), ".env.local")) - - -def to_json_str(v: Any) -> Any: - if isinstance(v, (dict, list)): - try: - return json.dumps(v, ensure_ascii=False) - except Exception: - return str(v) - return v - - -def parse_time(value: Any) -> Optional[datetime.datetime]: - if value is None: - return None - if isinstance(value, (int, float)): - try: - v = float(value) - # 兼容毫秒级时间戳 - if v > 1e11: - v = v / 1000.0 - return datetime.datetime.fromtimestamp(v) - except Exception: - return None - if isinstance(value, str): - fmts = [ - "%Y-%m-%dT%H:%M:%S.%fZ", - "%Y-%m-%dT%H:%M:%S.%f%z", - "%Y-%m-%dT%H:%M:%S%z", - "%Y-%m-%d %H:%M:%S", - "%Y-%m-%d", - ] - for fmt in fmts: - try: - return datetime.datetime.strptime(value, fmt) - except Exception: - continue - try: - return datetime.datetime.fromisoformat(value) - except Exception: - return None - return None - - -def pick_time(source: Dict[str, Any]) -> Optional[datetime.datetime]: - candidates = [ - "updated_at", - "created_at", - "@timestamp", - "timestamp", - "updatedAt", - "createdAt", - "time", - "ts", - "timeStr", - "update_time", - "create_time", - ] - for key in candidates: - if key in source: - t = parse_time(source.get(key)) - if t is not None: - return t - # 宽松匹配:尝试扫描所有可能的时间相关字段 - for k, v in source.items(): - lk = str(k).lower() - if any(s in lk for s in ["time", "date", "_at", "timestamp"]): - t = parse_time(v) - if t is not None: - return t - return None - - -def extract_makee_id_from_user_msg(user_msg: Any) -> Optional[str]: - # 支持dict或字符串形式 - if isinstance(user_msg, dict): - mk = user_msg.get("makee_id") - if isinstance(mk, str) and mk: - return mk - if isinstance(user_msg, str) and user_msg: - # 1) 尝试整体解析为JSON - try: - obj = json.loads(user_msg) - mk = obj.get("makee_id") - if isinstance(mk, str) and mk: - return mk - except Exception: - pass - # 2) 尝试截取大括号中的JSON - try: - start = user_msg.find("{") - end = user_msg.rfind("}") - if start != -1 and end != -1 and end > start: - candidate = user_msg[start : end + 1] - obj = json.loads(candidate) - mk = obj.get("makee_id") - if isinstance(mk, str) and mk: - return mk - except Exception: - pass - # 3) 正则匹配 makee_id - m = re.search(r"\bmakee_id\b\s*:\s*\"([^\"]+)\"", user_msg) - if m: - return m.group(1) - return None - - -def fetch_es_asr_log(makee_id: str, es_cfg: Dict[str, Any]) -> Optional[Dict[str, Any]]: - if requests is None: - raise RuntimeError("缺少requests依赖,请安装后再运行。") - host = es_cfg.get("host") - port = es_cfg.get("port") - scheme = es_cfg.get("scheme", "http") - user = es_cfg.get("user") - password = es_cfg.get("password") - index = "llm_asr_log" - if not host: - return None - base = f"{scheme}://{host}:{port}" - url = f"{base}/{index}/_search" - headers = {"Content-Type": "application/json"} - body = { - "query": { - "bool": { - "should": [ - {"term": {"makee_id": {"value": str(makee_id)}}}, - {"term": {"makee_id.keyword": {"value": str(makee_id)}}}, - ], - "minimum_should_match": 1, - } - }, - "size": 10, - "_source": [ - "makee_id", - "result_text", - "source", - "updated_at", - "created_at", - "@timestamp", - "timestamp", - "updatedAt", - "createdAt", - "time", - "ts", - "timeStr", - "update_time", - "create_time", - ], - } - auth = (user, password) if user and password else None - try: - if scheme == "https" and urllib3 is not None: - try: - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - except Exception: - pass - resp = requests.post(url, headers=headers, json=body, auth=auth, timeout=20, verify=False if scheme == "https" else True) - resp.raise_for_status() - data = resp.json() - except Exception: - return None - hits = data.get("hits", {}).get("hits", []) - if not hits: - return None - # 选最新的 - chosen = None - best_t = None - for h in hits: - src = h.get("_source", {}) or {} - t = pick_time(src) - if t is None: - continue - if best_t is None or t > best_t: - best_t = t - chosen = src - if chosen is None: - # 如果都没有时间,选第一条 - chosen = (hits[0].get("_source", {}) or {}) - return chosen - - -def get_es_config() -> Dict[str, Any]: - return { - "host": os.getenv("ES_HOST"), - "port": os.getenv("ES_PORT", "9200"), - "scheme": os.getenv("ES_SCHEME", "http"), - "user": os.getenv("ES_USER"), - "password": os.getenv("ES_PASSWORD"), - "index": "user-audio", - } - - -def fetch_es_user_audio(user_id: str, es_cfg: Dict[str, Any]) -> List[Dict[str, Any]]: - if requests is None: - raise RuntimeError("缺少requests依赖,请安装后再运行。") - - print(f" [ES] 开始查询user-audio索引...") - start_time = datetime.datetime.now() - - host = es_cfg.get("host") - port = es_cfg.get("port") - scheme = es_cfg.get("scheme", "http") - user = es_cfg.get("user") - password = es_cfg.get("password") - index = es_cfg.get("index", "user-audio") - - if not host: - return [] - - base = f"{scheme}://{host}:{port}" - url = f"{base}/{index}/_search" - headers = {"Content-Type": "application/json"} - - body = { - "query": { - "bool": { - "should": [ - {"term": {"userId": {"value": str(user_id)}}}, - {"term": {"userId.keyword": {"value": str(user_id)}}}, - ], - "minimum_should_match": 1, - } - }, - "size": 10000, - "_source": [ - "userId", - "userMsg", - "userName", - "soeData", - "audioUrl", - "asrStatus", - "componentId", - "componentType", - "dataVersion", - "updated_at", - "created_at", - "@timestamp", - "timestamp", - "updatedAt", - "createdAt", - "time", - "ts", - "timeStr", - "update_time", - "create_time", - ], - } - - auth = (user, password) if user and password else None - - try: - # 抑制自签证书下的HTTPS不安全警告 - if scheme == "https" and urllib3 is not None: - try: - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - except Exception: - pass - resp = requests.post(url, headers=headers, json=body, auth=auth, timeout=30, verify=False if scheme == "https" else True) - resp.raise_for_status() - data = resp.json() - except Exception as e: - raise RuntimeError(f"ES查询失败: {e}") - - hits = data.get("hits", {}).get("hits", []) - print(f" [ES] 查询完成,获得{len(hits)}条记录,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - if not hits: - return [] - - print(f" [ES] 开始处理音频数据...") - process_start = datetime.datetime.now() - - rows: List[Dict[str, Any]] = [] - asr_cache: Dict[str, Dict[str, Any]] = {} - makee_id_count = 0 - - for idx, h in enumerate(hits, 1): - # 每处理100条显示一次进度 - if idx % 100 == 0 or idx == len(hits): - print(f" [ES] 处理进度: {idx}/{len(hits)} ({idx*100//len(hits)}%)") - - src = h.get("_source", {}) or {} - row = { - "userId": src.get("userId"), - "userMsg": src.get("userMsg"), - "source": None, - "userName": src.get("userName"), - "soeData": to_json_str(src.get("soeData")), - "audioUrl": src.get("audioUrl"), - "asrStatus": src.get("asrStatus"), - "componentId": src.get("componentId"), - "componentType": src.get("componentType"), - "dataVersion": src.get("dataVersion"), - } - t = pick_time(src) - row["_time"] = t.isoformat() if t else None - row["timeStr"] = t.strftime("%Y-%m-%d %H:%M:%S") if t else None - # v1.2: 当userMsg包含makee_id时,补充查询llm_asr_log并回填 - mk = extract_makee_id_from_user_msg(row.get("userMsg")) - if mk: - makee_id_count += 1 - asr_doc = asr_cache.get(mk) - if asr_doc is None: - asr_doc = fetch_es_asr_log(mk, es_cfg) - if asr_doc is not None: - asr_cache[mk] = asr_doc - if asr_doc is not None: - rt = asr_doc.get("result_text") - if rt: - row["userMsg"] = rt - row["source"] = to_json_str(asr_doc.get("source")) - rows.append(row) - - print(f" [ES] 数据处理完成,发现{makee_id_count}条包含makee_id的记录,耗时{(datetime.datetime.now() - process_start).total_seconds():.2f}秒") - - print(f" [ES] 开始排序...") - rows.sort(key=lambda x: parse_time(x.get("_time")) or datetime.datetime.min, reverse=True) - print(f" [ES] 音频数据处理完成,总耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - return rows - - -def get_pg_conn() -> Any: - if psycopg2 is None: - raise RuntimeError("缺少psycopg2依赖,请安装后再运行。") - host = os.getenv("PG_DB_HOST") - port = int(os.getenv("PG_DB_PORT", "5432")) - user = os.getenv("PG_DB_USER") - password = os.getenv("PG_DB_PASSWORD") - dbname = os.getenv("PG_DB_DATABASE") - if not host or not dbname: - raise RuntimeError("PG数据库环境变量未配置完整") - conn = psycopg2.connect(host=host, port=port, user=user, password=password, dbname=dbname) - return conn - - -def get_mysql_conn(database: str) -> Any: - """ - 获取MySQL数据库连接 - - Args: - database: 数据库名,可选值:'vala_user' 或 'vala_test' - vala_user 使用 online 配置(环境变量后缀 _online) - vala_test 使用默认配置 - - Returns: - MySQL连接对象 - """ - if pymysql is None: - raise RuntimeError("缺少pymysql依赖,请安装后再运行。") - - # 根据数据库选择不同的环境变量配置 - if database == "vala_user": - # vala_user 数据库使用 online 配置 - host = os.getenv("MYSQL_HOST_online") - port = int(os.getenv("MYSQL_PORT_online", "3306")) - user = os.getenv("MYSQL_USERNAME_online") - password = os.getenv("MYSQL_PASSWORD_online") - if not host: - raise RuntimeError("MySQL数据库环境变量未配置完整(缺少MYSQL_HOST_online)") - else: - # vala_test 等其他数据库使用默认配置 - host = os.getenv("MYSQL_HOST") - port = int(os.getenv("MYSQL_PORT", "3306")) - user = os.getenv("MYSQL_USERNAME") - password = os.getenv("MYSQL_PASSWORD") - if not host: - raise RuntimeError("MySQL数据库环境变量未配置完整(缺少MYSQL_HOST)") - - conn = pymysql.connect( - host=host, - port=port, - user=user, - password=password, - database=database, # 直接使用传入的数据库名 - charset="utf8mb4", - cursorclass=pymysql.cursors.DictCursor, - ) - return conn - - -def get_id_2_unit_index(conn: Any) -> Dict[int, int]: - """ - 从MySQL获取 story_id 到 unit_id 的映射关系 - - Args: - conn: MySQL数据库连接 - - Returns: - 映射字典 {story_id: unit_id} - """ - sql = """ - SELECT * - FROM `vala_game_info` - WHERE id > 0 - AND `vala_game_info`.`deleted_at` IS NULL - ORDER BY season_package_id asc, `index` asc - """ - try: - with conn.cursor() as cur: - cur.execute(sql) - rows = cur.fetchall() or [] - # 构建映射表:按查询结果的顺序,索引即为unit_id - id_2_unit_index = {} - for index, row in enumerate(rows): - id_2_unit_index[row["id"]] = index - return id_2_unit_index - except Exception as e: - print(f"[ERROR] 获取story_id到unit_id映射失败: {e}") - return {} - - -def get_chapter_id_to_lesson_id(conn: Any) -> Dict[int, int]: - """ - 从MySQL获取 chapter_id 到 lesson_id 的映射关系 - - Args: - conn: MySQL数据库连接 - - Returns: - 映射字典 {chapter_id: lesson_id} - """ - sql = """ - SELECT id, `index` - FROM `vala_game_chapter` - WHERE deleted_at IS NULL - """ - try: - with conn.cursor() as cur: - cur.execute(sql) - rows = cur.fetchall() or [] - # 构建映射表:chapter的index字段即为lesson_id - chapter_id_to_lesson_id = {} - for row in rows: - chapter_id_to_lesson_id[row["id"]] = row["index"] - return chapter_id_to_lesson_id - except Exception as e: - print(f"[ERROR] 获取chapter_id到lesson_id映射失败: {e}") - return {} - - -# 组件类型到组件名称的映射 -COMPONENT_TYPE_NAMES = { - "mid_vocab_item": "物品互动", - "mid_vocab_image": "图片互动", - "mid_vocab_fillBlank": "填词互动", - "mid_vocab_instruction": "指令互动", - "mid_sentence_dialogue": "对话互动", # 需要根据mode进一步判断 - "mid_sentence_voice": "语音互动", - "mid_sentence_material": "材料互动", - "mid_sentence_makeSentence": "造句互动", - "mid_grammar_cloze": "挖空互动", - "mid_grammar_sentence": "组句互动", - "mid_pron_pron": "发音互动", - "core_speaking_reply": "口语快答", - "core_speaking_inquiry": "口语妙问", - "core_speaking_explore": "口语探讨", - "core_speaking_monologue": "口语独白", - "core_reading_order": "合作阅读", - "core_listening_order": "合作听力", - "core_writing_imgMakeSentence": "看图组句", - "core_writing_imgWrite": "看图撰写", - "core_writing_questionMakeSentence": "问题组句", - "core_writing_questionWrite": "问题撰写", -} - - -def get_component_name(c_type: str, component_config: Optional[Dict[str, Any]]) -> str: - """ - 根据c_type和组件配置获取组件名称 - - Args: - c_type: 组件类型 - component_config: 组件配置(用于判断对话互动的mode) - - Returns: - 组件名称 - """ - if not c_type: - return "" - - # 特殊处理:对话互动需要根据mode判断 - if c_type == "mid_sentence_dialogue" and component_config: - try: - question = component_config.get("question", {}) - mode = question.get("mode", "") - if mode == "express": - return "对话互动-表达" - elif mode == "read": - return "对话互动-朗读" - except Exception: - pass - - return COMPONENT_TYPE_NAMES.get(c_type, "") - - -def batch_fetch_component_configs(play_records: List[Dict[str, Any]], mysql_conn: Any) -> Dict[str, Dict[str, Any]]: - """ - 批量查询组件配置信息 - - Args: - play_records: 播放记录列表 - mysql_conn: MySQL连接 - - Returns: - 组件配置映射 {c_type_c_id: {title, component_config, kp_relation_info}} - """ - print(f" [MySQL] 开始批量查询组件配置...") - start_time = datetime.datetime.now() - - # 收集需要查询的c_type和c_id - mid_c_ids = set() - core_c_ids = set() - mid_type_id_pairs = [] # 用于调试日志 - core_type_id_pairs = [] - - for record in play_records: - c_type = record.get("c_type", "") - c_id = record.get("c_id") - if c_type and c_id: - if c_type.startswith("mid"): - mid_c_ids.add(c_id) - mid_type_id_pairs.append((c_type, c_id)) - elif c_type.startswith("core"): - core_c_ids.add(c_id) - core_type_id_pairs.append((c_type, c_id)) - - print(f" [MySQL] 需要查询中互动组件: {len(mid_c_ids)}个, 核心互动组件: {len(core_c_ids)}个") - if mid_c_ids: - print(f" [MySQL] 中互动组件ID列表(前10个): {sorted(list(mid_c_ids))[:10]}") - if core_c_ids: - print(f" [MySQL] 核心互动组件ID列表(前10个): {sorted(list(core_c_ids))[:10]}") - - config_map = {} - - # 批量查询middle_interaction_component - if mid_c_ids: - try: - with mysql_conn.cursor() as cur: - placeholders = ','.join(['%s'] * len(mid_c_ids)) - sql = f""" - SELECT c_id, c_type, title, component_config, kp_relation_info - FROM middle_interaction_component - WHERE c_id IN ({placeholders}) AND deleted_at IS NULL - """ - print(f" [MySQL] 执行中互动组件查询,查询条件: c_id IN ({len(mid_c_ids)}个ID)") - cur.execute(sql, tuple(mid_c_ids)) - rows = cur.fetchall() or [] - print(f" [MySQL] 查询到{len(rows)}条中互动组件配置") - - if len(rows) == 0 and len(mid_c_ids) > 0: - print(f" [MySQL] [警告] 查询结果为空!可能的原因:") - print(f" [MySQL] - 数据库中没有匹配的c_id记录") - print(f" [MySQL] - deleted_at字段不为NULL") - print(f" [MySQL] - c_id不存在") - - for idx, row in enumerate(rows): - c_type = row.get("c_type", "") - c_id = row.get("c_id") - key = f"{c_type}_{c_id}" - - if idx < 3: # 输出前3条的详细信息 - print(f" [MySQL] [样例{idx+1}] id={c_id}, c_type={c_type}, key={key}") - print(f" [MySQL] [样例{idx+1}] title={row.get('title', '')[:50]}") - - # 解析component_config - component_config = row.get("component_config") - if isinstance(component_config, str): - try: - component_config = json.loads(component_config) - except Exception as e: - print(f" [MySQL] [警告] 解析component_config失败 (id={c_id}): {e}") - component_config = {} - - # 提取question字段作为摘要 - summary = "" - if isinstance(component_config, dict): - question = component_config.get("question") - summary = to_json_str(question) if question else "" - if idx < 3 and question: - print(f" [MySQL] [样例{idx+1}] 提取到question字段,长度: {len(summary)}") - - # 解析kp_relation_info - kp_relation_info = row.get("kp_relation_info") - if isinstance(kp_relation_info, str): - try: - kp_relation_info = json.loads(kp_relation_info) - except Exception: - kp_relation_info = [] - - config_map[key] = { - "title": row.get("title", ""), - "component_config": component_config, - "summary": summary, - "kp_relation_info": to_json_str(kp_relation_info), - } - - print(f" [MySQL] 中互动组件配置已加入config_map,当前map大小: {len(config_map)}") - except Exception as e: - print(f" [MySQL] [错误] 查询中互动组件配置失败: {e}") - import traceback - traceback.print_exc() - - # 批量查询core_interaction_component - if core_c_ids: - try: - with mysql_conn.cursor() as cur: - placeholders = ','.join(['%s'] * len(core_c_ids)) - sql = f""" - SELECT c_id, c_type, title, component_config, kp_relation_info - FROM core_interaction_component - WHERE c_id IN ({placeholders}) AND deleted_at IS NULL - """ - print(f" [MySQL] 执行核心互动组件查询,查询条件: c_id IN ({len(core_c_ids)}个ID)") - cur.execute(sql, tuple(core_c_ids)) - rows = cur.fetchall() or [] - print(f" [MySQL] 查询到{len(rows)}条核心互动组件配置") - - if len(rows) == 0 and len(core_c_ids) > 0: - print(f" [MySQL] [警告] 查询结果为空!可能的原因:") - print(f" [MySQL] - 数据库中没有匹配的c_id记录") - print(f" [MySQL] - deleted_at字段不为NULL") - print(f" [MySQL] - c_id不存在") - - for idx, row in enumerate(rows): - c_type = row.get("c_type", "") - c_id = row.get("c_id") - key = f"{c_type}_{c_id}" - - if idx < 3: # 输出前3条的详细信息 - print(f" [MySQL] [样例{idx+1}] id={c_id}, c_type={c_type}, key={key}") - print(f" [MySQL] [样例{idx+1}] title={row.get('title', '')[:50]}") - - # 解析component_config - component_config = row.get("component_config") - if isinstance(component_config, str): - try: - component_config = json.loads(component_config) - except Exception as e: - print(f" [MySQL] [警告] 解析component_config失败 (id={c_id}): {e}") - component_config = {} - - # 提取taskInfo字段作为摘要 - summary = "" - if isinstance(component_config, dict): - task_info = component_config.get("taskInfo") - summary = to_json_str(task_info) if task_info else "" - if idx < 3 and task_info: - print(f" [MySQL] [样例{idx+1}] 提取到taskInfo字段,长度: {len(summary)}") - - # 解析kp_relation_info - kp_relation_info = row.get("kp_relation_info") - if isinstance(kp_relation_info, str): - try: - kp_relation_info = json.loads(kp_relation_info) - except Exception: - kp_relation_info = [] - - config_map[key] = { - "title": row.get("title", ""), - "component_config": component_config, - "summary": summary, - "kp_relation_info": to_json_str(kp_relation_info), - } - - print(f" [MySQL] 核心互动组件配置已加入config_map,当前map大小: {len(config_map)}") - except Exception as e: - print(f" [MySQL] [错误] 查询核心互动组件配置失败: {e}") - import traceback - traceback.print_exc() - - print(f" [MySQL] 组件配置查询完成,共{len(config_map)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return config_map - - -def calculate_accuracy(question_list: Any) -> float: - """ - 计算问题列表的正确率 - - Args: - question_list: 问题列表(可能是JSON字符串或list) - - Returns: - 正确率(百分比,保留2位小数) - """ - try: - if isinstance(question_list, str): - question_list = json.loads(question_list) - - if not isinstance(question_list, list) or len(question_list) == 0: - return 0.0 - - total = len(question_list) - correct = sum(1 for q in question_list if q.get('isRight') == True) - accuracy = round(correct / total * 100, 2) if total > 0 else 0.0 - - return accuracy - except Exception: - return 0.0 - - - -def fetch_character_ids_by_account(account_id: str, conn: Any) -> List[str]: - """根据账户id查询对应的角色id列表""" - sql = "SELECT id FROM vala_app_character WHERE account_id = %s" - try: - with conn.cursor() as cur: - cur.execute(sql, (account_id,)) - rows = cur.fetchall() or [] - return [str(row["id"]) for row in rows if row.get("id")] - except Exception as e: - print(f"[ERROR] 查询账户id={account_id}的角色id失败: {e}") - return [] - - -def fetch_pg_play_records(user_id: str, conn: Any, mysql_conn: Any) -> List[Dict[str, Any]]: - """ - 查询互动组件学习记录并补充组件配置信息 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - mysql_conn: MySQL数据库连接 - - Returns: - 互动组件学习记录列表 - """ - print(f" [PG] 开始查询互动组件学习记录(8张分表)...") - start_time = datetime.datetime.now() - - tables = [f"user_component_play_record_{i}" for i in range(8)] - rows: List[Dict[str, Any]] = [] - with conn.cursor(cursor_factory=RealDictCursor) as cur: - for t in tables: - try: - cur.execute( - f""" - SELECT user_id, component_unique_code, session_id, c_type, c_id, - play_result, user_behavior_info, updated_at - FROM {t} - WHERE user_id = %s - ORDER BY updated_at DESC - """, - (user_id,), - ) - part = cur.fetchall() or [] - if part: - print(f" [PG] 表{t}查到{len(part)}条记录") - for r in part: - r = dict(r) - r["play_result"] = to_json_str(r.get("play_result")) - r["user_behavior_info"] = to_json_str(r.get("user_behavior_info")) - # 将带时区的时间转换为无时区,避免Excel写入报错 - upd = r.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - r["updated_at"] = upd.replace(tzinfo=None) - except Exception: - # 回退为字符串 - r["updated_at"] = str(upd) - rows.append(r) - except Exception as e: - print(f" [PG] 表{t}查询失败: {e}") - continue - - rows.sort(key=lambda x: parse_time(x.get("updated_at")) or datetime.datetime.min, reverse=True) - print(f" [PG] 互动组件学习记录查询完成,共{len(rows)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - # 批量查询组件配置 - if rows and mysql_conn: - config_map = batch_fetch_component_configs(rows, mysql_conn) - - # 补充组件信息 - print(f" [PG] 开始补充组件配置信息...") - filled_count = 0 - empty_count = 0 - sample_keys = [] - sample_mode_check = [] # 检查对话互动的mode - - for r in rows: - c_type = r.get("c_type", "") - c_id = r.get("c_id") - key = f"{c_type}_{c_id}" if c_type and c_id else "" - - config = config_map.get(key, {}) - component_config = config.get("component_config", {}) - - component_name = get_component_name(c_type, component_config) - r["互动组件名称"] = component_name - r["组件标题"] = config.get("title", "") - r["组件配置摘要"] = config.get("summary", "") - r["知识点"] = config.get("kp_relation_info", "") - - # 统计填充情况 - if config: - filled_count += 1 - if len(sample_keys) < 3: - sample_keys.append((key, component_name, r["组件标题"][:30] if r["组件标题"] else "")) - - # 检查对话互动的mode - if c_type == "mid_sentence_dialogue" and len(sample_mode_check) < 3: - mode = "" - if isinstance(component_config, dict): - question = component_config.get("question", {}) - if isinstance(question, dict): - mode = question.get("mode", "") - sample_mode_check.append({ - "key": key, - "mode": mode, - "component_name": component_name - }) - else: - empty_count += 1 - if empty_count <= 5: # 输出前5个未匹配的key - print(f" [PG] [警告] 未找到组件配置: key={key}") - - print(f" [PG] 组件配置信息补充完成") - print(f" [PG] 匹配到配置: {filled_count}条, 未匹配: {empty_count}条") - if sample_keys: - print(f" [PG] 样例数据(前3条):") - for key, name, title in sample_keys: - print(f" [PG] - key={key}, 名称={name}, 标题={title}") - - if sample_mode_check: - print(f" [PG] 对话互动mode检查(前3条):") - for s in sample_mode_check: - print(f" [PG] - key={s['key']}, mode={s['mode']}, 最终名称={s['component_name']}") - - return rows - - -def fetch_pg_unit_review(user_id: str, conn: Any, id_2_unit_index: Dict[int, int], chapter_id_to_lesson_id: Dict[int, int]) -> List[Dict[str, Any]]: - """ - 查询课程巩固记录 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - id_2_unit_index: story_id到unit_id的映射字典 - chapter_id_to_lesson_id: chapter_id到lesson_id的映射字典 - - Returns: - 课程巩固记录列表 - """ - print(f" [PG] 开始查询课程巩固记录...") - start_time = datetime.datetime.now() - - sql = ( - "SELECT user_id, story_id, chapter_id, question_list, updated_at " - "FROM user_unit_review_question_result WHERE user_id = %s ORDER BY updated_at DESC" - ) - with conn.cursor(cursor_factory=RealDictCursor) as cur: - try: - cur.execute(sql, (user_id,)) - rows = cur.fetchall() or [] - except Exception as e: - print(f" [PG] 课程巩固记录查询失败: {e}") - rows = [] - out: List[Dict[str, Any]] = [] - for r in rows: - d = dict(r) - - # 映射 story_id 到 unit_id - story_id = d.get("story_id") - unit_id = id_2_unit_index.get(story_id) if story_id else None - d["unit_id"] = unit_id - - # 映射 chapter_id 到 lesson_id - chapter_id = d.get("chapter_id") - lesson_id = chapter_id_to_lesson_id.get(chapter_id) if chapter_id else None - d["lesson_id"] = lesson_id - - # 计算正确率 - question_list = d.get("question_list") - d["正确率"] = calculate_accuracy(question_list) - - d["question_list"] = to_json_str(question_list) - upd = d.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - d["updated_at"] = upd.replace(tzinfo=None) - except Exception: - d["updated_at"] = str(upd) - out.append(d) - - print(f" [PG] 课程巩固记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return out - - -def fetch_pg_unit_challenge(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]: - """ - 查询单元挑战记录 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - id_2_unit_index: story_id到unit_id的映射字典 - - Returns: - 单元挑战记录列表 - """ - print(f" [PG] 开始查询单元挑战记录...") - start_time = datetime.datetime.now() - - sql = ( - "SELECT user_id, story_id, category, score_text, question_list, updated_at " - "FROM user_unit_challenge_question_result WHERE user_id = %s ORDER BY updated_at DESC" - ) - with conn.cursor(cursor_factory=RealDictCursor) as cur: - try: - cur.execute(sql, (user_id,)) - rows = cur.fetchall() or [] - except Exception as e: - print(f" [PG] 单元挑战记录查询失败: {e}") - rows = [] - out: List[Dict[str, Any]] = [] - for r in rows: - d = dict(r) - - # 映射 story_id 到 unit_id - story_id = d.get("story_id") - unit_id = id_2_unit_index.get(story_id) if story_id else None - d["unit_id"] = unit_id - - d["question_list"] = to_json_str(d.get("question_list")) - upd = d.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - d["updated_at"] = upd.replace(tzinfo=None) - except Exception: - d["updated_at"] = str(upd) - out.append(d) - - print(f" [PG] 单元挑战记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return out - - -def fetch_pg_unit_summary(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]: - """ - 查询单元总结知识点结果数据 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - id_2_unit_index: story_id到unit_id的映射字典 - - Returns: - 单元总结记录列表 - """ - print(f" [PG] 开始查询单元总结记录...") - start_time = datetime.datetime.now() - - sql = ( - "SELECT id, user_id, story_id, updated_at, km_id, km_type, play_time " - "FROM user_unit_summary_km_result WHERE user_id = %s AND deleted_at IS NULL ORDER BY updated_at DESC" - ) - with conn.cursor(cursor_factory=RealDictCursor) as cur: - try: - cur.execute(sql, (user_id,)) - rows = cur.fetchall() or [] - except Exception as e: - print(f" [PG] 单元总结记录查询失败: {e}") - rows = [] - - out: List[Dict[str, Any]] = [] - for r in rows: - d = dict(r) - # 映射 story_id 到 unit_id - story_id = d.get("story_id") - unit_id = id_2_unit_index.get(story_id) if story_id else None - d["unit_id"] = unit_id - - # 转换 play_time (毫秒) 为秒 (整数) - play_time = d.get("play_time") - d["play_time_seconds"] = play_time // 1000 if play_time else 0 - - # 移除时区信息 - upd = d.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - d["updated_at"] = upd.replace(tzinfo=None) - except Exception: - d["updated_at"] = str(upd) - out.append(d) - - print(f" [PG] 单元总结记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return out - - -def generate_statistics(sheet2_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]]) -> tuple: - """ - 生成汇总统计数据 - - Args: - sheet2_rows: 互动组件学习记录 - sheet5_rows: 单元总结记录 - - Returns: - (组件统计DataFrame, 知识点统计DataFrame, 单元时长统计DataFrame) - """ - if pd is None: - raise RuntimeError("缺少pandas依赖,请安装后再运行。") - - print(f" [统计] 开始生成汇总统计数据...") - start_time = datetime.datetime.now() - - from collections import defaultdict - - # ============ a. 所有互动-按互动组件类型-通过情况统计 ============ - component_stats_data = [] - component_stats = defaultdict(lambda: {"Perfect": 0, "Good": 0, "Failed": 0, "Pass": 0, "Oops": 0, "total": 0}) - - # 用于调试 - sample_results = [] - parse_error_count = 0 - - for idx, record in enumerate(sheet2_rows): - component_name = record.get("互动组件名称", "") - if not component_name: - continue - - play_result_str = record.get("play_result", "") - - # 解析play_result - result = "" - try: - # 先判断是否是简单的字符串(Perfect/Good/Failed/Pass/Oops) - if isinstance(play_result_str, str): - # 去除空格后检查 - stripped = play_result_str.strip() - if stripped in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - # 直接使用 - result = stripped - else: - # 尝试JSON解析 - try: - play_result = json.loads(play_result_str) - if isinstance(play_result, dict): - result = play_result.get("result", "") - else: - result = "" - except: - result = "" - else: - # 如果不是字符串,尝试当dict处理 - if isinstance(play_result_str, dict): - result = play_result_str.get("result", "") - else: - result = "" - - # 收集前3个样例 - if idx < 3: - sample_results.append({ - "component": component_name, - "raw": str(play_result_str)[:100], - "result": result - }) - except Exception as e: - parse_error_count += 1 - if parse_error_count <= 3: - print(f" [统计] [警告] 解析play_result失败 (第{idx+1}条): {e}, 原始值: {str(play_result_str)[:100]}") - result = "" - - component_stats[component_name]["total"] += 1 - if result in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - component_stats[component_name][result] += 1 - - print(f" [统计] play_result解析样例(前3条):") - for s in sample_results: - print(f" [统计] - 组件: {s['component']}, 结果: {s['result']}, 原始: {s['raw']}") - if parse_error_count > 0: - print(f" [统计] play_result解析失败总数: {parse_error_count}") - - # 生成统计数据行 - for component_name in sorted(component_stats.keys()): - stats = component_stats[component_name] - total = stats["total"] - perfect = stats["Perfect"] - good = stats["Good"] - failed = stats["Failed"] - pass_count = stats["Pass"] - oops = stats["Oops"] - - perfect_ratio = round(perfect / total * 100, 2) if total > 0 else 0 - good_ratio = round(good / total * 100, 2) if total > 0 else 0 - failed_ratio = round(failed / total * 100, 2) if total > 0 else 0 - pass_ratio = round(pass_count / total * 100, 2) if total > 0 else 0 - oops_ratio = round(oops / total * 100, 2) if total > 0 else 0 - - component_stats_data.append({ - "互动组件名称": component_name, - "总数量": total, - "Perfect数量": perfect, - "Good数量": good, - "Failed数量": failed, - "Pass数量": pass_count, - "Oops数量": oops, - "Perfect比例(%)": perfect_ratio, - "Good比例(%)": good_ratio, - "Failed比例(%)": failed_ratio, - "Pass比例(%)": pass_ratio, - "Oops比例(%)": oops_ratio, - }) - - # ============ b. 中互动组件-按知识点-通过情况统计 ============ - kp_stats_data = [] - kp_stats = defaultdict(lambda: {"Perfect": 0, "Good": 0, "Failed": 0, "Pass": 0, "Oops": 0, "total": 0}) - - # 调试信息 - mid_count = 0 - has_kp_count = 0 - sample_kp_records = [] - - for idx, record in enumerate(sheet2_rows): - c_type = record.get("c_type", "") - if not c_type or not c_type.startswith("mid"): - continue - - mid_count += 1 - kp_relation_info_str = record.get("知识点", "") - - if not kp_relation_info_str: - continue - - has_kp_count += 1 - - # 解析知识点 - try: - if isinstance(kp_relation_info_str, str): - kp_relation_info = json.loads(kp_relation_info_str) - else: - kp_relation_info = kp_relation_info_str - - if not isinstance(kp_relation_info, list): - continue - - # 收集样例 - if len(sample_kp_records) < 3: - sample_kp_records.append({ - "c_type": c_type, - "kp_count": len(kp_relation_info), - "kp_info": str(kp_relation_info)[:200] - }) - - # 解析play_result(使用相同的逻辑) - play_result_str = record.get("play_result", "") - result = "" - if isinstance(play_result_str, str): - stripped = play_result_str.strip() - if stripped in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - result = stripped - else: - try: - play_result = json.loads(play_result_str) - if isinstance(play_result, dict): - result = play_result.get("result", "") - except: - pass - elif isinstance(play_result_str, dict): - result = play_result_str.get("result", "") - - # 为每个知识点统计 - for kp in kp_relation_info: - if not isinstance(kp, dict): - continue - - kp_id = kp.get("kpId", "") - kp_type = kp.get("kpType", "") - kp_title = kp.get("kpTitle", "") - - if not kp_id: - continue - - kp_key = f"{kp_id}|{kp_type}|{kp_title}" - kp_stats[kp_key]["total"] += 1 - if result in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - kp_stats[kp_key][result] += 1 - - except Exception as e: - if len(sample_kp_records) < 5: - print(f" [统计] [警告] 解析知识点失败: {e}, 原始值: {str(kp_relation_info_str)[:100]}") - continue - - print(f" [统计] 中互动组件统计: 总数={mid_count}, 有知识点={has_kp_count}, 知识点条目数={len(kp_stats)}") - if sample_kp_records: - print(f" [统计] 知识点样例(前3条):") - for s in sample_kp_records: - print(f" [统计] - c_type={s['c_type']}, 知识点数量={s['kp_count']}, 内容={s['kp_info']}") - - # 生成知识点统计数据行 - for kp_key in sorted(kp_stats.keys()): - parts = kp_key.split("|") - if len(parts) != 3: - continue - - kp_id, kp_type, kp_title = parts - stats = kp_stats[kp_key] - total = stats["total"] - perfect = stats["Perfect"] - good = stats["Good"] - failed = stats["Failed"] - pass_count = stats["Pass"] - oops = stats["Oops"] - - perfect_ratio = round(perfect / total * 100, 2) if total > 0 else 0 - good_ratio = round(good / total * 100, 2) if total > 0 else 0 - failed_ratio = round(failed / total * 100, 2) if total > 0 else 0 - pass_ratio = round(pass_count / total * 100, 2) if total > 0 else 0 - oops_ratio = round(oops / total * 100, 2) if total > 0 else 0 - - kp_stats_data.append({ - "知识点ID": kp_id, - "知识点类型": kp_type, - "知识点标题": kp_title, - "总数量": total, - "Perfect数量": perfect, - "Good数量": good, - "Failed数量": failed, - "Pass数量": pass_count, - "Oops数量": oops, - "Perfect比例(%)": perfect_ratio, - "Good比例(%)": good_ratio, - "Failed比例(%)": failed_ratio, - "Pass比例(%)": pass_ratio, - "Oops比例(%)": oops_ratio, - }) - - # ============ c. 单元总结-按单元统计时长 ============ - unit_time_stats_data = [] - unit_time_stats = defaultdict(int) - - for record in sheet5_rows: - unit_id = record.get("unit_id") - play_time_seconds = record.get("play_time_seconds", 0) - - if unit_id is not None: - unit_time_stats[unit_id] += play_time_seconds - - # 生成单元时长统计数据行 - for unit_id in sorted(unit_time_stats.keys()): - total_seconds = unit_time_stats[unit_id] - total_minutes = int(total_seconds / 60) - - unit_time_stats_data.append({ - "单元ID": f"unit_{unit_id}", - "总时长(秒)": total_seconds, - "总时长(分钟)": total_minutes, - }) - - print(f" [统计] 汇总统计数据生成完成,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - print(f" [统计] 生成了{len(component_stats_data)}条组件统计, {len(kp_stats_data)}条知识点统计, {len(unit_time_stats_data)}条单元时长统计") - - return ( - pd.DataFrame(component_stats_data), - pd.DataFrame(kp_stats_data), - pd.DataFrame(unit_time_stats_data) - ) - - - -def write_excel(path: str, sheet1_rows: List[Dict[str, Any]], sheet2_rows: List[Dict[str, Any]], sheet3_rows: List[Dict[str, Any]], sheet4_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]], stats_component_df: Any, stats_kp_df: Any, stats_unit_time_df: Any) -> None: - if pd is None: - raise RuntimeError("缺少pandas依赖,请安装后再运行。") - - print(f" [Excel] 开始写入Excel文件: {path}") - start_time = datetime.datetime.now() - - out_dir = os.path.dirname(path) or "." - os.makedirs(out_dir, exist_ok=True) - with pd.ExcelWriter(path, engine="openpyxl") as writer: - pd.DataFrame(sheet1_rows, columns=SHEET1_COLUMNS).to_excel(writer, sheet_name="全部音频数据", index=False) - pd.DataFrame(sheet2_rows, columns=SHEET2_COLUMNS).to_excel(writer, sheet_name="互动组件学习记录", index=False) - pd.DataFrame(sheet3_rows, columns=SHEET3_COLUMNS).to_excel(writer, sheet_name="课程巩固记录", index=False) - pd.DataFrame(sheet4_rows, columns=SHEET4_COLUMNS).to_excel(writer, sheet_name="单元挑战记录", index=False) - pd.DataFrame(sheet5_rows, columns=SHEET5_COLUMNS).to_excel(writer, sheet_name="单元总结记录", index=False) - stats_component_df.to_excel(writer, sheet_name="统计-互动组件通过情况", index=False) - stats_kp_df.to_excel(writer, sheet_name="统计-知识点通过情况", index=False) - stats_unit_time_df.to_excel(writer, sheet_name="统计-单元总结时长", index=False) - - print(f" [Excel] 写入完成,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - -def get_date_str() -> str: - """获取当前日期字符串 格式:YYYYMMDD""" - return datetime.datetime.now().strftime("%Y%m%d") - - -def export_single_user(user_id: str, es_cfg: Dict[str, Any], pg_conn: Any, mysql_conn: Any, output_path: str, id_2_unit_index: Dict[int, int], chapter_id_to_lesson_id: Dict[int, int]) -> bool: - """ - 导出单个角色id的数据 - - Args: - user_id: 角色ID - es_cfg: ES配置 - pg_conn: PostgreSQL连接 - mysql_conn: MySQL连接 - output_path: 输出路径 - id_2_unit_index: story_id到unit_id的映射字典 - chapter_id_to_lesson_id: chapter_id到lesson_id的映射字典 - - Returns: - True表示成功,False表示失败 - """ - try: - print(f"\n[INFO] ========== 开始导出角色id={user_id} ==========") - total_start_time = datetime.datetime.now() - - # 查询ES数据 - sheet1_rows = fetch_es_user_audio(user_id, es_cfg) - - # 查询PG数据 - sheet2_rows = fetch_pg_play_records(user_id, pg_conn, mysql_conn) - sheet3_rows = fetch_pg_unit_review(user_id, pg_conn, id_2_unit_index, chapter_id_to_lesson_id) - sheet4_rows = fetch_pg_unit_challenge(user_id, pg_conn, id_2_unit_index) - sheet5_rows = fetch_pg_unit_summary(user_id, pg_conn, id_2_unit_index) - - # 检查是否有有效数据 - total_records = len(sheet1_rows) + len(sheet2_rows) + len(sheet3_rows) + len(sheet4_rows) + len(sheet5_rows) - print(f" [统计] 数据汇总:") - print(f" - 全部音频数据: {len(sheet1_rows)}条") - print(f" - 互动组件学习记录: {len(sheet2_rows)}条") - print(f" - 课程巩固记录: {len(sheet3_rows)}条") - print(f" - 单元挑战记录: {len(sheet4_rows)}条") - print(f" - 单元总结记录: {len(sheet5_rows)}条") - print(f" - 总计: {total_records}条") - - if total_records == 0: - print(f"[WARN] 角色id={user_id} 没有找到任何有效记录,跳过导出") - return False - - # 生成汇总统计数据 - stats_component_df, stats_kp_df, stats_unit_time_df = generate_statistics(sheet2_rows, sheet5_rows) - - # 写入Excel - write_excel(output_path, sheet1_rows, sheet2_rows, sheet3_rows, sheet4_rows, sheet5_rows, stats_component_df, stats_kp_df, stats_unit_time_df) - - total_time = (datetime.datetime.now() - total_start_time).total_seconds() - print(f"[INFO] 角色id={user_id} 导出成功") - print(f"[INFO] 文件路径: {output_path}") - print(f"[INFO] 总耗时: {total_time:.2f}秒") - print(f"[INFO] ========== 完成 ==========\n") - return True - - except Exception as e: - print(f"[ERROR] 角色id={user_id} 导出失败: {e}") - import traceback - traceback.print_exc() - return False - - -def main(): - load_env() - - # 确定运行模式并收集需要导出的角色id列表 - user_id_list: List[tuple] = [] # [(user_id, account_id or None), ...] - date_str = get_date_str() - - # 检查三种模式的配置 - has_user_id = USER_ID is not None - has_user_id_list = USER_ID_LIST is not None and len(USER_ID_LIST) > 0 - has_account_id_list = ACCOUNT_ID_LIST is not None and len(ACCOUNT_ID_LIST) > 0 - - # 验证只能配置一种模式 - mode_count = sum([has_user_id, has_user_id_list, has_account_id_list]) - if mode_count == 0: - raise RuntimeError("请配置 USER_ID、USER_ID_LIST 或 ACCOUNT_ID_LIST 中的一个") - if mode_count > 1: - raise RuntimeError("USER_ID、USER_ID_LIST、ACCOUNT_ID_LIST 只能配置一个,请检查配置") - - # 模式1:单个角色id - if has_user_id: - user_id_list = [(str(USER_ID), None)] - print(f"[INFO] 运行模式:单个角色id") - - # 模式2:角色id列表 - elif has_user_id_list: - user_id_list = [(str(uid), None) for uid in USER_ID_LIST] - print(f"[INFO] 运行模式:角色id列表,共{len(user_id_list)}个角色") - - # 模式3:账户id列表 - elif has_account_id_list: - print(f"[INFO] 运行模式:账户id列表,共{len(ACCOUNT_ID_LIST)}个账户") - mysql_conn = None - try: - mysql_conn = get_mysql_conn("vala_user") # 查询用户表,使用 vala_user 数据库 - for account_id in ACCOUNT_ID_LIST: - account_id_str = str(account_id) - print(f"[INFO] 查询账户id={account_id_str}对应的角色id...") - character_ids = fetch_character_ids_by_account(account_id_str, mysql_conn) - if not character_ids: - print(f"[WARN] 账户id={account_id_str} 未找到关联的角色id,跳过") - continue - print(f"[INFO] 账户id={account_id_str} 找到{len(character_ids)}个角色id: {character_ids}") - for cid in character_ids: - user_id_list.append((cid, account_id_str)) - finally: - if mysql_conn: - try: - mysql_conn.close() - except Exception: - pass - - if not user_id_list: - print("[WARN] 没有需要导出的角色id,程序退出") - return - - # 初始化连接 - es_cfg = get_es_config() - pg_conn = get_pg_conn() - - # 获取映射表(只需要查询一次,所有角色共用) - print(f"\n[INFO] ===== 准备工作:获取映射表 =====") - mysql_conn = None - id_2_unit_index = {} - chapter_id_to_lesson_id = {} - try: - print(f"[INFO] 正在连接MySQL数据库(vala_test)...") - mysql_conn = get_mysql_conn("vala_test") # 查询游戏配置表,使用 vala_test 数据库 - print(f"[INFO] 正在获取 story_id 到 unit_id 的映射...") - id_2_unit_index = get_id_2_unit_index(mysql_conn) - print(f"[INFO] 成功获取 {len(id_2_unit_index)} 个 story_id 映射") - print(f"[INFO] 正在获取 chapter_id 到 lesson_id 的映射...") - chapter_id_to_lesson_id = get_chapter_id_to_lesson_id(mysql_conn) - print(f"[INFO] 成功获取 {len(chapter_id_to_lesson_id)} 个 chapter_id 映射") - except Exception as e: - print(f"[ERROR] 获取映射表失败: {e}") - import traceback - traceback.print_exc() - if pg_conn: - try: - pg_conn.close() - except Exception: - pass - if mysql_conn: - try: - mysql_conn.close() - except Exception: - pass - return - - try: - # 统计信息 - success_count = 0 - skip_count = 0 - - print(f"\n[INFO] ===== 开始批量导出 =====") - print(f"[INFO] 共需导出{len(user_id_list)}个角色\n") - batch_start_time = datetime.datetime.now() - - # 循环处理每个角色id - for idx, (user_id, account_id) in enumerate(user_id_list, 1): - print(f"\n{'='*60}") - print(f"[INFO] 进度: {idx}/{len(user_id_list)} ({idx*100//len(user_id_list)}%)") - print(f"{'='*60}") - - # 生成输出文件名 - if account_id is None: - # 模式1和模式2:角色id_{}_导出时间_{}.xlsx - filename = f"角色id_{user_id}_导出时间_{date_str}.xlsx" - else: - # 模式3:账户id_{}_角色id_{}_导出时间_{}.xlsx - filename = f"账户id_{account_id}_角色id_{user_id}_导出时间_{date_str}.xlsx" - - output_path = os.path.join(OUTPUT_DIR, filename) - - # 导出单个角色的数据 - result = export_single_user(user_id, es_cfg, pg_conn, mysql_conn, output_path, id_2_unit_index, chapter_id_to_lesson_id) - if result: - success_count += 1 - else: - skip_count += 1 - - # 输出统计信息 - batch_total_time = (datetime.datetime.now() - batch_start_time).total_seconds() - print(f"\n{'='*60}") - print(f"[INFO] ===== 全部导出完成 =====") - print(f"[INFO] 总计: {len(user_id_list)}个角色") - print(f"[INFO] 成功: {success_count}个") - print(f"[INFO] 跳过: {skip_count}个") - print(f"[INFO] 总耗时: {batch_total_time:.2f}秒 ({batch_total_time/60:.2f}分钟)") - if success_count > 0: - print(f"[INFO] 平均每个角色: {batch_total_time/success_count:.2f}秒") - print(f"{'='*60}\n") - - finally: - if pg_conn: - try: - pg_conn.close() - except Exception: - pass - if mysql_conn: - try: - mysql_conn.close() - except Exception: - pass - - -if __name__ == "__main__": - main() diff --git a/new_export/export_only_12698.py b/new_export/export_only_12698.py deleted file mode 100644 index 60f36b5..0000000 --- a/new_export/export_only_12698.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python3 -"""单独测试角色12698的导出,查看具体报错""" - -import os -import json -import sys -import datetime -from typing import Any, Dict, List - -# 加载环境变量 -def load_env(): - env_path = os.path.join(os.getcwd(), ".env") - if os.path.exists(env_path): - with open(env_path, "r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line or line.startswith("#") or "=" not in line: - continue - k, v = line.split("=", 1) - os.environ[k.strip()] = v.strip().strip('"').strip("'") - -load_env() - -import psycopg2 -from psycopg2.extras import RealDictCursor -import pymysql -import requests -from requests.auth import HTTPBasicAuth -import warnings -warnings.filterwarnings('ignore') - -def test_role_12698(): - print("="*60) - print("单独测试角色ID=12698的查询") - print("="*60) - - # 连接PG - try: - conn = psycopg2.connect( - host=os.getenv("PG_DB_HOST"), - port=int(os.getenv("PG_DB_PORT")), - user=os.getenv("PG_DB_USER"), - password=os.getenv("PG_DB_PASSWORD"), - dbname=os.getenv("PG_DB_DATABASE"), - connect_timeout=10 - ) - print("✅ PG连接成功") - except Exception as e: - print(f"❌ PG连接失败: {e}") - return - - user_id = "12698" - - # 测试第一个查询:user_component_play_record_0 - print(f"\n测试查询表 user_component_play_record_0,user_id={user_id}") - try: - with conn.cursor(cursor_factory=RealDictCursor) as cur: - sql = f""" - SELECT user_id, component_unique_code, session_id, c_type, c_id, - play_result, user_behavior_info, updated_at - FROM user_component_play_record_0 - WHERE user_id = %s - ORDER BY updated_at DESC - """ - cur.execute(sql, (user_id,)) - rows = cur.fetchall() - print(f"✅ 查询成功,返回{len(rows)}条记录") - except Exception as e: - print(f"❌ 查询失败: {e}") - print(f"错误类型: {type(e).__name__}") - - # 回滚事务 - print("\n尝试回滚事务...") - try: - conn.rollback() - print("✅ 事务回滚成功") - except Exception as e2: - print(f"❌ 回滚失败: {e2}") - - # 测试查询课程巩固记录表 - print(f"\n测试查询表 user_unit_review_question_result,user_id={user_id}") - try: - with conn.cursor(cursor_factory=RealDictCursor) as cur: - sql = f""" - SELECT user_id, story_id, chapter_id, question_list, updated_at - FROM user_unit_review_question_result - WHERE user_id = %s - ORDER BY updated_at DESC - """ - cur.execute(sql, (user_id,)) - rows = cur.fetchall() - print(f"✅ 查询成功,返回{len(rows)}条记录") - except Exception as e: - print(f"❌ 查询失败: {e}") - print(f"错误类型: {type(e).__name__}") - - # 回滚事务 - print("\n尝试回滚事务...") - try: - conn.rollback() - print("✅ 事务回滚成功") - except Exception as e2: - print(f"❌ 回滚失败: {e2}") - - # 测试查询单元挑战记录表 - print(f"\n测试查询表 user_unit_challenge_question_result,user_id={user_id}") - try: - with conn.cursor(cursor_factory=RealDictCursor) as cur: - sql = f""" - SELECT user_id, story_id, category, score_text, question_list, updated_at - FROM user_unit_challenge_question_result - WHERE user_id = %s - ORDER BY updated_at DESC - """ - cur.execute(sql, (user_id,)) - rows = cur.fetchall() - print(f"✅ 查询成功,返回{len(rows)}条记录") - except Exception as e: - print(f"❌ 查询失败: {e}") - print(f"错误类型: {type(e).__name__}") - - # 测试查询单元总结记录表 - print(f"\n测试查询表 user_unit_summary_record,user_id={user_id}") - try: - with conn.cursor(cursor_factory=RealDictCursor) as cur: - sql = f""" - SELECT id, user_id, unit_id, updated_at, km_id, km_type, play_time_seconds - FROM user_unit_summary_record - WHERE user_id = %s - ORDER BY updated_at DESC - """ - cur.execute(sql, (user_id,)) - rows = cur.fetchall() - print(f"✅ 查询成功,返回{len(rows)}条记录") - except Exception as e: - print(f"❌ 查询失败: {e}") - print(f"错误类型: {type(e).__name__}") - import traceback - traceback.print_exc() - - conn.close() - -if __name__ == "__main__": - test_role_12698() diff --git a/new_export/export_user_id_data.py b/new_export/export_user_id_data.py deleted file mode 100644 index 478b2e0..0000000 --- a/new_export/export_user_id_data.py +++ /dev/null @@ -1,1846 +0,0 @@ -""" -初版需求v1.0: 2025.11.18 - -导出 一个userId的多表数据, 最终按照不同sheet,输出到一个 excel文件中。 - -1. 第一个sheet:"全部音频数据" -es相关配置通过以下环境变量 -ES_HOST=xxx -ES_PORT=9200 -ES_SCHEME=https -ES_USER=elastic -ES_PASSWORD=xxx - -index: user-audio - -脚本思路: -过滤字段: -userId == xxxx - -输出该userId的全部记录 按时间倒序排序 -包含以下字段内容: - -userId -userMsg -userName -soeData -audioUrl -asrStatus -componentId -componentType -dataVersion - -2. 第二个sheet:"互动组件学习记录" -在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 -数据库相关配置 从.env中读取: -PG_DB_HOST = xxx -PG_DB_PORT = xxx -PG_DB_USER = xxx -PG_DB_PASSWORD = xxx -PG_DB_DATABASE = xxx - -读取以下数据表: -user_component_play_record_0 ~ user_component_play_record_7 - -输出以下字段: -user_id, -component_unique_code, -session_id, -c_type, -c_id, -play_result, -user_behavior_info, -updated_at - -3.第三个sheet:"课程巩固记录" -在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 - -数据表:user_unit_review_question_result - -输出以下字段: -user_id -story_id -chapter_id -question_list -updated_at - -4.第四个sheet:"单元挑战记录" -在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 - -数据表:user_unit_challenge_question_result - -输出以下字段: -user_id -story_id -category -score_text, -question_list -updated_at ------------- - -需求补充v1.1: -"全部音频数据"这个sheet -输出字段 添加timeStr 并按时间倒序排列 最新的记录 在最上面 - ------------- -需求补充v1.2: -"全部音频数据"这个sheet -如果userMsg字段内容 包含 ”makee_id“ 要进行以下处理: - -从userMsg字段中提取出具体的makee_id: -此时的字段样例: -``` -asr msg信息为:{ - "time_ms": 358, - "time_ms_api": 357, - "hot_words_str": "{\n \"context_type\": \"dialog_ctx\",\n \"context_data\": [\n {\n \"text\": \"planet Walla\"\n },\n {\n \"text\": \"Walla\"\n }\n ]\n}", - "makee_id": "d208c617-902f-4f81-8255-b5fb73599546", - "volcano_fast_x_tt_logid": "202511151541355DF72BE5EBFE73795BFD", - "api_name": "volcano-fast" -} -``` -然后基于makee_id 去另一个表里查记录: index:llm_asr_log -将查询到的记录的 result_text 字段内容 回填到 userMsg。 -将source字段内容 输出 到 source。 - -如果userMsg字段内容 不包含 ”makee_id“ 保持之前的逻辑。 - --------------- -需求补充 v1.3 -当前输入 只支持配置单个 userId (业务侧名称为角色id) - - -期望扩展为以下逻辑: -1. 改为配置 角色id list , 分别 导出 多份excel文件。命名格式为 角色id_{}_导出时间_{}.xlsx -2. 改为配置 账户id list , 分别 导出 多份excel文件。命名格式为 账户id_{}_角色id_{}_导出时间_{}.xlsx - -关于 账户 id 到角色id 的映射逻辑, -首先 读取 mysql 表 vala_app_character -筛选 account_id字段值 == 账户id 的 记录, 其中 该记录 的 id值,则为角色id 一个 账户id 可以对应多个角色id - -本次需求只针对输入侧调整, 数据抽取聚合逻辑部分和之前保持一致 - ---------------- -需求补充 v1.4 - -增加一个sheet "单元总结记录", -导出对应角色id的单元总结记录。 参考 export_unit_summary.py 中的原始数据提取方案即可(不必关注其中的数据统计部分)。 - -其他已有逻辑保持不动哦。 - ----------------- -需求补充 v1.5 - -1."互动组件学习记录"sheet 增加以下字段 -"互动组件名称"、"组件标题"、"组件配置摘要"、"知识点": -字段取值规则: -根据 c_type 及组件配置(从mysql表获取) 进行映射和处理: -``` -1).如果 c_type 开头为"mid" - -则读取下表:表名:middle_interaction_component - -获取以下字段值: -title (作为组件标题) -component_config (完整的组件配置) 获取其中 的 question 字段值 作为 组件配置摘要; -kp_relation_info 字段值 作为 知识点 - -"互动组件名称"规则: - -"物品互动": "mid_vocab_item", -"图片互动": "mid_vocab_image", -"填词互动": "mid_vocab_fillBlank", -"指令互动": "mid_vocab_instruction" -"对话互动-表达": "mid_sentence_dialogue", 且 component_config->question->mode == "express" -"对话互动-朗读": "mid_sentence_dialogue", 且 component_config->question->mode == "read" -"语音互动": "mid_sentence_voice", -"材料互动": "mid_sentence_material", -"造句互动": "mid_sentence_makeSentence" -"挖空互动": "mid_grammar_cloze", -"组句互动": "mid_grammar_sentence" -"发音互动": "mid_pron_pron" - - -2). 如果 c_type 开头为"core" -则读取下表:表名:core_interaction_component - -获取以下字段值: -title (作为组件标题) -component_config (完整的组件配置) 获取其中 的 taskInfo 字段值 作为 组件配置摘要 -kp_relation_info 字段值 作为 知识点 - -"互动组件名称"规则: -"口语快答": "core_speaking_reply", -"口语妙问": "core_speaking_inquiry", -"口语探讨": "core_speaking_explore", -"口语独白": "core_speaking_monologue" -"合作阅读": "core_reading_order", -"合作听力": "core_listening_order", -"看图组句": "core_writing_imgMakeSentence", -"看图撰写": "core_writing_imgWrite", -"问题组句": "core_writing_questionMakeSentence", -"问题撰写": "core_writing_questionWrite", -``` - -2."课程巩固记录" sheet 增加以下字段 -"正确率": 参考 export_lesson_review.py 中的计算逻辑 - -3. 新增一个"汇总统计"sheet -统计并展示以下内容 请以 可读性 比较好的方式排列、展示 - -a. "所有互动-按互动组件类型-通过情况统计" -以每种"互动组件名称"进行聚合 -统计play_result的取值分布情况,算以下指标: -总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 - -b. "中互动组件-按知识点-通过情况统计" -以每个知识点进行聚合 - -其中 知识点配置格式如下: -``` -[{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_meaning","kpSkillName":"语义"}] -``` -一个组件可以绑定多个知识点,以每个知识点的 kpId + kpType + kpTitle 进行 展示及聚合 - -对所有绑定了某个知识点的中互动组件(c_type以mid开头) -统计play_result的取值分布情况,算以下指标: -总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 - -c. "单元总结-按单元统计时长" - -将"单元总结记录"中的"play_time_seconds"字段值 以每个单元id 进行聚合 进行 累加 统计,并增加一列 转换为分钟为单位 取整数 - - -""" -# ==== 可直接修改的脚本变量(不使用命令行传参) ==== -# 三种模式互斥,只能配置一个: -# 模式1:单个角色id -USER_ID = None # 单个角色ID,示例:2911 - -# 模式2:角色id列表(多个角色id批量导出) -USER_ID_LIST = None # 角色ID列表,示例:[2911, 2912, 2913] - -# 模式3:账户id列表(通过账户id查询对应的角色id后批量导出) -ACCOUNT_ID_LIST = [9343] # 账户ID列表,示例:[100, 101, 102] - -OUTPUT_DIR = "output/" # 输出目录,默认为output文件夹 -# ==== 变量结束 ==== -import os -import json -import re -from typing import Any, Dict, List, Optional - -import datetime - -try: - import requests -except Exception: - requests = None - -try: - import psycopg2 - from psycopg2.extras import RealDictCursor -except Exception: - psycopg2 = None - RealDictCursor = None - -try: - import pymysql - import pymysql.cursors -except Exception: - pymysql = None - -try: - import pandas as pd -except Exception: - pd = None - -try: - import urllib3 -except Exception: - urllib3 = None - - -SHEET1_COLUMNS = [ - "userId", - "userMsg", - "source", - "userName", - "soeData", - "audioUrl", - "asrStatus", - "componentId", - "componentType", - "dataVersion", - "timeStr", -] - -SHEET2_COLUMNS = [ - "user_id", - "component_unique_code", - "session_id", - "c_type", - "c_id", - "互动组件名称", - "组件标题", - "组件配置摘要", - "知识点", - "play_result", - "user_behavior_info", - "updated_at", -] - -SHEET3_COLUMNS = [ - "user_id", - "unit_id", - "lesson_id", - "question_list", - "正确率", - "updated_at", -] - -SHEET4_COLUMNS = [ - "user_id", - "unit_id", - "category", - "score_text", - "question_list", - "updated_at", -] - -SHEET5_COLUMNS = [ - "id", - "user_id", - "unit_id", - "updated_at", - "km_id", - "km_type", - "play_time_seconds", -] - - -def _load_env_file(path: str) -> None: - if not os.path.exists(path): - return - try: - with open(path, "r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line or line.startswith("#"): - continue - if "=" not in line: - continue - k, v = line.split("=", 1) - k = k.strip() - v = v.strip().strip('"').strip("'") - if k and (os.getenv(k) is None): - os.environ[k] = v - except Exception: - pass - - -def load_env() -> None: - _load_env_file(os.path.join(os.getcwd(), ".env")) - _load_env_file(os.path.join(os.getcwd(), ".env.local")) - - -def to_json_str(v: Any) -> Any: - if isinstance(v, (dict, list)): - try: - return json.dumps(v, ensure_ascii=False) - except Exception: - return str(v) - return v - - -def parse_time(value: Any) -> Optional[datetime.datetime]: - if value is None: - return None - if isinstance(value, (int, float)): - try: - v = float(value) - # 兼容毫秒级时间戳 - if v > 1e11: - v = v / 1000.0 - return datetime.datetime.fromtimestamp(v) - except Exception: - return None - if isinstance(value, str): - fmts = [ - "%Y-%m-%dT%H:%M:%S.%fZ", - "%Y-%m-%dT%H:%M:%S.%f%z", - "%Y-%m-%dT%H:%M:%S%z", - "%Y-%m-%d %H:%M:%S", - "%Y-%m-%d", - ] - for fmt in fmts: - try: - return datetime.datetime.strptime(value, fmt) - except Exception: - continue - try: - return datetime.datetime.fromisoformat(value) - except Exception: - return None - return None - - -def pick_time(source: Dict[str, Any]) -> Optional[datetime.datetime]: - candidates = [ - "updated_at", - "created_at", - "@timestamp", - "timestamp", - "updatedAt", - "createdAt", - "time", - "ts", - "timeStr", - "update_time", - "create_time", - ] - for key in candidates: - if key in source: - t = parse_time(source.get(key)) - if t is not None: - return t - # 宽松匹配:尝试扫描所有可能的时间相关字段 - for k, v in source.items(): - lk = str(k).lower() - if any(s in lk for s in ["time", "date", "_at", "timestamp"]): - t = parse_time(v) - if t is not None: - return t - return None - - -def extract_makee_id_from_user_msg(user_msg: Any) -> Optional[str]: - # 支持dict或字符串形式 - if isinstance(user_msg, dict): - mk = user_msg.get("makee_id") - if isinstance(mk, str) and mk: - return mk - if isinstance(user_msg, str) and user_msg: - # 1) 尝试整体解析为JSON - try: - obj = json.loads(user_msg) - mk = obj.get("makee_id") - if isinstance(mk, str) and mk: - return mk - except Exception: - pass - # 2) 尝试截取大括号中的JSON - try: - start = user_msg.find("{") - end = user_msg.rfind("}") - if start != -1 and end != -1 and end > start: - candidate = user_msg[start : end + 1] - obj = json.loads(candidate) - mk = obj.get("makee_id") - if isinstance(mk, str) and mk: - return mk - except Exception: - pass - # 3) 正则匹配 makee_id - m = re.search(r"\bmakee_id\b\s*:\s*\"([^\"]+)\"", user_msg) - if m: - return m.group(1) - return None - - -def fetch_es_asr_log(makee_id: str, es_cfg: Dict[str, Any]) -> Optional[Dict[str, Any]]: - if requests is None: - raise RuntimeError("缺少requests依赖,请安装后再运行。") - host = es_cfg.get("host") - port = es_cfg.get("port") - scheme = es_cfg.get("scheme", "http") - user = es_cfg.get("user") - password = es_cfg.get("password") - index = "llm_asr_log" - if not host: - return None - base = f"{scheme}://{host}:{port}" - url = f"{base}/{index}/_search" - headers = {"Content-Type": "application/json"} - body = { - "query": { - "bool": { - "should": [ - {"term": {"makee_id": {"value": str(makee_id)}}}, - {"term": {"makee_id.keyword": {"value": str(makee_id)}}}, - ], - "minimum_should_match": 1, - } - }, - "size": 10, - "_source": [ - "makee_id", - "result_text", - "source", - "updated_at", - "created_at", - "@timestamp", - "timestamp", - "updatedAt", - "createdAt", - "time", - "ts", - "timeStr", - "update_time", - "create_time", - ], - } - auth = (user, password) if user and password else None - try: - if scheme == "https" and urllib3 is not None: - try: - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - except Exception: - pass - resp = requests.post(url, headers=headers, json=body, auth=auth, timeout=20, verify=False if scheme == "https" else True) - resp.raise_for_status() - data = resp.json() - except Exception: - return None - hits = data.get("hits", {}).get("hits", []) - if not hits: - return None - # 选最新的 - chosen = None - best_t = None - for h in hits: - src = h.get("_source", {}) or {} - t = pick_time(src) - if t is None: - continue - if best_t is None or t > best_t: - best_t = t - chosen = src - if chosen is None: - # 如果都没有时间,选第一条 - chosen = (hits[0].get("_source", {}) or {}) - return chosen - - -def get_es_config() -> Dict[str, Any]: - return { - "host": os.getenv("ES_HOST"), - "port": os.getenv("ES_PORT", "9200"), - "scheme": os.getenv("ES_SCHEME", "http"), - "user": os.getenv("ES_USER"), - "password": os.getenv("ES_PASSWORD"), - "index": "user-audio", - } - - -def fetch_es_user_audio(user_id: str, es_cfg: Dict[str, Any]) -> List[Dict[str, Any]]: - if requests is None: - raise RuntimeError("缺少requests依赖,请安装后再运行。") - - print(f" [ES] 开始查询user-audio索引...") - start_time = datetime.datetime.now() - - host = es_cfg.get("host") - port = es_cfg.get("port") - scheme = es_cfg.get("scheme", "http") - user = es_cfg.get("user") - password = es_cfg.get("password") - index = es_cfg.get("index", "user-audio") - - if not host: - return [] - - base = f"{scheme}://{host}:{port}" - url = f"{base}/{index}/_search" - headers = {"Content-Type": "application/json"} - - body = { - "query": { - "bool": { - "should": [ - {"term": {"userId": {"value": str(user_id)}}}, - {"term": {"userId.keyword": {"value": str(user_id)}}}, - ], - "minimum_should_match": 1, - } - }, - "size": 10000, - "_source": [ - "userId", - "userMsg", - "userName", - "soeData", - "audioUrl", - "asrStatus", - "componentId", - "componentType", - "dataVersion", - "updated_at", - "created_at", - "@timestamp", - "timestamp", - "updatedAt", - "createdAt", - "time", - "ts", - "timeStr", - "update_time", - "create_time", - ], - } - - auth = (user, password) if user and password else None - - try: - # 抑制自签证书下的HTTPS不安全警告 - if scheme == "https" and urllib3 is not None: - try: - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - except Exception: - pass - resp = requests.post(url, headers=headers, json=body, auth=auth, timeout=30, verify=False if scheme == "https" else True) - resp.raise_for_status() - data = resp.json() - except Exception as e: - raise RuntimeError(f"ES查询失败: {e}") - - hits = data.get("hits", {}).get("hits", []) - print(f" [ES] 查询完成,获得{len(hits)}条记录,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - if not hits: - return [] - - print(f" [ES] 开始处理音频数据...") - process_start = datetime.datetime.now() - - rows: List[Dict[str, Any]] = [] - asr_cache: Dict[str, Dict[str, Any]] = {} - makee_id_count = 0 - - for idx, h in enumerate(hits, 1): - # 每处理100条显示一次进度 - if idx % 100 == 0 or idx == len(hits): - print(f" [ES] 处理进度: {idx}/{len(hits)} ({idx*100//len(hits)}%)") - - src = h.get("_source", {}) or {} - row = { - "userId": src.get("userId"), - "userMsg": src.get("userMsg"), - "source": None, - "userName": src.get("userName"), - "soeData": to_json_str(src.get("soeData")), - "audioUrl": src.get("audioUrl"), - "asrStatus": src.get("asrStatus"), - "componentId": src.get("componentId"), - "componentType": src.get("componentType"), - "dataVersion": src.get("dataVersion"), - } - t = pick_time(src) - row["_time"] = t.isoformat() if t else None - row["timeStr"] = t.strftime("%Y-%m-%d %H:%M:%S") if t else None - # v1.2: 当userMsg包含makee_id时,补充查询llm_asr_log并回填 - mk = extract_makee_id_from_user_msg(row.get("userMsg")) - if mk: - makee_id_count += 1 - asr_doc = asr_cache.get(mk) - if asr_doc is None: - asr_doc = fetch_es_asr_log(mk, es_cfg) - if asr_doc is not None: - asr_cache[mk] = asr_doc - if asr_doc is not None: - rt = asr_doc.get("result_text") - if rt: - row["userMsg"] = rt - row["source"] = to_json_str(asr_doc.get("source")) - rows.append(row) - - print(f" [ES] 数据处理完成,发现{makee_id_count}条包含makee_id的记录,耗时{(datetime.datetime.now() - process_start).total_seconds():.2f}秒") - - print(f" [ES] 开始排序...") - rows.sort(key=lambda x: parse_time(x.get("_time")) or datetime.datetime.min, reverse=True) - print(f" [ES] 音频数据处理完成,总耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - return rows - - -def get_pg_conn() -> Any: - if psycopg2 is None: - raise RuntimeError("缺少psycopg2依赖,请安装后再运行。") - host = os.getenv("PG_DB_HOST") - port = int(os.getenv("PG_DB_PORT", "5432")) - user = os.getenv("PG_DB_USER") - password = os.getenv("PG_DB_PASSWORD") - dbname = os.getenv("PG_DB_DATABASE") - if not host or not dbname: - raise RuntimeError("PG数据库环境变量未配置完整") - conn = psycopg2.connect(host=host, port=port, user=user, password=password, dbname=dbname) - return conn - - -def get_mysql_conn(database: str) -> Any: - """ - 获取MySQL数据库连接 - - Args: - database: 数据库名,可选值:'vala_user' 或 'vala_test' - vala_user 使用 online 配置(环境变量后缀 _online) - vala_test 使用默认配置 - - Returns: - MySQL连接对象 - """ - if pymysql is None: - raise RuntimeError("缺少pymysql依赖,请安装后再运行。") - - # 根据数据库选择不同的环境变量配置 - if database == "vala_user": - # vala_user 数据库使用 online 配置 - host = os.getenv("MYSQL_HOST_online") - port = int(os.getenv("MYSQL_PORT_online", "3306")) - user = os.getenv("MYSQL_USERNAME_online") - password = os.getenv("MYSQL_PASSWORD_online") - if not host: - raise RuntimeError("MySQL数据库环境变量未配置完整(缺少MYSQL_HOST_online)") - else: - # vala_test 等其他数据库使用默认配置 - host = os.getenv("MYSQL_HOST") - port = int(os.getenv("MYSQL_PORT", "3306")) - user = os.getenv("MYSQL_USERNAME") - password = os.getenv("MYSQL_PASSWORD") - if not host: - raise RuntimeError("MySQL数据库环境变量未配置完整(缺少MYSQL_HOST)") - - conn = pymysql.connect( - host=host, - port=port, - user=user, - password=password, - database=database, # 直接使用传入的数据库名 - charset="utf8mb4", - cursorclass=pymysql.cursors.DictCursor, - ) - return conn - - -def get_id_2_unit_index(conn: Any) -> Dict[int, int]: - """ - 从MySQL获取 story_id 到 unit_id 的映射关系 - - Args: - conn: MySQL数据库连接 - - Returns: - 映射字典 {story_id: unit_id} - """ - sql = """ - SELECT * - FROM `vala_game_info` - WHERE id > 0 - AND `vala_game_info`.`deleted_at` IS NULL - ORDER BY season_package_id asc, `index` asc - """ - try: - with conn.cursor() as cur: - cur.execute(sql) - rows = cur.fetchall() or [] - # 构建映射表:按查询结果的顺序,索引即为unit_id - id_2_unit_index = {} - for index, row in enumerate(rows): - id_2_unit_index[row["id"]] = index - return id_2_unit_index - except Exception as e: - print(f"[ERROR] 获取story_id到unit_id映射失败: {e}") - return {} - - -def get_chapter_id_to_lesson_id(conn: Any) -> Dict[int, int]: - """ - 从MySQL获取 chapter_id 到 lesson_id 的映射关系 - - Args: - conn: MySQL数据库连接 - - Returns: - 映射字典 {chapter_id: lesson_id} - """ - sql = """ - SELECT id, `index` - FROM `vala_game_chapter` - WHERE deleted_at IS NULL - """ - try: - with conn.cursor() as cur: - cur.execute(sql) - rows = cur.fetchall() or [] - # 构建映射表:chapter的index字段即为lesson_id - chapter_id_to_lesson_id = {} - for row in rows: - chapter_id_to_lesson_id[row["id"]] = row["index"] - return chapter_id_to_lesson_id - except Exception as e: - print(f"[ERROR] 获取chapter_id到lesson_id映射失败: {e}") - return {} - - -# 组件类型到组件名称的映射 -COMPONENT_TYPE_NAMES = { - "mid_vocab_item": "物品互动", - "mid_vocab_image": "图片互动", - "mid_vocab_fillBlank": "填词互动", - "mid_vocab_instruction": "指令互动", - "mid_sentence_dialogue": "对话互动", # 需要根据mode进一步判断 - "mid_sentence_voice": "语音互动", - "mid_sentence_material": "材料互动", - "mid_sentence_makeSentence": "造句互动", - "mid_grammar_cloze": "挖空互动", - "mid_grammar_sentence": "组句互动", - "mid_pron_pron": "发音互动", - "core_speaking_reply": "口语快答", - "core_speaking_inquiry": "口语妙问", - "core_speaking_explore": "口语探讨", - "core_speaking_monologue": "口语独白", - "core_reading_order": "合作阅读", - "core_listening_order": "合作听力", - "core_writing_imgMakeSentence": "看图组句", - "core_writing_imgWrite": "看图撰写", - "core_writing_questionMakeSentence": "问题组句", - "core_writing_questionWrite": "问题撰写", -} - - -def get_component_name(c_type: str, component_config: Optional[Dict[str, Any]]) -> str: - """ - 根据c_type和组件配置获取组件名称 - - Args: - c_type: 组件类型 - component_config: 组件配置(用于判断对话互动的mode) - - Returns: - 组件名称 - """ - if not c_type: - return "" - - # 特殊处理:对话互动需要根据mode判断 - if c_type == "mid_sentence_dialogue" and component_config: - try: - question = component_config.get("question", {}) - mode = question.get("mode", "") - if mode == "express": - return "对话互动-表达" - elif mode == "read": - return "对话互动-朗读" - except Exception: - pass - - return COMPONENT_TYPE_NAMES.get(c_type, "") - - -def batch_fetch_component_configs(play_records: List[Dict[str, Any]], mysql_conn: Any) -> Dict[str, Dict[str, Any]]: - """ - 批量查询组件配置信息 - - Args: - play_records: 播放记录列表 - mysql_conn: MySQL连接 - - Returns: - 组件配置映射 {c_type_c_id: {title, component_config, kp_relation_info}} - """ - print(f" [MySQL] 开始批量查询组件配置...") - start_time = datetime.datetime.now() - - # 收集需要查询的c_type和c_id - mid_c_ids = set() - core_c_ids = set() - mid_type_id_pairs = [] # 用于调试日志 - core_type_id_pairs = [] - - for record in play_records: - c_type = record.get("c_type", "") - c_id = record.get("c_id") - if c_type and c_id: - if c_type.startswith("mid"): - mid_c_ids.add(c_id) - mid_type_id_pairs.append((c_type, c_id)) - elif c_type.startswith("core"): - core_c_ids.add(c_id) - core_type_id_pairs.append((c_type, c_id)) - - print(f" [MySQL] 需要查询中互动组件: {len(mid_c_ids)}个, 核心互动组件: {len(core_c_ids)}个") - if mid_c_ids: - print(f" [MySQL] 中互动组件ID列表(前10个): {sorted(list(mid_c_ids))[:10]}") - if core_c_ids: - print(f" [MySQL] 核心互动组件ID列表(前10个): {sorted(list(core_c_ids))[:10]}") - - config_map = {} - - # 批量查询middle_interaction_component - if mid_c_ids: - try: - with mysql_conn.cursor() as cur: - placeholders = ','.join(['%s'] * len(mid_c_ids)) - sql = f""" - SELECT c_id, c_type, title, component_config, kp_relation_info - FROM middle_interaction_component - WHERE c_id IN ({placeholders}) AND deleted_at IS NULL - """ - print(f" [MySQL] 执行中互动组件查询,查询条件: c_id IN ({len(mid_c_ids)}个ID)") - cur.execute(sql, tuple(mid_c_ids)) - rows = cur.fetchall() or [] - print(f" [MySQL] 查询到{len(rows)}条中互动组件配置") - - if len(rows) == 0 and len(mid_c_ids) > 0: - print(f" [MySQL] [警告] 查询结果为空!可能的原因:") - print(f" [MySQL] - 数据库中没有匹配的c_id记录") - print(f" [MySQL] - deleted_at字段不为NULL") - print(f" [MySQL] - c_id不存在") - - for idx, row in enumerate(rows): - c_type = row.get("c_type", "") - c_id = row.get("c_id") - key = f"{c_type}_{c_id}" - - if idx < 3: # 输出前3条的详细信息 - print(f" [MySQL] [样例{idx+1}] id={c_id}, c_type={c_type}, key={key}") - print(f" [MySQL] [样例{idx+1}] title={row.get('title', '')[:50]}") - - # 解析component_config - component_config = row.get("component_config") - if isinstance(component_config, str): - try: - component_config = json.loads(component_config) - except Exception as e: - print(f" [MySQL] [警告] 解析component_config失败 (id={c_id}): {e}") - component_config = {} - - # 提取question字段作为摘要 - summary = "" - if isinstance(component_config, dict): - question = component_config.get("question") - summary = to_json_str(question) if question else "" - if idx < 3 and question: - print(f" [MySQL] [样例{idx+1}] 提取到question字段,长度: {len(summary)}") - - # 解析kp_relation_info - kp_relation_info = row.get("kp_relation_info") - if isinstance(kp_relation_info, str): - try: - kp_relation_info = json.loads(kp_relation_info) - except Exception: - kp_relation_info = [] - - config_map[key] = { - "title": row.get("title", ""), - "component_config": component_config, - "summary": summary, - "kp_relation_info": to_json_str(kp_relation_info), - } - - print(f" [MySQL] 中互动组件配置已加入config_map,当前map大小: {len(config_map)}") - except Exception as e: - print(f" [MySQL] [错误] 查询中互动组件配置失败: {e}") - import traceback - traceback.print_exc() - - # 批量查询core_interaction_component - if core_c_ids: - try: - with mysql_conn.cursor() as cur: - placeholders = ','.join(['%s'] * len(core_c_ids)) - sql = f""" - SELECT c_id, c_type, title, component_config, kp_relation_info - FROM core_interaction_component - WHERE c_id IN ({placeholders}) AND deleted_at IS NULL - """ - print(f" [MySQL] 执行核心互动组件查询,查询条件: c_id IN ({len(core_c_ids)}个ID)") - cur.execute(sql, tuple(core_c_ids)) - rows = cur.fetchall() or [] - print(f" [MySQL] 查询到{len(rows)}条核心互动组件配置") - - if len(rows) == 0 and len(core_c_ids) > 0: - print(f" [MySQL] [警告] 查询结果为空!可能的原因:") - print(f" [MySQL] - 数据库中没有匹配的c_id记录") - print(f" [MySQL] - deleted_at字段不为NULL") - print(f" [MySQL] - c_id不存在") - - for idx, row in enumerate(rows): - c_type = row.get("c_type", "") - c_id = row.get("c_id") - key = f"{c_type}_{c_id}" - - if idx < 3: # 输出前3条的详细信息 - print(f" [MySQL] [样例{idx+1}] id={c_id}, c_type={c_type}, key={key}") - print(f" [MySQL] [样例{idx+1}] title={row.get('title', '')[:50]}") - - # 解析component_config - component_config = row.get("component_config") - if isinstance(component_config, str): - try: - component_config = json.loads(component_config) - except Exception as e: - print(f" [MySQL] [警告] 解析component_config失败 (id={c_id}): {e}") - component_config = {} - - # 提取taskInfo字段作为摘要 - summary = "" - if isinstance(component_config, dict): - task_info = component_config.get("taskInfo") - summary = to_json_str(task_info) if task_info else "" - if idx < 3 and task_info: - print(f" [MySQL] [样例{idx+1}] 提取到taskInfo字段,长度: {len(summary)}") - - # 解析kp_relation_info - kp_relation_info = row.get("kp_relation_info") - if isinstance(kp_relation_info, str): - try: - kp_relation_info = json.loads(kp_relation_info) - except Exception: - kp_relation_info = [] - - config_map[key] = { - "title": row.get("title", ""), - "component_config": component_config, - "summary": summary, - "kp_relation_info": to_json_str(kp_relation_info), - } - - print(f" [MySQL] 核心互动组件配置已加入config_map,当前map大小: {len(config_map)}") - except Exception as e: - print(f" [MySQL] [错误] 查询核心互动组件配置失败: {e}") - import traceback - traceback.print_exc() - - print(f" [MySQL] 组件配置查询完成,共{len(config_map)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return config_map - - -def calculate_accuracy(question_list: Any) -> float: - """ - 计算问题列表的正确率 - - Args: - question_list: 问题列表(可能是JSON字符串或list) - - Returns: - 正确率(百分比,保留2位小数) - """ - try: - if isinstance(question_list, str): - question_list = json.loads(question_list) - - if not isinstance(question_list, list) or len(question_list) == 0: - return 0.0 - - total = len(question_list) - correct = sum(1 for q in question_list if q.get('isRight') == True) - accuracy = round(correct / total * 100, 2) if total > 0 else 0.0 - - return accuracy - except Exception: - return 0.0 - - - -def fetch_character_ids_by_account(account_id: str, conn: Any) -> List[str]: - """根据账户id查询对应的角色id列表""" - sql = "SELECT id FROM vala_app_character WHERE account_id = %s" - try: - with conn.cursor() as cur: - cur.execute(sql, (account_id,)) - rows = cur.fetchall() or [] - return [str(row["id"]) for row in rows if row.get("id")] - except Exception as e: - print(f"[ERROR] 查询账户id={account_id}的角色id失败: {e}") - return [] - - -def fetch_pg_play_records(user_id: str, conn: Any, mysql_conn: Any) -> List[Dict[str, Any]]: - """ - 查询互动组件学习记录并补充组件配置信息 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - mysql_conn: MySQL数据库连接 - - Returns: - 互动组件学习记录列表 - """ - print(f" [PG] 开始查询互动组件学习记录(8张分表)...") - start_time = datetime.datetime.now() - - tables = [f"user_component_play_record_{i}" for i in range(8)] - rows: List[Dict[str, Any]] = [] - with conn.cursor(cursor_factory=RealDictCursor) as cur: - for t in tables: - try: - cur.execute( - f""" - SELECT user_id, component_unique_code, session_id, c_type, c_id, - play_result, user_behavior_info, updated_at - FROM {t} - WHERE user_id = %s - ORDER BY updated_at DESC - """, - (user_id,), - ) - part = cur.fetchall() or [] - if part: - print(f" [PG] 表{t}查到{len(part)}条记录") - for r in part: - r = dict(r) - r["play_result"] = to_json_str(r.get("play_result")) - r["user_behavior_info"] = to_json_str(r.get("user_behavior_info")) - # 将带时区的时间转换为无时区,避免Excel写入报错 - upd = r.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - r["updated_at"] = upd.replace(tzinfo=None) - except Exception: - # 回退为字符串 - r["updated_at"] = str(upd) - rows.append(r) - except Exception as e: - print(f" [PG] 表{t}查询失败: {e}") - continue - - rows.sort(key=lambda x: parse_time(x.get("updated_at")) or datetime.datetime.min, reverse=True) - print(f" [PG] 互动组件学习记录查询完成,共{len(rows)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - # 批量查询组件配置 - if rows and mysql_conn: - config_map = batch_fetch_component_configs(rows, mysql_conn) - - # 补充组件信息 - print(f" [PG] 开始补充组件配置信息...") - filled_count = 0 - empty_count = 0 - sample_keys = [] - sample_mode_check = [] # 检查对话互动的mode - - for r in rows: - c_type = r.get("c_type", "") - c_id = r.get("c_id") - key = f"{c_type}_{c_id}" if c_type and c_id else "" - - config = config_map.get(key, {}) - component_config = config.get("component_config", {}) - - component_name = get_component_name(c_type, component_config) - r["互动组件名称"] = component_name - r["组件标题"] = config.get("title", "") - r["组件配置摘要"] = config.get("summary", "") - r["知识点"] = config.get("kp_relation_info", "") - - # 统计填充情况 - if config: - filled_count += 1 - if len(sample_keys) < 3: - sample_keys.append((key, component_name, r["组件标题"][:30] if r["组件标题"] else "")) - - # 检查对话互动的mode - if c_type == "mid_sentence_dialogue" and len(sample_mode_check) < 3: - mode = "" - if isinstance(component_config, dict): - question = component_config.get("question", {}) - if isinstance(question, dict): - mode = question.get("mode", "") - sample_mode_check.append({ - "key": key, - "mode": mode, - "component_name": component_name - }) - else: - empty_count += 1 - if empty_count <= 5: # 输出前5个未匹配的key - print(f" [PG] [警告] 未找到组件配置: key={key}") - - print(f" [PG] 组件配置信息补充完成") - print(f" [PG] 匹配到配置: {filled_count}条, 未匹配: {empty_count}条") - if sample_keys: - print(f" [PG] 样例数据(前3条):") - for key, name, title in sample_keys: - print(f" [PG] - key={key}, 名称={name}, 标题={title}") - - if sample_mode_check: - print(f" [PG] 对话互动mode检查(前3条):") - for s in sample_mode_check: - print(f" [PG] - key={s['key']}, mode={s['mode']}, 最终名称={s['component_name']}") - - return rows - - -def fetch_pg_unit_review(user_id: str, conn: Any, id_2_unit_index: Dict[int, int], chapter_id_to_lesson_id: Dict[int, int]) -> List[Dict[str, Any]]: - """ - 查询课程巩固记录 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - id_2_unit_index: story_id到unit_id的映射字典 - chapter_id_to_lesson_id: chapter_id到lesson_id的映射字典 - - Returns: - 课程巩固记录列表 - """ - print(f" [PG] 开始查询课程巩固记录...") - start_time = datetime.datetime.now() - - sql = ( - "SELECT user_id, story_id, chapter_id, question_list, updated_at " - "FROM user_unit_review_question_result WHERE user_id = %s ORDER BY updated_at DESC" - ) - with conn.cursor(cursor_factory=RealDictCursor) as cur: - try: - cur.execute(sql, (user_id,)) - rows = cur.fetchall() or [] - except Exception as e: - print(f" [PG] 课程巩固记录查询失败: {e}") - rows = [] - out: List[Dict[str, Any]] = [] - for r in rows: - d = dict(r) - - # 映射 story_id 到 unit_id - story_id = d.get("story_id") - unit_id = id_2_unit_index.get(story_id) if story_id else None - d["unit_id"] = unit_id - - # 映射 chapter_id 到 lesson_id - chapter_id = d.get("chapter_id") - lesson_id = chapter_id_to_lesson_id.get(chapter_id) if chapter_id else None - d["lesson_id"] = lesson_id - - # 计算正确率 - question_list = d.get("question_list") - d["正确率"] = calculate_accuracy(question_list) - - d["question_list"] = to_json_str(question_list) - upd = d.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - d["updated_at"] = upd.replace(tzinfo=None) - except Exception: - d["updated_at"] = str(upd) - out.append(d) - - print(f" [PG] 课程巩固记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return out - - -def fetch_pg_unit_challenge(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]: - """ - 查询单元挑战记录 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - id_2_unit_index: story_id到unit_id的映射字典 - - Returns: - 单元挑战记录列表 - """ - print(f" [PG] 开始查询单元挑战记录...") - start_time = datetime.datetime.now() - - sql = ( - "SELECT user_id, story_id, category, score_text, question_list, updated_at " - "FROM user_unit_challenge_question_result WHERE user_id = %s ORDER BY updated_at DESC" - ) - with conn.cursor(cursor_factory=RealDictCursor) as cur: - try: - cur.execute(sql, (user_id,)) - rows = cur.fetchall() or [] - except Exception as e: - print(f" [PG] 单元挑战记录查询失败: {e}") - rows = [] - out: List[Dict[str, Any]] = [] - for r in rows: - d = dict(r) - - # 映射 story_id 到 unit_id - story_id = d.get("story_id") - unit_id = id_2_unit_index.get(story_id) if story_id else None - d["unit_id"] = unit_id - - d["question_list"] = to_json_str(d.get("question_list")) - upd = d.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - d["updated_at"] = upd.replace(tzinfo=None) - except Exception: - d["updated_at"] = str(upd) - out.append(d) - - print(f" [PG] 单元挑战记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return out - - -def fetch_pg_unit_summary(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]: - """ - 查询单元总结知识点结果数据 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - id_2_unit_index: story_id到unit_id的映射字典 - - Returns: - 单元总结记录列表 - """ - print(f" [PG] 开始查询单元总结记录...") - start_time = datetime.datetime.now() - - sql = ( - "SELECT id, user_id, story_id, updated_at, km_id, km_type, play_time " - "FROM user_unit_summary_km_result WHERE user_id = %s AND deleted_at IS NULL ORDER BY updated_at DESC" - ) - with conn.cursor(cursor_factory=RealDictCursor) as cur: - try: - cur.execute(sql, (user_id,)) - rows = cur.fetchall() or [] - except Exception as e: - print(f" [PG] 单元总结记录查询失败: {e}") - rows = [] - - out: List[Dict[str, Any]] = [] - for r in rows: - d = dict(r) - # 映射 story_id 到 unit_id - story_id = d.get("story_id") - unit_id = id_2_unit_index.get(story_id) if story_id else None - d["unit_id"] = unit_id - - # 转换 play_time (毫秒) 为秒 (整数) - play_time = d.get("play_time") - d["play_time_seconds"] = play_time // 1000 if play_time else 0 - - # 移除时区信息 - upd = d.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - d["updated_at"] = upd.replace(tzinfo=None) - except Exception: - d["updated_at"] = str(upd) - out.append(d) - - print(f" [PG] 单元总结记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return out - - -def generate_statistics(sheet2_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]]) -> tuple: - """ - 生成汇总统计数据 - - Args: - sheet2_rows: 互动组件学习记录 - sheet5_rows: 单元总结记录 - - Returns: - (组件统计DataFrame, 知识点统计DataFrame, 单元时长统计DataFrame) - """ - if pd is None: - raise RuntimeError("缺少pandas依赖,请安装后再运行。") - - print(f" [统计] 开始生成汇总统计数据...") - start_time = datetime.datetime.now() - - from collections import defaultdict - - # ============ a. 所有互动-按互动组件类型-通过情况统计 ============ - component_stats_data = [] - component_stats = defaultdict(lambda: {"Perfect": 0, "Good": 0, "Failed": 0, "Pass": 0, "Oops": 0, "total": 0}) - - # 用于调试 - sample_results = [] - parse_error_count = 0 - - for idx, record in enumerate(sheet2_rows): - component_name = record.get("互动组件名称", "") - if not component_name: - continue - - play_result_str = record.get("play_result", "") - - # 解析play_result - result = "" - try: - # 先判断是否是简单的字符串(Perfect/Good/Failed/Pass/Oops) - if isinstance(play_result_str, str): - # 去除空格后检查 - stripped = play_result_str.strip() - if stripped in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - # 直接使用 - result = stripped - else: - # 尝试JSON解析 - try: - play_result = json.loads(play_result_str) - if isinstance(play_result, dict): - result = play_result.get("result", "") - else: - result = "" - except: - result = "" - else: - # 如果不是字符串,尝试当dict处理 - if isinstance(play_result_str, dict): - result = play_result_str.get("result", "") - else: - result = "" - - # 收集前3个样例 - if idx < 3: - sample_results.append({ - "component": component_name, - "raw": str(play_result_str)[:100], - "result": result - }) - except Exception as e: - parse_error_count += 1 - if parse_error_count <= 3: - print(f" [统计] [警告] 解析play_result失败 (第{idx+1}条): {e}, 原始值: {str(play_result_str)[:100]}") - result = "" - - component_stats[component_name]["total"] += 1 - if result in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - component_stats[component_name][result] += 1 - - print(f" [统计] play_result解析样例(前3条):") - for s in sample_results: - print(f" [统计] - 组件: {s['component']}, 结果: {s['result']}, 原始: {s['raw']}") - if parse_error_count > 0: - print(f" [统计] play_result解析失败总数: {parse_error_count}") - - # 生成统计数据行 - for component_name in sorted(component_stats.keys()): - stats = component_stats[component_name] - total = stats["total"] - perfect = stats["Perfect"] - good = stats["Good"] - failed = stats["Failed"] - pass_count = stats["Pass"] - oops = stats["Oops"] - - perfect_ratio = round(perfect / total * 100, 2) if total > 0 else 0 - good_ratio = round(good / total * 100, 2) if total > 0 else 0 - failed_ratio = round(failed / total * 100, 2) if total > 0 else 0 - pass_ratio = round(pass_count / total * 100, 2) if total > 0 else 0 - oops_ratio = round(oops / total * 100, 2) if total > 0 else 0 - - component_stats_data.append({ - "互动组件名称": component_name, - "总数量": total, - "Perfect数量": perfect, - "Good数量": good, - "Failed数量": failed, - "Pass数量": pass_count, - "Oops数量": oops, - "Perfect比例(%)": perfect_ratio, - "Good比例(%)": good_ratio, - "Failed比例(%)": failed_ratio, - "Pass比例(%)": pass_ratio, - "Oops比例(%)": oops_ratio, - }) - - # ============ b. 中互动组件-按知识点-通过情况统计 ============ - kp_stats_data = [] - kp_stats = defaultdict(lambda: {"Perfect": 0, "Good": 0, "Failed": 0, "Pass": 0, "Oops": 0, "total": 0}) - - # 调试信息 - mid_count = 0 - has_kp_count = 0 - sample_kp_records = [] - - for idx, record in enumerate(sheet2_rows): - c_type = record.get("c_type", "") - if not c_type or not c_type.startswith("mid"): - continue - - mid_count += 1 - kp_relation_info_str = record.get("知识点", "") - - if not kp_relation_info_str: - continue - - has_kp_count += 1 - - # 解析知识点 - try: - if isinstance(kp_relation_info_str, str): - kp_relation_info = json.loads(kp_relation_info_str) - else: - kp_relation_info = kp_relation_info_str - - if not isinstance(kp_relation_info, list): - continue - - # 收集样例 - if len(sample_kp_records) < 3: - sample_kp_records.append({ - "c_type": c_type, - "kp_count": len(kp_relation_info), - "kp_info": str(kp_relation_info)[:200] - }) - - # 解析play_result(使用相同的逻辑) - play_result_str = record.get("play_result", "") - result = "" - if isinstance(play_result_str, str): - stripped = play_result_str.strip() - if stripped in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - result = stripped - else: - try: - play_result = json.loads(play_result_str) - if isinstance(play_result, dict): - result = play_result.get("result", "") - except: - pass - elif isinstance(play_result_str, dict): - result = play_result_str.get("result", "") - - # 为每个知识点统计 - for kp in kp_relation_info: - if not isinstance(kp, dict): - continue - - kp_id = kp.get("kpId", "") - kp_type = kp.get("kpType", "") - kp_title = kp.get("kpTitle", "") - - if not kp_id: - continue - - kp_key = f"{kp_id}|{kp_type}|{kp_title}" - kp_stats[kp_key]["total"] += 1 - if result in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - kp_stats[kp_key][result] += 1 - - except Exception as e: - if len(sample_kp_records) < 5: - print(f" [统计] [警告] 解析知识点失败: {e}, 原始值: {str(kp_relation_info_str)[:100]}") - continue - - print(f" [统计] 中互动组件统计: 总数={mid_count}, 有知识点={has_kp_count}, 知识点条目数={len(kp_stats)}") - if sample_kp_records: - print(f" [统计] 知识点样例(前3条):") - for s in sample_kp_records: - print(f" [统计] - c_type={s['c_type']}, 知识点数量={s['kp_count']}, 内容={s['kp_info']}") - - # 生成知识点统计数据行 - for kp_key in sorted(kp_stats.keys()): - parts = kp_key.split("|") - if len(parts) != 3: - continue - - kp_id, kp_type, kp_title = parts - stats = kp_stats[kp_key] - total = stats["total"] - perfect = stats["Perfect"] - good = stats["Good"] - failed = stats["Failed"] - pass_count = stats["Pass"] - oops = stats["Oops"] - - perfect_ratio = round(perfect / total * 100, 2) if total > 0 else 0 - good_ratio = round(good / total * 100, 2) if total > 0 else 0 - failed_ratio = round(failed / total * 100, 2) if total > 0 else 0 - pass_ratio = round(pass_count / total * 100, 2) if total > 0 else 0 - oops_ratio = round(oops / total * 100, 2) if total > 0 else 0 - - kp_stats_data.append({ - "知识点ID": kp_id, - "知识点类型": kp_type, - "知识点标题": kp_title, - "总数量": total, - "Perfect数量": perfect, - "Good数量": good, - "Failed数量": failed, - "Pass数量": pass_count, - "Oops数量": oops, - "Perfect比例(%)": perfect_ratio, - "Good比例(%)": good_ratio, - "Failed比例(%)": failed_ratio, - "Pass比例(%)": pass_ratio, - "Oops比例(%)": oops_ratio, - }) - - # ============ c. 单元总结-按单元统计时长 ============ - unit_time_stats_data = [] - unit_time_stats = defaultdict(int) - - for record in sheet5_rows: - unit_id = record.get("unit_id") - play_time_seconds = record.get("play_time_seconds", 0) - - if unit_id is not None: - unit_time_stats[unit_id] += play_time_seconds - - # 生成单元时长统计数据行 - for unit_id in sorted(unit_time_stats.keys()): - total_seconds = unit_time_stats[unit_id] - total_minutes = int(total_seconds / 60) - - unit_time_stats_data.append({ - "单元ID": f"unit_{unit_id}", - "总时长(秒)": total_seconds, - "总时长(分钟)": total_minutes, - }) - - print(f" [统计] 汇总统计数据生成完成,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - print(f" [统计] 生成了{len(component_stats_data)}条组件统计, {len(kp_stats_data)}条知识点统计, {len(unit_time_stats_data)}条单元时长统计") - - return ( - pd.DataFrame(component_stats_data), - pd.DataFrame(kp_stats_data), - pd.DataFrame(unit_time_stats_data) - ) - - - -def write_excel(path: str, sheet1_rows: List[Dict[str, Any]], sheet2_rows: List[Dict[str, Any]], sheet3_rows: List[Dict[str, Any]], sheet4_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]], stats_component_df: Any, stats_kp_df: Any, stats_unit_time_df: Any) -> None: - if pd is None: - raise RuntimeError("缺少pandas依赖,请安装后再运行。") - - print(f" [Excel] 开始写入Excel文件: {path}") - start_time = datetime.datetime.now() - - out_dir = os.path.dirname(path) or "." - os.makedirs(out_dir, exist_ok=True) - with pd.ExcelWriter(path, engine="openpyxl") as writer: - pd.DataFrame(sheet1_rows, columns=SHEET1_COLUMNS).to_excel(writer, sheet_name="全部音频数据", index=False) - pd.DataFrame(sheet2_rows, columns=SHEET2_COLUMNS).to_excel(writer, sheet_name="互动组件学习记录", index=False) - pd.DataFrame(sheet3_rows, columns=SHEET3_COLUMNS).to_excel(writer, sheet_name="课程巩固记录", index=False) - pd.DataFrame(sheet4_rows, columns=SHEET4_COLUMNS).to_excel(writer, sheet_name="单元挑战记录", index=False) - pd.DataFrame(sheet5_rows, columns=SHEET5_COLUMNS).to_excel(writer, sheet_name="单元总结记录", index=False) - stats_component_df.to_excel(writer, sheet_name="统计-互动组件通过情况", index=False) - stats_kp_df.to_excel(writer, sheet_name="统计-知识点通过情况", index=False) - stats_unit_time_df.to_excel(writer, sheet_name="统计-单元总结时长", index=False) - - print(f" [Excel] 写入完成,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - -def get_date_str() -> str: - """获取当前日期字符串 格式:YYYYMMDD""" - return datetime.datetime.now().strftime("%Y%m%d") - - -def export_single_user(user_id: str, es_cfg: Dict[str, Any], pg_conn: Any, mysql_conn: Any, output_path: str, id_2_unit_index: Dict[int, int], chapter_id_to_lesson_id: Dict[int, int]) -> bool: - """ - 导出单个角色id的数据 - - Args: - user_id: 角色ID - es_cfg: ES配置 - pg_conn: PostgreSQL连接 - mysql_conn: MySQL连接 - output_path: 输出路径 - id_2_unit_index: story_id到unit_id的映射字典 - chapter_id_to_lesson_id: chapter_id到lesson_id的映射字典 - - Returns: - True表示成功,False表示失败 - """ - try: - print(f"\n[INFO] ========== 开始导出角色id={user_id} ==========") - total_start_time = datetime.datetime.now() - - # 查询ES数据 - sheet1_rows = fetch_es_user_audio(user_id, es_cfg) - - # 查询PG数据 - sheet2_rows = fetch_pg_play_records(user_id, pg_conn, mysql_conn) - sheet3_rows = fetch_pg_unit_review(user_id, pg_conn, id_2_unit_index, chapter_id_to_lesson_id) - sheet4_rows = fetch_pg_unit_challenge(user_id, pg_conn, id_2_unit_index) - sheet5_rows = fetch_pg_unit_summary(user_id, pg_conn, id_2_unit_index) - - # 检查是否有有效数据 - total_records = len(sheet1_rows) + len(sheet2_rows) + len(sheet3_rows) + len(sheet4_rows) + len(sheet5_rows) - print(f" [统计] 数据汇总:") - print(f" - 全部音频数据: {len(sheet1_rows)}条") - print(f" - 互动组件学习记录: {len(sheet2_rows)}条") - print(f" - 课程巩固记录: {len(sheet3_rows)}条") - print(f" - 单元挑战记录: {len(sheet4_rows)}条") - print(f" - 单元总结记录: {len(sheet5_rows)}条") - print(f" - 总计: {total_records}条") - - if total_records == 0: - print(f"[WARN] 角色id={user_id} 没有找到任何有效记录,跳过导出") - return False - - # 生成汇总统计数据 - stats_component_df, stats_kp_df, stats_unit_time_df = generate_statistics(sheet2_rows, sheet5_rows) - - # 写入Excel - write_excel(output_path, sheet1_rows, sheet2_rows, sheet3_rows, sheet4_rows, sheet5_rows, stats_component_df, stats_kp_df, stats_unit_time_df) - - total_time = (datetime.datetime.now() - total_start_time).total_seconds() - print(f"[INFO] 角色id={user_id} 导出成功") - print(f"[INFO] 文件路径: {output_path}") - print(f"[INFO] 总耗时: {total_time:.2f}秒") - print(f"[INFO] ========== 完成 ==========\n") - return True - - except Exception as e: - print(f"[ERROR] 角色id={user_id} 导出失败: {e}") - import traceback - traceback.print_exc() - return False - - -def main(): - load_env() - - # 确定运行模式并收集需要导出的角色id列表 - user_id_list: List[tuple] = [] # [(user_id, account_id or None), ...] - date_str = get_date_str() - - # 检查三种模式的配置 - has_user_id = USER_ID is not None - has_user_id_list = USER_ID_LIST is not None and len(USER_ID_LIST) > 0 - has_account_id_list = ACCOUNT_ID_LIST is not None and len(ACCOUNT_ID_LIST) > 0 - - # 验证只能配置一种模式 - mode_count = sum([has_user_id, has_user_id_list, has_account_id_list]) - if mode_count == 0: - raise RuntimeError("请配置 USER_ID、USER_ID_LIST 或 ACCOUNT_ID_LIST 中的一个") - if mode_count > 1: - raise RuntimeError("USER_ID、USER_ID_LIST、ACCOUNT_ID_LIST 只能配置一个,请检查配置") - - # 模式1:单个角色id - if has_user_id: - user_id_list = [(str(USER_ID), None)] - print(f"[INFO] 运行模式:单个角色id") - - # 模式2:角色id列表 - elif has_user_id_list: - user_id_list = [(str(uid), None) for uid in USER_ID_LIST] - print(f"[INFO] 运行模式:角色id列表,共{len(user_id_list)}个角色") - - # 模式3:账户id列表 - elif has_account_id_list: - print(f"[INFO] 运行模式:账户id列表,共{len(ACCOUNT_ID_LIST)}个账户") - mysql_conn = None - try: - mysql_conn = get_mysql_conn("vala_user") # 查询用户表,使用 vala_user 数据库 - for account_id in ACCOUNT_ID_LIST: - account_id_str = str(account_id) - print(f"[INFO] 查询账户id={account_id_str}对应的角色id...") - character_ids = fetch_character_ids_by_account(account_id_str, mysql_conn) - if not character_ids: - print(f"[WARN] 账户id={account_id_str} 未找到关联的角色id,跳过") - continue - print(f"[INFO] 账户id={account_id_str} 找到{len(character_ids)}个角色id: {character_ids}") - for cid in character_ids: - user_id_list.append((cid, account_id_str)) - finally: - if mysql_conn: - try: - mysql_conn.close() - except Exception: - pass - - if not user_id_list: - print("[WARN] 没有需要导出的角色id,程序退出") - return - - # 初始化连接 - es_cfg = get_es_config() - pg_conn = get_pg_conn() - - # 获取映射表(只需要查询一次,所有角色共用) - print(f"\n[INFO] ===== 准备工作:获取映射表 =====") - mysql_conn = None - id_2_unit_index = {} - chapter_id_to_lesson_id = {} - try: - print(f"[INFO] 正在连接MySQL数据库(vala_test)...") - mysql_conn = get_mysql_conn("vala_test") # 查询游戏配置表,使用 vala_test 数据库 - print(f"[INFO] 正在获取 story_id 到 unit_id 的映射...") - id_2_unit_index = get_id_2_unit_index(mysql_conn) - print(f"[INFO] 成功获取 {len(id_2_unit_index)} 个 story_id 映射") - print(f"[INFO] 正在获取 chapter_id 到 lesson_id 的映射...") - chapter_id_to_lesson_id = get_chapter_id_to_lesson_id(mysql_conn) - print(f"[INFO] 成功获取 {len(chapter_id_to_lesson_id)} 个 chapter_id 映射") - except Exception as e: - print(f"[ERROR] 获取映射表失败: {e}") - import traceback - traceback.print_exc() - if pg_conn: - try: - pg_conn.close() - except Exception: - pass - if mysql_conn: - try: - mysql_conn.close() - except Exception: - pass - return - - try: - # 统计信息 - success_count = 0 - skip_count = 0 - - print(f"\n[INFO] ===== 开始批量导出 =====") - print(f"[INFO] 共需导出{len(user_id_list)}个角色\n") - batch_start_time = datetime.datetime.now() - - # 循环处理每个角色id - for idx, (user_id, account_id) in enumerate(user_id_list, 1): - print(f"\n{'='*60}") - print(f"[INFO] 进度: {idx}/{len(user_id_list)} ({idx*100//len(user_id_list)}%)") - print(f"{'='*60}") - - # 生成输出文件名 - if account_id is None: - # 模式1和模式2:角色id_{}_导出时间_{}.xlsx - filename = f"角色id_{user_id}_导出时间_{date_str}.xlsx" - else: - # 模式3:账户id_{}_角色id_{}_导出时间_{}.xlsx - filename = f"账户id_{account_id}_角色id_{user_id}_导出时间_{date_str}.xlsx" - - output_path = os.path.join(OUTPUT_DIR, filename) - - # 导出单个角色的数据 - result = export_single_user(user_id, es_cfg, pg_conn, mysql_conn, output_path, id_2_unit_index, chapter_id_to_lesson_id) - if result: - success_count += 1 - else: - skip_count += 1 - - # 输出统计信息 - batch_total_time = (datetime.datetime.now() - batch_start_time).total_seconds() - print(f"\n{'='*60}") - print(f"[INFO] ===== 全部导出完成 =====") - print(f"[INFO] 总计: {len(user_id_list)}个角色") - print(f"[INFO] 成功: {success_count}个") - print(f"[INFO] 跳过: {skip_count}个") - print(f"[INFO] 总耗时: {batch_total_time:.2f}秒 ({batch_total_time/60:.2f}分钟)") - if success_count > 0: - print(f"[INFO] 平均每个角色: {batch_total_time/success_count:.2f}秒") - print(f"{'='*60}\n") - - finally: - if pg_conn: - try: - pg_conn.close() - except Exception: - pass - if mysql_conn: - try: - mysql_conn.close() - except Exception: - pass - - -if __name__ == "__main__": - main() diff --git a/new_export/export_user_id_data_debug.py b/new_export/export_user_id_data_debug.py deleted file mode 100644 index 4be3cb8..0000000 --- a/new_export/export_user_id_data_debug.py +++ /dev/null @@ -1,1845 +0,0 @@ -""" -初版需求v1.0: 2025.11.18 - -导出 一个userId的多表数据, 最终按照不同sheet,输出到一个 excel文件中。 - -1. 第一个sheet:"全部音频数据" -es相关配置通过以下环境变量 -ES_HOST=xxx -ES_PORT=9200 -ES_SCHEME=https -ES_USER=elastic -ES_PASSWORD=xxx - -index: user-audio - -脚本思路: -过滤字段: -userId == xxxx - -输出该userId的全部记录 按时间倒序排序 -包含以下字段内容: - -userId -userMsg -userName -soeData -audioUrl -asrStatus -componentId -componentType -dataVersion - -2. 第二个sheet:"互动组件学习记录" -在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 -数据库相关配置 从.env中读取: -PG_DB_HOST = xxx -PG_DB_PORT = xxx -PG_DB_USER = xxx -PG_DB_PASSWORD = xxx -PG_DB_DATABASE = xxx - -读取以下数据表: -user_component_play_record_0 ~ user_component_play_record_7 - -输出以下字段: -user_id, -component_unique_code, -session_id, -c_type, -c_id, -play_result, -user_behavior_info, -updated_at - -3.第三个sheet:"课程巩固记录" -在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 - -数据表:user_unit_review_question_result - -输出以下字段: -user_id -story_id -chapter_id -question_list -updated_at - -4.第四个sheet:"单元挑战记录" -在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 - -数据表:user_unit_challenge_question_result - -输出以下字段: -user_id -story_id -category -score_text, -question_list -updated_at ------------- - -需求补充v1.1: -"全部音频数据"这个sheet -输出字段 添加timeStr 并按时间倒序排列 最新的记录 在最上面 - ------------- -需求补充v1.2: -"全部音频数据"这个sheet -如果userMsg字段内容 包含 ”makee_id“ 要进行以下处理: - -从userMsg字段中提取出具体的makee_id: -此时的字段样例: -``` -asr msg信息为:{ - "time_ms": 358, - "time_ms_api": 357, - "hot_words_str": "{\n \"context_type\": \"dialog_ctx\",\n \"context_data\": [\n {\n \"text\": \"planet Walla\"\n },\n {\n \"text\": \"Walla\"\n }\n ]\n}", - "makee_id": "d208c617-902f-4f81-8255-b5fb73599546", - "volcano_fast_x_tt_logid": "202511151541355DF72BE5EBFE73795BFD", - "api_name": "volcano-fast" -} -``` -然后基于makee_id 去另一个表里查记录: index:llm_asr_log -将查询到的记录的 result_text 字段内容 回填到 userMsg。 -将source字段内容 输出 到 source。 - -如果userMsg字段内容 不包含 ”makee_id“ 保持之前的逻辑。 - --------------- -需求补充 v1.3 -当前输入 只支持配置单个 userId (业务侧名称为角色id) - - -期望扩展为以下逻辑: -1. 改为配置 角色id list , 分别 导出 多份excel文件。命名格式为 角色id_{}_导出时间_{}.xlsx -2. 改为配置 账户id list , 分别 导出 多份excel文件。命名格式为 账户id_{}_角色id_{}_导出时间_{}.xlsx - -关于 账户 id 到角色id 的映射逻辑, -首先 读取 mysql 表 vala_app_character -筛选 account_id字段值 == 账户id 的 记录, 其中 该记录 的 id值,则为角色id 一个 账户id 可以对应多个角色id - -本次需求只针对输入侧调整, 数据抽取聚合逻辑部分和之前保持一致 - ---------------- -需求补充 v1.4 - -增加一个sheet "单元总结记录", -导出对应角色id的单元总结记录。 参考 export_unit_summary.py 中的原始数据提取方案即可(不必关注其中的数据统计部分)。 - -其他已有逻辑保持不动哦。 - ----------------- -需求补充 v1.5 - -1."互动组件学习记录"sheet 增加以下字段 -"互动组件名称"、"组件标题"、"组件配置摘要"、"知识点": -字段取值规则: -根据 c_type 及组件配置(从mysql表获取) 进行映射和处理: -``` -1).如果 c_type 开头为"mid" - -则读取下表:表名:middle_interaction_component - -获取以下字段值: -title (作为组件标题) -component_config (完整的组件配置) 获取其中 的 question 字段值 作为 组件配置摘要; -kp_relation_info 字段值 作为 知识点 - -"互动组件名称"规则: - -"物品互动": "mid_vocab_item", -"图片互动": "mid_vocab_image", -"填词互动": "mid_vocab_fillBlank", -"指令互动": "mid_vocab_instruction" -"对话互动-表达": "mid_sentence_dialogue", 且 component_config->question->mode == "express" -"对话互动-朗读": "mid_sentence_dialogue", 且 component_config->question->mode == "read" -"语音互动": "mid_sentence_voice", -"材料互动": "mid_sentence_material", -"造句互动": "mid_sentence_makeSentence" -"挖空互动": "mid_grammar_cloze", -"组句互动": "mid_grammar_sentence" -"发音互动": "mid_pron_pron" - - -2). 如果 c_type 开头为"core" -则读取下表:表名:core_interaction_component - -获取以下字段值: -title (作为组件标题) -component_config (完整的组件配置) 获取其中 的 taskInfo 字段值 作为 组件配置摘要 -kp_relation_info 字段值 作为 知识点 - -"互动组件名称"规则: -"口语快答": "core_speaking_reply", -"口语妙问": "core_speaking_inquiry", -"口语探讨": "core_speaking_explore", -"口语独白": "core_speaking_monologue" -"合作阅读": "core_reading_order", -"合作听力": "core_listening_order", -"看图组句": "core_writing_imgMakeSentence", -"看图撰写": "core_writing_imgWrite", -"问题组句": "core_writing_questionMakeSentence", -"问题撰写": "core_writing_questionWrite", -``` - -2."课程巩固记录" sheet 增加以下字段 -"正确率": 参考 export_lesson_review.py 中的计算逻辑 - -3. 新增一个"汇总统计"sheet -统计并展示以下内容 请以 可读性 比较好的方式排列、展示 - -a. "所有互动-按互动组件类型-通过情况统计" -以每种"互动组件名称"进行聚合 -统计play_result的取值分布情况,算以下指标: -总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 - -b. "中互动组件-按知识点-通过情况统计" -以每个知识点进行聚合 - -其中 知识点配置格式如下: -``` -[{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_meaning","kpSkillName":"语义"}] -``` -一个组件可以绑定多个知识点,以每个知识点的 kpId + kpType + kpTitle 进行 展示及聚合 - -对所有绑定了某个知识点的中互动组件(c_type以mid开头) -统计play_result的取值分布情况,算以下指标: -总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 - -c. "单元总结-按单元统计时长" - -将"单元总结记录"中的"play_time_seconds"字段值 以每个单元id 进行聚合 进行 累加 统计,并增加一列 转换为分钟为单位 取整数 - - -""" -# ==== 可直接修改的脚本变量(不使用命令行传参) ==== -# 三种模式互斥,只能配置一个: -# 模式1:单个角色id -USER_ID = None # 单个角色ID,示例:2911 - -# 模式2:角色id列表(多个角色id批量导出) -USER_ID_LIST = None # 角色ID列表,示例:[2911, 2912, 2913] - -# 模式3:账户id列表(通过账户id查询对应的角色id后批量导出) -ACCOUNT_ID_LIST = [9343] # 账户ID列表,示例:[100, 101, 102] - -OUTPUT_DIR = "output/" # 输出目录,默认为output文件夹 -# ==== 变量结束 ==== -import os -import json -import re -from typing import Any, Dict, List, Optional - -import datetime - -try: - import requests -except Exception: - requests = None - -try: - import psycopg2 - from psycopg2.extras import RealDictCursor -except Exception: - psycopg2 = None - RealDictCursor = None - -try: - import pymysql - import pymysql.cursors -except Exception: - pymysql = None - -try: - import pandas as pd -except Exception: - pd = None - -try: - import urllib3 -except Exception: - urllib3 = None - - -SHEET1_COLUMNS = [ - "userId", - "userMsg", - "source", - "userName", - "soeData", - "audioUrl", - "asrStatus", - "componentId", - "componentType", - "dataVersion", - "timeStr", -] - -SHEET2_COLUMNS = [ - "user_id", - "component_unique_code", - "session_id", - "c_type", - "c_id", - "互动组件名称", - "组件标题", - "组件配置摘要", - "知识点", - "play_result", - "user_behavior_info", - "updated_at", -] - -SHEET3_COLUMNS = [ - "user_id", - "unit_id", - "lesson_id", - "question_list", - "正确率", - "updated_at", -] - -SHEET4_COLUMNS = [ - "user_id", - "unit_id", - "category", - "score_text", - "question_list", - "updated_at", -] - -SHEET5_COLUMNS = [ - "id", - "user_id", - "unit_id", - "updated_at", - "km_id", - "km_type", - "play_time_seconds", -] - - -def _load_env_file(path: str) -> None: - if not os.path.exists(path): - return - try: - with open(path, "r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line or line.startswith("#"): - continue - if "=" not in line: - continue - k, v = line.split("=", 1) - k = k.strip() - v = v.strip().strip('"').strip("'") - if k and (os.getenv(k) is None): - os.environ[k] = v - except Exception: - pass - - -def load_env() -> None: - _load_env_file(os.path.join(os.getcwd(), ".env")) - _load_env_file(os.path.join(os.getcwd(), ".env.local")) - - -def to_json_str(v: Any) -> Any: - if isinstance(v, (dict, list)): - try: - return json.dumps(v, ensure_ascii=False) - except Exception: - return str(v) - return v - - -def parse_time(value: Any) -> Optional[datetime.datetime]: - if value is None: - return None - if isinstance(value, (int, float)): - try: - v = float(value) - # 兼容毫秒级时间戳 - if v > 1e11: - v = v / 1000.0 - return datetime.datetime.fromtimestamp(v) - except Exception: - return None - if isinstance(value, str): - fmts = [ - "%Y-%m-%dT%H:%M:%S.%fZ", - "%Y-%m-%dT%H:%M:%S.%f%z", - "%Y-%m-%dT%H:%M:%S%z", - "%Y-%m-%d %H:%M:%S", - "%Y-%m-%d", - ] - for fmt in fmts: - try: - return datetime.datetime.strptime(value, fmt) - except Exception: - continue - try: - return datetime.datetime.fromisoformat(value) - except Exception: - return None - return None - - -def pick_time(source: Dict[str, Any]) -> Optional[datetime.datetime]: - candidates = [ - "updated_at", - "created_at", - "@timestamp", - "timestamp", - "updatedAt", - "createdAt", - "time", - "ts", - "timeStr", - "update_time", - "create_time", - ] - for key in candidates: - if key in source: - t = parse_time(source.get(key)) - if t is not None: - return t - # 宽松匹配:尝试扫描所有可能的时间相关字段 - for k, v in source.items(): - lk = str(k).lower() - if any(s in lk for s in ["time", "date", "_at", "timestamp"]): - t = parse_time(v) - if t is not None: - return t - return None - - -def extract_makee_id_from_user_msg(user_msg: Any) -> Optional[str]: - # 支持dict或字符串形式 - if isinstance(user_msg, dict): - mk = user_msg.get("makee_id") - if isinstance(mk, str) and mk: - return mk - if isinstance(user_msg, str) and user_msg: - # 1) 尝试整体解析为JSON - try: - obj = json.loads(user_msg) - mk = obj.get("makee_id") - if isinstance(mk, str) and mk: - return mk - except Exception: - pass - # 2) 尝试截取大括号中的JSON - try: - start = user_msg.find("{") - end = user_msg.rfind("}") - if start != -1 and end != -1 and end > start: - candidate = user_msg[start : end + 1] - obj = json.loads(candidate) - mk = obj.get("makee_id") - if isinstance(mk, str) and mk: - return mk - except Exception: - pass - # 3) 正则匹配 makee_id - m = re.search(r"\bmakee_id\b\s*:\s*\"([^\"]+)\"", user_msg) - if m: - return m.group(1) - return None - - -def fetch_es_asr_log(makee_id: str, es_cfg: Dict[str, Any]) -> Optional[Dict[str, Any]]: - if requests is None: - raise RuntimeError("缺少requests依赖,请安装后再运行。") - host = es_cfg.get("host") - port = es_cfg.get("port") - scheme = es_cfg.get("scheme", "http") - user = es_cfg.get("user") - password = es_cfg.get("password") - index = "llm_asr_log" - if not host: - return None - base = f"{scheme}://{host}:{port}" - url = f"{base}/{index}/_search" - headers = {"Content-Type": "application/json"} - body = { - "query": { - "bool": { - "should": [ - {"term": {"makee_id": {"value": str(makee_id)}}}, - {"term": {"makee_id.keyword": {"value": str(makee_id)}}}, - ], - "minimum_should_match": 1, - } - }, - "size": 10, - "_source": [ - "makee_id", - "result_text", - "source", - "updated_at", - "created_at", - "@timestamp", - "timestamp", - "updatedAt", - "createdAt", - "time", - "ts", - "timeStr", - "update_time", - "create_time", - ], - } - auth = (user, password) if user and password else None - try: - if scheme == "https" and urllib3 is not None: - try: - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - except Exception: - pass - resp = requests.post(url, headers=headers, json=body, auth=auth, timeout=20, verify=False if scheme == "https" else True) - resp.raise_for_status() - data = resp.json() - except Exception: - return None - hits = data.get("hits", {}).get("hits", []) - if not hits: - return None - # 选最新的 - chosen = None - best_t = None - for h in hits: - src = h.get("_source", {}) or {} - t = pick_time(src) - if t is None: - continue - if best_t is None or t > best_t: - best_t = t - chosen = src - if chosen is None: - # 如果都没有时间,选第一条 - chosen = (hits[0].get("_source", {}) or {}) - return chosen - - -def get_es_config() -> Dict[str, Any]: - return { - "host": os.getenv("ES_HOST"), - "port": os.getenv("ES_PORT", "9200"), - "scheme": os.getenv("ES_SCHEME", "http"), - "user": os.getenv("ES_USER"), - "password": os.getenv("ES_PASSWORD"), - "index": "user-audio", - } - - -def fetch_es_user_audio(user_id: str, es_cfg: Dict[str, Any]) -> List[Dict[str, Any]]: - if requests is None: - raise RuntimeError("缺少requests依赖,请安装后再运行。") - - print(f" [ES] 开始查询user-audio索引...") - start_time = datetime.datetime.now() - - host = es_cfg.get("host") - port = es_cfg.get("port") - scheme = es_cfg.get("scheme", "http") - user = es_cfg.get("user") - password = es_cfg.get("password") - index = es_cfg.get("index", "user-audio") - - if not host: - return [] - - base = f"{scheme}://{host}:{port}" - url = f"{base}/{index}/_search" - headers = {"Content-Type": "application/json"} - - body = { - "query": { - "bool": { - "should": [ - {"term": {"userId": {"value": str(user_id)}}}, - {"term": {"userId.keyword": {"value": str(user_id)}}}, - ], - "minimum_should_match": 1, - } - }, - "size": 10000, - "_source": [ - "userId", - "userMsg", - "userName", - "soeData", - "audioUrl", - "asrStatus", - "componentId", - "componentType", - "dataVersion", - "updated_at", - "created_at", - "@timestamp", - "timestamp", - "updatedAt", - "createdAt", - "time", - "ts", - "timeStr", - "update_time", - "create_time", - ], - } - - auth = (user, password) if user and password else None - - try: - # 抑制自签证书下的HTTPS不安全警告 - if scheme == "https" and urllib3 is not None: - try: - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - except Exception: - pass - resp = requests.post(url, headers=headers, json=body, auth=auth, timeout=30, verify=False if scheme == "https" else True) - resp.raise_for_status() - data = resp.json() - except Exception as e: - raise RuntimeError(f"ES查询失败: {e}") - - hits = data.get("hits", {}).get("hits", []) - print(f" [ES] 查询完成,获得{len(hits)}条记录,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - if not hits: - return [] - - print(f" [ES] 开始处理音频数据...") - process_start = datetime.datetime.now() - - rows: List[Dict[str, Any]] = [] - asr_cache: Dict[str, Dict[str, Any]] = {} - makee_id_count = 0 - - for idx, h in enumerate(hits, 1): - # 每处理100条显示一次进度 - if idx % 100 == 0 or idx == len(hits): - print(f" [ES] 处理进度: {idx}/{len(hits)} ({idx*100//len(hits)}%)") - - src = h.get("_source", {}) or {} - row = { - "userId": src.get("userId"), - "userMsg": src.get("userMsg"), - "source": None, - "userName": src.get("userName"), - "soeData": to_json_str(src.get("soeData")), - "audioUrl": src.get("audioUrl"), - "asrStatus": src.get("asrStatus"), - "componentId": src.get("componentId"), - "componentType": src.get("componentType"), - "dataVersion": src.get("dataVersion"), - } - t = pick_time(src) - row["_time"] = t.isoformat() if t else None - row["timeStr"] = t.strftime("%Y-%m-%d %H:%M:%S") if t else None - # v1.2: 当userMsg包含makee_id时,补充查询llm_asr_log并回填 - mk = extract_makee_id_from_user_msg(row.get("userMsg")) - if mk: - makee_id_count += 1 - asr_doc = asr_cache.get(mk) - if asr_doc is None: - asr_doc = fetch_es_asr_log(mk, es_cfg) - if asr_doc is not None: - asr_cache[mk] = asr_doc - if asr_doc is not None: - rt = asr_doc.get("result_text") - if rt: - row["userMsg"] = rt - row["source"] = to_json_str(asr_doc.get("source")) - rows.append(row) - - print(f" [ES] 数据处理完成,发现{makee_id_count}条包含makee_id的记录,耗时{(datetime.datetime.now() - process_start).total_seconds():.2f}秒") - - print(f" [ES] 开始排序...") - rows.sort(key=lambda x: parse_time(x.get("_time")) or datetime.datetime.min, reverse=True) - print(f" [ES] 音频数据处理完成,总耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - return rows - - -def get_pg_conn() -> Any: - if psycopg2 is None: - raise RuntimeError("缺少psycopg2依赖,请安装后再运行。") - host = os.getenv("PG_DB_HOST") - port = int(os.getenv("PG_DB_PORT", "5432")) - user = os.getenv("PG_DB_USER") - password = os.getenv("PG_DB_PASSWORD") - dbname = os.getenv("PG_DB_DATABASE") - if not host or not dbname: - raise RuntimeError("PG数据库环境变量未配置完整") - conn = psycopg2.connect(host=host, port=port, user=user, password=password, dbname=dbname) - return conn - - -def get_mysql_conn(database: str) -> Any: - """ - 获取MySQL数据库连接 - - Args: - database: 数据库名,可选值:'vala_user' 或 'vala_test' - vala_user 使用 online 配置(环境变量后缀 _online) - vala_test 使用默认配置 - - Returns: - MySQL连接对象 - """ - if pymysql is None: - raise RuntimeError("缺少pymysql依赖,请安装后再运行。") - - # 根据数据库选择不同的环境变量配置 - if database == "vala_user": - # vala_user 数据库使用 online 配置 - host = os.getenv("MYSQL_HOST_online") - port = int(os.getenv("MYSQL_PORT_online", "3306")) - user = os.getenv("MYSQL_USERNAME_online") - password = os.getenv("MYSQL_PASSWORD_online") - if not host: - raise RuntimeError("MySQL数据库环境变量未配置完整(缺少MYSQL_HOST_online)") - else: - # vala_test 等其他数据库使用默认配置 - host = os.getenv("MYSQL_HOST") - port = int(os.getenv("MYSQL_PORT", "3306")) - user = os.getenv("MYSQL_USERNAME") - password = os.getenv("MYSQL_PASSWORD") - if not host: - raise RuntimeError("MySQL数据库环境变量未配置完整(缺少MYSQL_HOST)") - - conn = pymysql.connect( - host=host, - port=port, - user=user, - password=password, - database=database, # 直接使用传入的数据库名 - charset="utf8mb4", - cursorclass=pymysql.cursors.DictCursor, - ) - return conn - - -def get_id_2_unit_index(conn: Any) -> Dict[int, int]: - """ - 从MySQL获取 story_id 到 unit_id 的映射关系 - - Args: - conn: MySQL数据库连接 - - Returns: - 映射字典 {story_id: unit_id} - """ - sql = """ - SELECT * - FROM `vala_game_info` - WHERE id > 0 - AND `vala_game_info`.`deleted_at` IS NULL - ORDER BY season_package_id asc, `index` asc - """ - try: - with conn.cursor() as cur: - cur.execute(sql) - rows = cur.fetchall() or [] - # 构建映射表:按查询结果的顺序,索引即为unit_id - id_2_unit_index = {} - for index, row in enumerate(rows): - id_2_unit_index[row["id"]] = index - return id_2_unit_index - except Exception as e: - print(f"[ERROR] 获取story_id到unit_id映射失败: {e}") - return {} - - -def get_chapter_id_to_lesson_id(conn: Any) -> Dict[int, int]: - """ - 从MySQL获取 chapter_id 到 lesson_id 的映射关系 - - Args: - conn: MySQL数据库连接 - - Returns: - 映射字典 {chapter_id: lesson_id} - """ - sql = """ - SELECT id, `index` - FROM `vala_game_chapter` - WHERE deleted_at IS NULL - """ - try: - with conn.cursor() as cur: - cur.execute(sql) - rows = cur.fetchall() or [] - # 构建映射表:chapter的index字段即为lesson_id - chapter_id_to_lesson_id = {} - for row in rows: - chapter_id_to_lesson_id[row["id"]] = row["index"] - return chapter_id_to_lesson_id - except Exception as e: - print(f"[ERROR] 获取chapter_id到lesson_id映射失败: {e}") - return {} - - -# 组件类型到组件名称的映射 -COMPONENT_TYPE_NAMES = { - "mid_vocab_item": "物品互动", - "mid_vocab_image": "图片互动", - "mid_vocab_fillBlank": "填词互动", - "mid_vocab_instruction": "指令互动", - "mid_sentence_dialogue": "对话互动", # 需要根据mode进一步判断 - "mid_sentence_voice": "语音互动", - "mid_sentence_material": "材料互动", - "mid_sentence_makeSentence": "造句互动", - "mid_grammar_cloze": "挖空互动", - "mid_grammar_sentence": "组句互动", - "mid_pron_pron": "发音互动", - "core_speaking_reply": "口语快答", - "core_speaking_inquiry": "口语妙问", - "core_speaking_explore": "口语探讨", - "core_speaking_monologue": "口语独白", - "core_reading_order": "合作阅读", - "core_listening_order": "合作听力", - "core_writing_imgMakeSentence": "看图组句", - "core_writing_imgWrite": "看图撰写", - "core_writing_questionMakeSentence": "问题组句", - "core_writing_questionWrite": "问题撰写", -} - - -def get_component_name(c_type: str, component_config: Optional[Dict[str, Any]]) -> str: - """ - 根据c_type和组件配置获取组件名称 - - Args: - c_type: 组件类型 - component_config: 组件配置(用于判断对话互动的mode) - - Returns: - 组件名称 - """ - if not c_type: - return "" - - # 特殊处理:对话互动需要根据mode判断 - if c_type == "mid_sentence_dialogue" and component_config: - try: - question = component_config.get("question", {}) - mode = question.get("mode", "") - if mode == "express": - return "对话互动-表达" - elif mode == "read": - return "对话互动-朗读" - except Exception: - pass - - return COMPONENT_TYPE_NAMES.get(c_type, "") - - -def batch_fetch_component_configs(play_records: List[Dict[str, Any]], mysql_conn: Any) -> Dict[str, Dict[str, Any]]: - """ - 批量查询组件配置信息 - - Args: - play_records: 播放记录列表 - mysql_conn: MySQL连接 - - Returns: - 组件配置映射 {c_type_c_id: {title, component_config, kp_relation_info}} - """ - print(f" [MySQL] 开始批量查询组件配置...") - start_time = datetime.datetime.now() - - # 收集需要查询的c_type和c_id - mid_c_ids = set() - core_c_ids = set() - mid_type_id_pairs = [] # 用于调试日志 - core_type_id_pairs = [] - - for record in play_records: - c_type = record.get("c_type", "") - c_id = record.get("c_id") - if c_type and c_id: - if c_type.startswith("mid"): - mid_c_ids.add(c_id) - mid_type_id_pairs.append((c_type, c_id)) - elif c_type.startswith("core"): - core_c_ids.add(c_id) - core_type_id_pairs.append((c_type, c_id)) - - print(f" [MySQL] 需要查询中互动组件: {len(mid_c_ids)}个, 核心互动组件: {len(core_c_ids)}个") - if mid_c_ids: - print(f" [MySQL] 中互动组件ID列表(前10个): {sorted(list(mid_c_ids))[:10]}") - if core_c_ids: - print(f" [MySQL] 核心互动组件ID列表(前10个): {sorted(list(core_c_ids))[:10]}") - - config_map = {} - - # 批量查询middle_interaction_component - if mid_c_ids: - try: - with mysql_conn.cursor() as cur: - placeholders = ','.join(['%s'] * len(mid_c_ids)) - sql = f""" - SELECT c_id, c_type, title, component_config, kp_relation_info - FROM middle_interaction_component - WHERE c_id IN ({placeholders}) AND deleted_at IS NULL - """ - print(f" [MySQL] 执行中互动组件查询,查询条件: c_id IN ({len(mid_c_ids)}个ID)") - cur.execute(sql, tuple(mid_c_ids)) - rows = cur.fetchall() or [] - print(f" [MySQL] 查询到{len(rows)}条中互动组件配置") - - if len(rows) == 0 and len(mid_c_ids) > 0: - print(f" [MySQL] [警告] 查询结果为空!可能的原因:") - print(f" [MySQL] - 数据库中没有匹配的c_id记录") - print(f" [MySQL] - deleted_at字段不为NULL") - print(f" [MySQL] - c_id不存在") - - for idx, row in enumerate(rows): - c_type = row.get("c_type", "") - c_id = row.get("c_id") - key = f"{c_type}_{c_id}" - - if idx < 3: # 输出前3条的详细信息 - print(f" [MySQL] [样例{idx+1}] id={c_id}, c_type={c_type}, key={key}") - print(f" [MySQL] [样例{idx+1}] title={row.get('title', '')[:50]}") - - # 解析component_config - component_config = row.get("component_config") - if isinstance(component_config, str): - try: - component_config = json.loads(component_config) - except Exception as e: - print(f" [MySQL] [警告] 解析component_config失败 (id={c_id}): {e}") - component_config = {} - - # 提取question字段作为摘要 - summary = "" - if isinstance(component_config, dict): - question = component_config.get("question") - summary = to_json_str(question) if question else "" - if idx < 3 and question: - print(f" [MySQL] [样例{idx+1}] 提取到question字段,长度: {len(summary)}") - - # 解析kp_relation_info - kp_relation_info = row.get("kp_relation_info") - if isinstance(kp_relation_info, str): - try: - kp_relation_info = json.loads(kp_relation_info) - except Exception: - kp_relation_info = [] - - config_map[key] = { - "title": row.get("title", ""), - "component_config": component_config, - "summary": summary, - "kp_relation_info": to_json_str(kp_relation_info), - } - - print(f" [MySQL] 中互动组件配置已加入config_map,当前map大小: {len(config_map)}") - except Exception as e: - print(f" [MySQL] [错误] 查询中互动组件配置失败: {e}") - import traceback - traceback.print_exc() - - # 批量查询core_interaction_component - if core_c_ids: - try: - with mysql_conn.cursor() as cur: - placeholders = ','.join(['%s'] * len(core_c_ids)) - sql = f""" - SELECT c_id, c_type, title, component_config, kp_relation_info - FROM core_interaction_component - WHERE c_id IN ({placeholders}) AND deleted_at IS NULL - """ - print(f" [MySQL] 执行核心互动组件查询,查询条件: c_id IN ({len(core_c_ids)}个ID)") - cur.execute(sql, tuple(core_c_ids)) - rows = cur.fetchall() or [] - print(f" [MySQL] 查询到{len(rows)}条核心互动组件配置") - - if len(rows) == 0 and len(core_c_ids) > 0: - print(f" [MySQL] [警告] 查询结果为空!可能的原因:") - print(f" [MySQL] - 数据库中没有匹配的c_id记录") - print(f" [MySQL] - deleted_at字段不为NULL") - print(f" [MySQL] - c_id不存在") - - for idx, row in enumerate(rows): - c_type = row.get("c_type", "") - c_id = row.get("c_id") - key = f"{c_type}_{c_id}" - - if idx < 3: # 输出前3条的详细信息 - print(f" [MySQL] [样例{idx+1}] id={c_id}, c_type={c_type}, key={key}") - print(f" [MySQL] [样例{idx+1}] title={row.get('title', '')[:50]}") - - # 解析component_config - component_config = row.get("component_config") - if isinstance(component_config, str): - try: - component_config = json.loads(component_config) - except Exception as e: - print(f" [MySQL] [警告] 解析component_config失败 (id={c_id}): {e}") - component_config = {} - - # 提取taskInfo字段作为摘要 - summary = "" - if isinstance(component_config, dict): - task_info = component_config.get("taskInfo") - summary = to_json_str(task_info) if task_info else "" - if idx < 3 and task_info: - print(f" [MySQL] [样例{idx+1}] 提取到taskInfo字段,长度: {len(summary)}") - - # 解析kp_relation_info - kp_relation_info = row.get("kp_relation_info") - if isinstance(kp_relation_info, str): - try: - kp_relation_info = json.loads(kp_relation_info) - except Exception: - kp_relation_info = [] - - config_map[key] = { - "title": row.get("title", ""), - "component_config": component_config, - "summary": summary, - "kp_relation_info": to_json_str(kp_relation_info), - } - - print(f" [MySQL] 核心互动组件配置已加入config_map,当前map大小: {len(config_map)}") - except Exception as e: - print(f" [MySQL] [错误] 查询核心互动组件配置失败: {e}") - import traceback - traceback.print_exc() - - print(f" [MySQL] 组件配置查询完成,共{len(config_map)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return config_map - - -def calculate_accuracy(question_list: Any) -> float: - """ - 计算问题列表的正确率 - - Args: - question_list: 问题列表(可能是JSON字符串或list) - - Returns: - 正确率(百分比,保留2位小数) - """ - try: - if isinstance(question_list, str): - question_list = json.loads(question_list) - - if not isinstance(question_list, list) or len(question_list) == 0: - return 0.0 - - total = len(question_list) - correct = sum(1 for q in question_list if q.get('isRight') == True) - accuracy = round(correct / total * 100, 2) if total > 0 else 0.0 - - return accuracy - except Exception: - return 0.0 - - - -def fetch_character_ids_by_account(account_id: str, conn: Any) -> List[str]: - """根据账户id查询对应的角色id列表""" - sql = "SELECT id FROM vala_app_character WHERE account_id = %s" - try: - with conn.cursor() as cur: - cur.execute(sql, (account_id,)) - rows = cur.fetchall() or [] - return [str(row["id"]) for row in rows if row.get("id")] - except Exception as e: - print(f"[ERROR] 查询账户id={account_id}的角色id失败: {e}") - return [] - - -def fetch_pg_play_records(user_id: str, conn: Any, mysql_conn: Any) -> List[Dict[str, Any]]: - """ - 查询互动组件学习记录并补充组件配置信息 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - mysql_conn: MySQL数据库连接 - - Returns: - 互动组件学习记录列表 - """ - print(f" [PG] 开始查询互动组件学习记录(8张分表)...") - start_time = datetime.datetime.now() - - tables = [f"user_component_play_record_{i}" for i in range(8)] - rows: List[Dict[str, Any]] = [] - with conn.cursor(cursor_factory=RealDictCursor) as cur: - for t in tables: - try: - sql = f""" - SELECT user_id, component_unique_code, session_id, c_type, c_id, - play_result, user_behavior_info, updated_at - FROM {t} - WHERE user_id = %s - ORDER BY updated_at DESC - """ - print(f" [PG_DEBUG] 准备查询表 {t},SQL:{sql.strip()},参数:{user_id}") - cur.execute(sql, (user_id,)) - part = cur.fetchall() or [] - if part: - print(f" [PG] 表{t}查到{len(part)}条记录") - for r in part: - r = dict(r) - r["play_result"] = to_json_str(r.get("play_result")) - r["user_behavior_info"] = to_json_str(r.get("user_behavior_info")) - # 将带时区的时间转换为无时区,避免Excel写入报错 - upd = r.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - r["updated_at"] = upd.replace(tzinfo=None) - except Exception: - # 回退为字符串 - r["updated_at"] = str(upd) - rows.append(r) - except Exception as e: - print(f" [PG] 表{t}查询失败: {e}") - continue - - rows.sort(key=lambda x: parse_time(x.get("updated_at")) or datetime.datetime.min, reverse=True) - print(f" [PG] 互动组件学习记录查询完成,共{len(rows)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - # 批量查询组件配置 - if rows and mysql_conn: - config_map = batch_fetch_component_configs(rows, mysql_conn) - - # 补充组件信息 - print(f" [PG] 开始补充组件配置信息...") - filled_count = 0 - empty_count = 0 - sample_keys = [] - sample_mode_check = [] # 检查对话互动的mode - - for r in rows: - c_type = r.get("c_type", "") - c_id = r.get("c_id") - key = f"{c_type}_{c_id}" if c_type and c_id else "" - - config = config_map.get(key, {}) - component_config = config.get("component_config", {}) - - component_name = get_component_name(c_type, component_config) - r["互动组件名称"] = component_name - r["组件标题"] = config.get("title", "") - r["组件配置摘要"] = config.get("summary", "") - r["知识点"] = config.get("kp_relation_info", "") - - # 统计填充情况 - if config: - filled_count += 1 - if len(sample_keys) < 3: - sample_keys.append((key, component_name, r["组件标题"][:30] if r["组件标题"] else "")) - - # 检查对话互动的mode - if c_type == "mid_sentence_dialogue" and len(sample_mode_check) < 3: - mode = "" - if isinstance(component_config, dict): - question = component_config.get("question", {}) - if isinstance(question, dict): - mode = question.get("mode", "") - sample_mode_check.append({ - "key": key, - "mode": mode, - "component_name": component_name - }) - else: - empty_count += 1 - if empty_count <= 5: # 输出前5个未匹配的key - print(f" [PG] [警告] 未找到组件配置: key={key}") - - print(f" [PG] 组件配置信息补充完成") - print(f" [PG] 匹配到配置: {filled_count}条, 未匹配: {empty_count}条") - if sample_keys: - print(f" [PG] 样例数据(前3条):") - for key, name, title in sample_keys: - print(f" [PG] - key={key}, 名称={name}, 标题={title}") - - if sample_mode_check: - print(f" [PG] 对话互动mode检查(前3条):") - for s in sample_mode_check: - print(f" [PG] - key={s['key']}, mode={s['mode']}, 最终名称={s['component_name']}") - - return rows - - -def fetch_pg_unit_review(user_id: str, conn: Any, id_2_unit_index: Dict[int, int], chapter_id_to_lesson_id: Dict[int, int]) -> List[Dict[str, Any]]: - """ - 查询课程巩固记录 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - id_2_unit_index: story_id到unit_id的映射字典 - chapter_id_to_lesson_id: chapter_id到lesson_id的映射字典 - - Returns: - 课程巩固记录列表 - """ - print(f" [PG] 开始查询课程巩固记录...") - start_time = datetime.datetime.now() - - sql = ( - "SELECT user_id, story_id, chapter_id, question_list, updated_at " - "FROM user_unit_review_question_result WHERE user_id = %s ORDER BY updated_at DESC" - ) - with conn.cursor(cursor_factory=RealDictCursor) as cur: - try: - cur.execute(sql, (user_id,)) - rows = cur.fetchall() or [] - except Exception as e: - print(f" [PG] 课程巩固记录查询失败: {e}") - rows = [] - out: List[Dict[str, Any]] = [] - for r in rows: - d = dict(r) - - # 映射 story_id 到 unit_id - story_id = d.get("story_id") - unit_id = id_2_unit_index.get(story_id) if story_id else None - d["unit_id"] = unit_id - - # 映射 chapter_id 到 lesson_id - chapter_id = d.get("chapter_id") - lesson_id = chapter_id_to_lesson_id.get(chapter_id) if chapter_id else None - d["lesson_id"] = lesson_id - - # 计算正确率 - question_list = d.get("question_list") - d["正确率"] = calculate_accuracy(question_list) - - d["question_list"] = to_json_str(question_list) - upd = d.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - d["updated_at"] = upd.replace(tzinfo=None) - except Exception: - d["updated_at"] = str(upd) - out.append(d) - - print(f" [PG] 课程巩固记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return out - - -def fetch_pg_unit_challenge(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]: - """ - 查询单元挑战记录 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - id_2_unit_index: story_id到unit_id的映射字典 - - Returns: - 单元挑战记录列表 - """ - print(f" [PG] 开始查询单元挑战记录...") - start_time = datetime.datetime.now() - - sql = ( - "SELECT user_id, story_id, category, score_text, question_list, updated_at " - "FROM user_unit_challenge_question_result WHERE user_id = %s ORDER BY updated_at DESC" - ) - with conn.cursor(cursor_factory=RealDictCursor) as cur: - try: - cur.execute(sql, (user_id,)) - rows = cur.fetchall() or [] - except Exception as e: - print(f" [PG] 单元挑战记录查询失败: {e}") - rows = [] - out: List[Dict[str, Any]] = [] - for r in rows: - d = dict(r) - - # 映射 story_id 到 unit_id - story_id = d.get("story_id") - unit_id = id_2_unit_index.get(story_id) if story_id else None - d["unit_id"] = unit_id - - d["question_list"] = to_json_str(d.get("question_list")) - upd = d.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - d["updated_at"] = upd.replace(tzinfo=None) - except Exception: - d["updated_at"] = str(upd) - out.append(d) - - print(f" [PG] 单元挑战记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return out - - -def fetch_pg_unit_summary(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]: - """ - 查询单元总结知识点结果数据 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - id_2_unit_index: story_id到unit_id的映射字典 - - Returns: - 单元总结记录列表 - """ - print(f" [PG] 开始查询单元总结记录...") - start_time = datetime.datetime.now() - - sql = ( - "SELECT id, user_id, story_id, updated_at, km_id, km_type, play_time " - "FROM user_unit_summary_km_result WHERE user_id = %s AND deleted_at IS NULL ORDER BY updated_at DESC" - ) - with conn.cursor(cursor_factory=RealDictCursor) as cur: - try: - cur.execute(sql, (user_id,)) - rows = cur.fetchall() or [] - except Exception as e: - print(f" [PG] 单元总结记录查询失败: {e}") - rows = [] - - out: List[Dict[str, Any]] = [] - for r in rows: - d = dict(r) - # 映射 story_id 到 unit_id - story_id = d.get("story_id") - unit_id = id_2_unit_index.get(story_id) if story_id else None - d["unit_id"] = unit_id - - # 转换 play_time (毫秒) 为秒 (整数) - play_time = d.get("play_time") - d["play_time_seconds"] = play_time // 1000 if play_time else 0 - - # 移除时区信息 - upd = d.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - d["updated_at"] = upd.replace(tzinfo=None) - except Exception: - d["updated_at"] = str(upd) - out.append(d) - - print(f" [PG] 单元总结记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return out - - -def generate_statistics(sheet2_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]]) -> tuple: - """ - 生成汇总统计数据 - - Args: - sheet2_rows: 互动组件学习记录 - sheet5_rows: 单元总结记录 - - Returns: - (组件统计DataFrame, 知识点统计DataFrame, 单元时长统计DataFrame) - """ - if pd is None: - raise RuntimeError("缺少pandas依赖,请安装后再运行。") - - print(f" [统计] 开始生成汇总统计数据...") - start_time = datetime.datetime.now() - - from collections import defaultdict - - # ============ a. 所有互动-按互动组件类型-通过情况统计 ============ - component_stats_data = [] - component_stats = defaultdict(lambda: {"Perfect": 0, "Good": 0, "Failed": 0, "Pass": 0, "Oops": 0, "total": 0}) - - # 用于调试 - sample_results = [] - parse_error_count = 0 - - for idx, record in enumerate(sheet2_rows): - component_name = record.get("互动组件名称", "") - if not component_name: - continue - - play_result_str = record.get("play_result", "") - - # 解析play_result - result = "" - try: - # 先判断是否是简单的字符串(Perfect/Good/Failed/Pass/Oops) - if isinstance(play_result_str, str): - # 去除空格后检查 - stripped = play_result_str.strip() - if stripped in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - # 直接使用 - result = stripped - else: - # 尝试JSON解析 - try: - play_result = json.loads(play_result_str) - if isinstance(play_result, dict): - result = play_result.get("result", "") - else: - result = "" - except: - result = "" - else: - # 如果不是字符串,尝试当dict处理 - if isinstance(play_result_str, dict): - result = play_result_str.get("result", "") - else: - result = "" - - # 收集前3个样例 - if idx < 3: - sample_results.append({ - "component": component_name, - "raw": str(play_result_str)[:100], - "result": result - }) - except Exception as e: - parse_error_count += 1 - if parse_error_count <= 3: - print(f" [统计] [警告] 解析play_result失败 (第{idx+1}条): {e}, 原始值: {str(play_result_str)[:100]}") - result = "" - - component_stats[component_name]["total"] += 1 - if result in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - component_stats[component_name][result] += 1 - - print(f" [统计] play_result解析样例(前3条):") - for s in sample_results: - print(f" [统计] - 组件: {s['component']}, 结果: {s['result']}, 原始: {s['raw']}") - if parse_error_count > 0: - print(f" [统计] play_result解析失败总数: {parse_error_count}") - - # 生成统计数据行 - for component_name in sorted(component_stats.keys()): - stats = component_stats[component_name] - total = stats["total"] - perfect = stats["Perfect"] - good = stats["Good"] - failed = stats["Failed"] - pass_count = stats["Pass"] - oops = stats["Oops"] - - perfect_ratio = round(perfect / total * 100, 2) if total > 0 else 0 - good_ratio = round(good / total * 100, 2) if total > 0 else 0 - failed_ratio = round(failed / total * 100, 2) if total > 0 else 0 - pass_ratio = round(pass_count / total * 100, 2) if total > 0 else 0 - oops_ratio = round(oops / total * 100, 2) if total > 0 else 0 - - component_stats_data.append({ - "互动组件名称": component_name, - "总数量": total, - "Perfect数量": perfect, - "Good数量": good, - "Failed数量": failed, - "Pass数量": pass_count, - "Oops数量": oops, - "Perfect比例(%)": perfect_ratio, - "Good比例(%)": good_ratio, - "Failed比例(%)": failed_ratio, - "Pass比例(%)": pass_ratio, - "Oops比例(%)": oops_ratio, - }) - - # ============ b. 中互动组件-按知识点-通过情况统计 ============ - kp_stats_data = [] - kp_stats = defaultdict(lambda: {"Perfect": 0, "Good": 0, "Failed": 0, "Pass": 0, "Oops": 0, "total": 0}) - - # 调试信息 - mid_count = 0 - has_kp_count = 0 - sample_kp_records = [] - - for idx, record in enumerate(sheet2_rows): - c_type = record.get("c_type", "") - if not c_type or not c_type.startswith("mid"): - continue - - mid_count += 1 - kp_relation_info_str = record.get("知识点", "") - - if not kp_relation_info_str: - continue - - has_kp_count += 1 - - # 解析知识点 - try: - if isinstance(kp_relation_info_str, str): - kp_relation_info = json.loads(kp_relation_info_str) - else: - kp_relation_info = kp_relation_info_str - - if not isinstance(kp_relation_info, list): - continue - - # 收集样例 - if len(sample_kp_records) < 3: - sample_kp_records.append({ - "c_type": c_type, - "kp_count": len(kp_relation_info), - "kp_info": str(kp_relation_info)[:200] - }) - - # 解析play_result(使用相同的逻辑) - play_result_str = record.get("play_result", "") - result = "" - if isinstance(play_result_str, str): - stripped = play_result_str.strip() - if stripped in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - result = stripped - else: - try: - play_result = json.loads(play_result_str) - if isinstance(play_result, dict): - result = play_result.get("result", "") - except: - pass - elif isinstance(play_result_str, dict): - result = play_result_str.get("result", "") - - # 为每个知识点统计 - for kp in kp_relation_info: - if not isinstance(kp, dict): - continue - - kp_id = kp.get("kpId", "") - kp_type = kp.get("kpType", "") - kp_title = kp.get("kpTitle", "") - - if not kp_id: - continue - - kp_key = f"{kp_id}|{kp_type}|{kp_title}" - kp_stats[kp_key]["total"] += 1 - if result in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - kp_stats[kp_key][result] += 1 - - except Exception as e: - if len(sample_kp_records) < 5: - print(f" [统计] [警告] 解析知识点失败: {e}, 原始值: {str(kp_relation_info_str)[:100]}") - continue - - print(f" [统计] 中互动组件统计: 总数={mid_count}, 有知识点={has_kp_count}, 知识点条目数={len(kp_stats)}") - if sample_kp_records: - print(f" [统计] 知识点样例(前3条):") - for s in sample_kp_records: - print(f" [统计] - c_type={s['c_type']}, 知识点数量={s['kp_count']}, 内容={s['kp_info']}") - - # 生成知识点统计数据行 - for kp_key in sorted(kp_stats.keys()): - parts = kp_key.split("|") - if len(parts) != 3: - continue - - kp_id, kp_type, kp_title = parts - stats = kp_stats[kp_key] - total = stats["total"] - perfect = stats["Perfect"] - good = stats["Good"] - failed = stats["Failed"] - pass_count = stats["Pass"] - oops = stats["Oops"] - - perfect_ratio = round(perfect / total * 100, 2) if total > 0 else 0 - good_ratio = round(good / total * 100, 2) if total > 0 else 0 - failed_ratio = round(failed / total * 100, 2) if total > 0 else 0 - pass_ratio = round(pass_count / total * 100, 2) if total > 0 else 0 - oops_ratio = round(oops / total * 100, 2) if total > 0 else 0 - - kp_stats_data.append({ - "知识点ID": kp_id, - "知识点类型": kp_type, - "知识点标题": kp_title, - "总数量": total, - "Perfect数量": perfect, - "Good数量": good, - "Failed数量": failed, - "Pass数量": pass_count, - "Oops数量": oops, - "Perfect比例(%)": perfect_ratio, - "Good比例(%)": good_ratio, - "Failed比例(%)": failed_ratio, - "Pass比例(%)": pass_ratio, - "Oops比例(%)": oops_ratio, - }) - - # ============ c. 单元总结-按单元统计时长 ============ - unit_time_stats_data = [] - unit_time_stats = defaultdict(int) - - for record in sheet5_rows: - unit_id = record.get("unit_id") - play_time_seconds = record.get("play_time_seconds", 0) - - if unit_id is not None: - unit_time_stats[unit_id] += play_time_seconds - - # 生成单元时长统计数据行 - for unit_id in sorted(unit_time_stats.keys()): - total_seconds = unit_time_stats[unit_id] - total_minutes = int(total_seconds / 60) - - unit_time_stats_data.append({ - "单元ID": f"unit_{unit_id}", - "总时长(秒)": total_seconds, - "总时长(分钟)": total_minutes, - }) - - print(f" [统计] 汇总统计数据生成完成,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - print(f" [统计] 生成了{len(component_stats_data)}条组件统计, {len(kp_stats_data)}条知识点统计, {len(unit_time_stats_data)}条单元时长统计") - - return ( - pd.DataFrame(component_stats_data), - pd.DataFrame(kp_stats_data), - pd.DataFrame(unit_time_stats_data) - ) - - - -def write_excel(path: str, sheet1_rows: List[Dict[str, Any]], sheet2_rows: List[Dict[str, Any]], sheet3_rows: List[Dict[str, Any]], sheet4_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]], stats_component_df: Any, stats_kp_df: Any, stats_unit_time_df: Any) -> None: - if pd is None: - raise RuntimeError("缺少pandas依赖,请安装后再运行。") - - print(f" [Excel] 开始写入Excel文件: {path}") - start_time = datetime.datetime.now() - - out_dir = os.path.dirname(path) or "." - os.makedirs(out_dir, exist_ok=True) - with pd.ExcelWriter(path, engine="openpyxl") as writer: - pd.DataFrame(sheet1_rows, columns=SHEET1_COLUMNS).to_excel(writer, sheet_name="全部音频数据", index=False) - pd.DataFrame(sheet2_rows, columns=SHEET2_COLUMNS).to_excel(writer, sheet_name="互动组件学习记录", index=False) - pd.DataFrame(sheet3_rows, columns=SHEET3_COLUMNS).to_excel(writer, sheet_name="课程巩固记录", index=False) - pd.DataFrame(sheet4_rows, columns=SHEET4_COLUMNS).to_excel(writer, sheet_name="单元挑战记录", index=False) - pd.DataFrame(sheet5_rows, columns=SHEET5_COLUMNS).to_excel(writer, sheet_name="单元总结记录", index=False) - stats_component_df.to_excel(writer, sheet_name="统计-互动组件通过情况", index=False) - stats_kp_df.to_excel(writer, sheet_name="统计-知识点通过情况", index=False) - stats_unit_time_df.to_excel(writer, sheet_name="统计-单元总结时长", index=False) - - print(f" [Excel] 写入完成,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - -def get_date_str() -> str: - """获取当前日期字符串 格式:YYYYMMDD""" - return datetime.datetime.now().strftime("%Y%m%d") - - -def export_single_user(user_id: str, es_cfg: Dict[str, Any], pg_conn: Any, mysql_conn: Any, output_path: str, id_2_unit_index: Dict[int, int], chapter_id_to_lesson_id: Dict[int, int]) -> bool: - """ - 导出单个角色id的数据 - - Args: - user_id: 角色ID - es_cfg: ES配置 - pg_conn: PostgreSQL连接 - mysql_conn: MySQL连接 - output_path: 输出路径 - id_2_unit_index: story_id到unit_id的映射字典 - chapter_id_to_lesson_id: chapter_id到lesson_id的映射字典 - - Returns: - True表示成功,False表示失败 - """ - try: - print(f"\n[INFO] ========== 开始导出角色id={user_id} ==========") - total_start_time = datetime.datetime.now() - - # 查询ES数据 - sheet1_rows = fetch_es_user_audio(user_id, es_cfg) - - # 查询PG数据 - sheet2_rows = fetch_pg_play_records(user_id, pg_conn, mysql_conn) - sheet3_rows = fetch_pg_unit_review(user_id, pg_conn, id_2_unit_index, chapter_id_to_lesson_id) - sheet4_rows = fetch_pg_unit_challenge(user_id, pg_conn, id_2_unit_index) - sheet5_rows = fetch_pg_unit_summary(user_id, pg_conn, id_2_unit_index) - - # 检查是否有有效数据 - total_records = len(sheet1_rows) + len(sheet2_rows) + len(sheet3_rows) + len(sheet4_rows) + len(sheet5_rows) - print(f" [统计] 数据汇总:") - print(f" - 全部音频数据: {len(sheet1_rows)}条") - print(f" - 互动组件学习记录: {len(sheet2_rows)}条") - print(f" - 课程巩固记录: {len(sheet3_rows)}条") - print(f" - 单元挑战记录: {len(sheet4_rows)}条") - print(f" - 单元总结记录: {len(sheet5_rows)}条") - print(f" - 总计: {total_records}条") - - if total_records == 0: - print(f"[WARN] 角色id={user_id} 没有找到任何有效记录,跳过导出") - return False - - # 生成汇总统计数据 - stats_component_df, stats_kp_df, stats_unit_time_df = generate_statistics(sheet2_rows, sheet5_rows) - - # 写入Excel - write_excel(output_path, sheet1_rows, sheet2_rows, sheet3_rows, sheet4_rows, sheet5_rows, stats_component_df, stats_kp_df, stats_unit_time_df) - - total_time = (datetime.datetime.now() - total_start_time).total_seconds() - print(f"[INFO] 角色id={user_id} 导出成功") - print(f"[INFO] 文件路径: {output_path}") - print(f"[INFO] 总耗时: {total_time:.2f}秒") - print(f"[INFO] ========== 完成 ==========\n") - return True - - except Exception as e: - print(f"[ERROR] 角色id={user_id} 导出失败: {e}") - import traceback - traceback.print_exc() - return False - - -def main(): - load_env() - - # 确定运行模式并收集需要导出的角色id列表 - user_id_list: List[tuple] = [] # [(user_id, account_id or None), ...] - date_str = get_date_str() - - # 检查三种模式的配置 - has_user_id = USER_ID is not None - has_user_id_list = USER_ID_LIST is not None and len(USER_ID_LIST) > 0 - has_account_id_list = ACCOUNT_ID_LIST is not None and len(ACCOUNT_ID_LIST) > 0 - - # 验证只能配置一种模式 - mode_count = sum([has_user_id, has_user_id_list, has_account_id_list]) - if mode_count == 0: - raise RuntimeError("请配置 USER_ID、USER_ID_LIST 或 ACCOUNT_ID_LIST 中的一个") - if mode_count > 1: - raise RuntimeError("USER_ID、USER_ID_LIST、ACCOUNT_ID_LIST 只能配置一个,请检查配置") - - # 模式1:单个角色id - if has_user_id: - user_id_list = [(str(USER_ID), None)] - print(f"[INFO] 运行模式:单个角色id") - - # 模式2:角色id列表 - elif has_user_id_list: - user_id_list = [(str(uid), None) for uid in USER_ID_LIST] - print(f"[INFO] 运行模式:角色id列表,共{len(user_id_list)}个角色") - - # 模式3:账户id列表 - elif has_account_id_list: - print(f"[INFO] 运行模式:账户id列表,共{len(ACCOUNT_ID_LIST)}个账户") - mysql_conn = None - try: - mysql_conn = get_mysql_conn("vala_user") # 查询用户表,使用 vala_user 数据库 - for account_id in ACCOUNT_ID_LIST: - account_id_str = str(account_id) - print(f"[INFO] 查询账户id={account_id_str}对应的角色id...") - character_ids = fetch_character_ids_by_account(account_id_str, mysql_conn) - if not character_ids: - print(f"[WARN] 账户id={account_id_str} 未找到关联的角色id,跳过") - continue - print(f"[INFO] 账户id={account_id_str} 找到{len(character_ids)}个角色id: {character_ids}") - for cid in character_ids: - user_id_list.append((cid, account_id_str)) - finally: - if mysql_conn: - try: - mysql_conn.close() - except Exception: - pass - - if not user_id_list: - print("[WARN] 没有需要导出的角色id,程序退出") - return - - # 初始化连接 - es_cfg = get_es_config() - pg_conn = get_pg_conn() - - # 获取映射表(只需要查询一次,所有角色共用) - print(f"\n[INFO] ===== 准备工作:获取映射表 =====") - mysql_conn = None - id_2_unit_index = {} - chapter_id_to_lesson_id = {} - try: - print(f"[INFO] 正在连接MySQL数据库(vala_test)...") - mysql_conn = get_mysql_conn("vala_test") # 查询游戏配置表,使用 vala_test 数据库 - print(f"[INFO] 正在获取 story_id 到 unit_id 的映射...") - id_2_unit_index = get_id_2_unit_index(mysql_conn) - print(f"[INFO] 成功获取 {len(id_2_unit_index)} 个 story_id 映射") - print(f"[INFO] 正在获取 chapter_id 到 lesson_id 的映射...") - chapter_id_to_lesson_id = get_chapter_id_to_lesson_id(mysql_conn) - print(f"[INFO] 成功获取 {len(chapter_id_to_lesson_id)} 个 chapter_id 映射") - except Exception as e: - print(f"[ERROR] 获取映射表失败: {e}") - import traceback - traceback.print_exc() - if pg_conn: - try: - pg_conn.close() - except Exception: - pass - if mysql_conn: - try: - mysql_conn.close() - except Exception: - pass - return - - try: - # 统计信息 - success_count = 0 - skip_count = 0 - - print(f"\n[INFO] ===== 开始批量导出 =====") - print(f"[INFO] 共需导出{len(user_id_list)}个角色\n") - batch_start_time = datetime.datetime.now() - - # 循环处理每个角色id - for idx, (user_id, account_id) in enumerate(user_id_list, 1): - print(f"\n{'='*60}") - print(f"[INFO] 进度: {idx}/{len(user_id_list)} ({idx*100//len(user_id_list)}%)") - print(f"{'='*60}") - - # 生成输出文件名 - if account_id is None: - # 模式1和模式2:角色id_{}_导出时间_{}.xlsx - filename = f"角色id_{user_id}_导出时间_{date_str}.xlsx" - else: - # 模式3:账户id_{}_角色id_{}_导出时间_{}.xlsx - filename = f"账户id_{account_id}_角色id_{user_id}_导出时间_{date_str}.xlsx" - - output_path = os.path.join(OUTPUT_DIR, filename) - - # 导出单个角色的数据 - result = export_single_user(user_id, es_cfg, pg_conn, mysql_conn, output_path, id_2_unit_index, chapter_id_to_lesson_id) - if result: - success_count += 1 - else: - skip_count += 1 - - # 输出统计信息 - batch_total_time = (datetime.datetime.now() - batch_start_time).total_seconds() - print(f"\n{'='*60}") - print(f"[INFO] ===== 全部导出完成 =====") - print(f"[INFO] 总计: {len(user_id_list)}个角色") - print(f"[INFO] 成功: {success_count}个") - print(f"[INFO] 跳过: {skip_count}个") - print(f"[INFO] 总耗时: {batch_total_time:.2f}秒 ({batch_total_time/60:.2f}分钟)") - if success_count > 0: - print(f"[INFO] 平均每个角色: {batch_total_time/success_count:.2f}秒") - print(f"{'='*60}\n") - - finally: - if pg_conn: - try: - pg_conn.close() - except Exception: - pass - if mysql_conn: - try: - mysql_conn.close() - except Exception: - pass - - -if __name__ == "__main__": - main() diff --git a/new_export/export_user_id_data_latest.py b/new_export/export_user_id_data_latest.py deleted file mode 100644 index 22495f5..0000000 --- a/new_export/export_user_id_data_latest.py +++ /dev/null @@ -1,1846 +0,0 @@ -""" -初版需求v1.0: 2025.11.18 - -导出 一个userId的多表数据, 最终按照不同sheet,输出到一个 excel文件中。 - -1. 第一个sheet:"全部音频数据" -es相关配置通过以下环境变量 -ES_HOST=xxx -ES_PORT=9200 -ES_SCHEME=https -ES_USER=elastic -ES_PASSWORD=xxx - -index: user-audio - -脚本思路: -过滤字段: -userId == xxxx - -输出该userId的全部记录 按时间倒序排序 -包含以下字段内容: - -userId -userMsg -userName -soeData -audioUrl -asrStatus -componentId -componentType -dataVersion - -2. 第二个sheet:"互动组件学习记录" -在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 -数据库相关配置 从.env中读取: -PG_DB_HOST = xxx -PG_DB_PORT = xxx -PG_DB_USER = xxx -PG_DB_PASSWORD = xxx -PG_DB_DATABASE = xxx - -读取以下数据表: -user_component_play_record_0 ~ user_component_play_record_7 - -输出以下字段: -user_id, -component_unique_code, -session_id, -c_type, -c_id, -play_result, -user_behavior_info, -updated_at - -3.第三个sheet:"课程巩固记录" -在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 - -数据表:user_unit_review_question_result - -输出以下字段: -user_id -story_id -chapter_id -question_list -updated_at - -4.第四个sheet:"单元挑战记录" -在 PGsql数据库中 筛选出 user_id 对应的记录 按时间(updated_at)倒序排列。 - -数据表:user_unit_challenge_question_result - -输出以下字段: -user_id -story_id -category -score_text, -question_list -updated_at ------------- - -需求补充v1.1: -"全部音频数据"这个sheet -输出字段 添加timeStr 并按时间倒序排列 最新的记录 在最上面 - ------------- -需求补充v1.2: -"全部音频数据"这个sheet -如果userMsg字段内容 包含 ”makee_id“ 要进行以下处理: - -从userMsg字段中提取出具体的makee_id: -此时的字段样例: -``` -asr msg信息为:{ - "time_ms": 358, - "time_ms_api": 357, - "hot_words_str": "{\n \"context_type\": \"dialog_ctx\",\n \"context_data\": [\n {\n \"text\": \"planet Walla\"\n },\n {\n \"text\": \"Walla\"\n }\n ]\n}", - "makee_id": "d208c617-902f-4f81-8255-b5fb73599546", - "volcano_fast_x_tt_logid": "202511151541355DF72BE5EBFE73795BFD", - "api_name": "volcano-fast" -} -``` -然后基于makee_id 去另一个表里查记录: index:llm_asr_log -将查询到的记录的 result_text 字段内容 回填到 userMsg。 -将source字段内容 输出 到 source。 - -如果userMsg字段内容 不包含 ”makee_id“ 保持之前的逻辑。 - --------------- -需求补充 v1.3 -当前输入 只支持配置单个 userId (业务侧名称为角色id) - - -期望扩展为以下逻辑: -1. 改为配置 角色id list , 分别 导出 多份excel文件。命名格式为 角色id_{}_导出时间_{}.xlsx -2. 改为配置 账户id list , 分别 导出 多份excel文件。命名格式为 账户id_{}_角色id_{}_导出时间_{}.xlsx - -关于 账户 id 到角色id 的映射逻辑, -首先 读取 mysql 表 vala_app_character -筛选 account_id字段值 == 账户id 的 记录, 其中 该记录 的 id值,则为角色id 一个 账户id 可以对应多个角色id - -本次需求只针对输入侧调整, 数据抽取聚合逻辑部分和之前保持一致 - ---------------- -需求补充 v1.4 - -增加一个sheet "单元总结记录", -导出对应角色id的单元总结记录。 参考 export_unit_summary.py 中的原始数据提取方案即可(不必关注其中的数据统计部分)。 - -其他已有逻辑保持不动哦。 - ----------------- -需求补充 v1.5 - -1."互动组件学习记录"sheet 增加以下字段 -"互动组件名称"、"组件标题"、"组件配置摘要"、"知识点": -字段取值规则: -根据 c_type 及组件配置(从mysql表获取) 进行映射和处理: -``` -1).如果 c_type 开头为"mid" - -则读取下表:表名:middle_interaction_component - -获取以下字段值: -title (作为组件标题) -component_config (完整的组件配置) 获取其中 的 question 字段值 作为 组件配置摘要; -kp_relation_info 字段值 作为 知识点 - -"互动组件名称"规则: - -"物品互动": "mid_vocab_item", -"图片互动": "mid_vocab_image", -"填词互动": "mid_vocab_fillBlank", -"指令互动": "mid_vocab_instruction" -"对话互动-表达": "mid_sentence_dialogue", 且 component_config->question->mode == "express" -"对话互动-朗读": "mid_sentence_dialogue", 且 component_config->question->mode == "read" -"语音互动": "mid_sentence_voice", -"材料互动": "mid_sentence_material", -"造句互动": "mid_sentence_makeSentence" -"挖空互动": "mid_grammar_cloze", -"组句互动": "mid_grammar_sentence" -"发音互动": "mid_pron_pron" - - -2). 如果 c_type 开头为"core" -则读取下表:表名:core_interaction_component - -获取以下字段值: -title (作为组件标题) -component_config (完整的组件配置) 获取其中 的 taskInfo 字段值 作为 组件配置摘要 -kp_relation_info 字段值 作为 知识点 - -"互动组件名称"规则: -"口语快答": "core_speaking_reply", -"口语妙问": "core_speaking_inquiry", -"口语探讨": "core_speaking_explore", -"口语独白": "core_speaking_monologue" -"合作阅读": "core_reading_order", -"合作听力": "core_listening_order", -"看图组句": "core_writing_imgMakeSentence", -"看图撰写": "core_writing_imgWrite", -"问题组句": "core_writing_questionMakeSentence", -"问题撰写": "core_writing_questionWrite", -``` - -2."课程巩固记录" sheet 增加以下字段 -"正确率": 参考 export_lesson_review.py 中的计算逻辑 - -3. 新增一个"汇总统计"sheet -统计并展示以下内容 请以 可读性 比较好的方式排列、展示 - -a. "所有互动-按互动组件类型-通过情况统计" -以每种"互动组件名称"进行聚合 -统计play_result的取值分布情况,算以下指标: -总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 - -b. "中互动组件-按知识点-通过情况统计" -以每个知识点进行聚合 - -其中 知识点配置格式如下: -``` -[{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000004","kpType":"sentence","kpTitle":"My name is ...","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000005","kpType":"sentence","kpTitle":"I'm… years old.","kpSkill":"sentence_meaning","kpSkillName":"语义"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_pron","kpSkillName":"语音"},{"kpId":"0000014","kpType":"sentence","kpTitle":"Nice to meet you.","kpSkill":"sentence_meaning","kpSkillName":"语义"}] -``` -一个组件可以绑定多个知识点,以每个知识点的 kpId + kpType + kpTitle 进行 展示及聚合 - -对所有绑定了某个知识点的中互动组件(c_type以mid开头) -统计play_result的取值分布情况,算以下指标: -总数量、Perfect数量、Good数量、Failed数量、Pass数量、Perfect比例、Good比例、Failed比例、Pass比例 - -c. "单元总结-按单元统计时长" - -将"单元总结记录"中的"play_time_seconds"字段值 以每个单元id 进行聚合 进行 累加 统计,并增加一列 转换为分钟为单位 取整数 - - -""" -# ==== 可直接修改的脚本变量(不使用命令行传参) ==== -# 三种模式互斥,只能配置一个: -# 模式1:单个角色id -USER_ID = None # 单个角色ID,示例:2911 - -# 模式2:角色id列表(多个角色id批量导出) -USER_ID_LIST = None # 角色ID列表,示例:[2911, 2912, 2913] - -# 模式3:账户id列表(通过账户id查询对应的角色id后批量导出) -ACCOUNT_ID_LIST = [9343] # 5095[7232] # [1783,5375,5371,5345,5303,5293,5095,4289,4494,4473,4460,4452,4386,4388,4236,4043,2758,2841,2756,2750,2692,1781,1693,2256,2234,2373] # 账户ID列表,示例:[100, 101, 102] - -OUTPUT_DIR = "output_latest/" # 输出目录,默认为output文件夹 -# ==== 变量结束 ==== -import os -import json -import re -from typing import Any, Dict, List, Optional - -import datetime - -try: - import requests -except Exception: - requests = None - -try: - import psycopg2 - from psycopg2.extras import RealDictCursor -except Exception: - psycopg2 = None - RealDictCursor = None - -try: - import pymysql - import pymysql.cursors -except Exception: - pymysql = None - -try: - import pandas as pd -except Exception: - pd = None - -try: - import urllib3 -except Exception: - urllib3 = None - - -SHEET1_COLUMNS = [ - "userId", - "userMsg", - "source", - "userName", - "soeData", - "audioUrl", - "asrStatus", - "componentId", - "componentType", - "dataVersion", - "timeStr", -] - -SHEET2_COLUMNS = [ - "user_id", - "component_unique_code", - "session_id", - "c_type", - "c_id", - "互动组件名称", - "组件标题", - "组件配置摘要", - "知识点", - "play_result", - "user_behavior_info", - "updated_at", -] - -SHEET3_COLUMNS = [ - "user_id", - "unit_id", - "lesson_id", - "question_list", - "正确率", - "updated_at", -] - -SHEET4_COLUMNS = [ - "user_id", - "unit_id", - "category", - "score_text", - "question_list", - "updated_at", -] - -SHEET5_COLUMNS = [ - "id", - "user_id", - "unit_id", - "updated_at", - "km_id", - "km_type", - "play_time_seconds", -] - - -def _load_env_file(path: str) -> None: - if not os.path.exists(path): - return - try: - with open(path, "r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line or line.startswith("#"): - continue - if "=" not in line: - continue - k, v = line.split("=", 1) - k = k.strip() - v = v.strip().strip('"').strip("'") - if k and (os.getenv(k) is None): - os.environ[k] = v - except Exception: - pass - - -def load_env() -> None: - _load_env_file(os.path.join(os.getcwd(), ".env")) - _load_env_file(os.path.join(os.getcwd(), ".env.local")) - - -def to_json_str(v: Any) -> Any: - if isinstance(v, (dict, list)): - try: - return json.dumps(v, ensure_ascii=False) - except Exception: - return str(v) - return v - - -def parse_time(value: Any) -> Optional[datetime.datetime]: - if value is None: - return None - if isinstance(value, (int, float)): - try: - v = float(value) - # 兼容毫秒级时间戳 - if v > 1e11: - v = v / 1000.0 - return datetime.datetime.fromtimestamp(v) - except Exception: - return None - if isinstance(value, str): - fmts = [ - "%Y-%m-%dT%H:%M:%S.%fZ", - "%Y-%m-%dT%H:%M:%S.%f%z", - "%Y-%m-%dT%H:%M:%S%z", - "%Y-%m-%d %H:%M:%S", - "%Y-%m-%d", - ] - for fmt in fmts: - try: - return datetime.datetime.strptime(value, fmt) - except Exception: - continue - try: - return datetime.datetime.fromisoformat(value) - except Exception: - return None - return None - - -def pick_time(source: Dict[str, Any]) -> Optional[datetime.datetime]: - candidates = [ - "updated_at", - "created_at", - "@timestamp", - "timestamp", - "updatedAt", - "createdAt", - "time", - "ts", - "timeStr", - "update_time", - "create_time", - ] - for key in candidates: - if key in source: - t = parse_time(source.get(key)) - if t is not None: - return t - # 宽松匹配:尝试扫描所有可能的时间相关字段 - for k, v in source.items(): - lk = str(k).lower() - if any(s in lk for s in ["time", "date", "_at", "timestamp"]): - t = parse_time(v) - if t is not None: - return t - return None - - -def extract_makee_id_from_user_msg(user_msg: Any) -> Optional[str]: - # 支持dict或字符串形式 - if isinstance(user_msg, dict): - mk = user_msg.get("makee_id") - if isinstance(mk, str) and mk: - return mk - if isinstance(user_msg, str) and user_msg: - # 1) 尝试整体解析为JSON - try: - obj = json.loads(user_msg) - mk = obj.get("makee_id") - if isinstance(mk, str) and mk: - return mk - except Exception: - pass - # 2) 尝试截取大括号中的JSON - try: - start = user_msg.find("{") - end = user_msg.rfind("}") - if start != -1 and end != -1 and end > start: - candidate = user_msg[start : end + 1] - obj = json.loads(candidate) - mk = obj.get("makee_id") - if isinstance(mk, str) and mk: - return mk - except Exception: - pass - # 3) 正则匹配 makee_id - m = re.search(r"\bmakee_id\b\s*:\s*\"([^\"]+)\"", user_msg) - if m: - return m.group(1) - return None - - -def fetch_es_asr_log(makee_id: str, es_cfg: Dict[str, Any]) -> Optional[Dict[str, Any]]: - if requests is None: - raise RuntimeError("缺少requests依赖,请安装后再运行。") - host = es_cfg.get("host") - port = es_cfg.get("port") - scheme = es_cfg.get("scheme", "http") - user = es_cfg.get("user") - password = es_cfg.get("password") - index = "llm_asr_log" - if not host: - return None - base = f"{scheme}://{host}:{port}" - url = f"{base}/{index}/_search" - headers = {"Content-Type": "application/json"} - body = { - "query": { - "bool": { - "should": [ - {"term": {"makee_id": {"value": str(makee_id)}}}, - {"term": {"makee_id.keyword": {"value": str(makee_id)}}}, - ], - "minimum_should_match": 1, - } - }, - "size": 10, - "_source": [ - "makee_id", - "result_text", - "source", - "updated_at", - "created_at", - "@timestamp", - "timestamp", - "updatedAt", - "createdAt", - "time", - "ts", - "timeStr", - "update_time", - "create_time", - ], - } - auth = (user, password) if user and password else None - try: - if scheme == "https" and urllib3 is not None: - try: - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - except Exception: - pass - resp = requests.post(url, headers=headers, json=body, auth=auth, timeout=20, verify=False if scheme == "https" else True) - resp.raise_for_status() - data = resp.json() - except Exception: - return None - hits = data.get("hits", {}).get("hits", []) - if not hits: - return None - # 选最新的 - chosen = None - best_t = None - for h in hits: - src = h.get("_source", {}) or {} - t = pick_time(src) - if t is None: - continue - if best_t is None or t > best_t: - best_t = t - chosen = src - if chosen is None: - # 如果都没有时间,选第一条 - chosen = (hits[0].get("_source", {}) or {}) - return chosen - - -def get_es_config() -> Dict[str, Any]: - return { - "host": os.getenv("ES_HOST"), - "port": os.getenv("ES_PORT", "9200"), - "scheme": os.getenv("ES_SCHEME", "http"), - "user": os.getenv("ES_USER"), - "password": os.getenv("ES_PASSWORD"), - "index": "user-audio", - } - - -def fetch_es_user_audio(user_id: str, es_cfg: Dict[str, Any]) -> List[Dict[str, Any]]: - if requests is None: - raise RuntimeError("缺少requests依赖,请安装后再运行。") - - print(f" [ES] 开始查询user-audio索引...") - start_time = datetime.datetime.now() - - host = es_cfg.get("host") - port = es_cfg.get("port") - scheme = es_cfg.get("scheme", "http") - user = es_cfg.get("user") - password = es_cfg.get("password") - index = es_cfg.get("index", "user-audio") - - if not host: - return [] - - base = f"{scheme}://{host}:{port}" - url = f"{base}/{index}/_search" - headers = {"Content-Type": "application/json"} - - body = { - "query": { - "bool": { - "should": [ - {"term": {"userId": {"value": str(user_id)}}}, - {"term": {"userId.keyword": {"value": str(user_id)}}}, - ], - "minimum_should_match": 1, - } - }, - "size": 10000, - "_source": [ - "userId", - "userMsg", - "userName", - "soeData", - "audioUrl", - "asrStatus", - "componentId", - "componentType", - "dataVersion", - "updated_at", - "created_at", - "@timestamp", - "timestamp", - "updatedAt", - "createdAt", - "time", - "ts", - "timeStr", - "update_time", - "create_time", - ], - } - - auth = (user, password) if user and password else None - - try: - # 抑制自签证书下的HTTPS不安全警告 - if scheme == "https" and urllib3 is not None: - try: - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - except Exception: - pass - resp = requests.post(url, headers=headers, json=body, auth=auth, timeout=30, verify=False if scheme == "https" else True) - resp.raise_for_status() - data = resp.json() - except Exception as e: - raise RuntimeError(f"ES查询失败: {e}") - - hits = data.get("hits", {}).get("hits", []) - print(f" [ES] 查询完成,获得{len(hits)}条记录,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - if not hits: - return [] - - print(f" [ES] 开始处理音频数据...") - process_start = datetime.datetime.now() - - rows: List[Dict[str, Any]] = [] - asr_cache: Dict[str, Dict[str, Any]] = {} - makee_id_count = 0 - - for idx, h in enumerate(hits, 1): - # 每处理100条显示一次进度 - if idx % 100 == 0 or idx == len(hits): - print(f" [ES] 处理进度: {idx}/{len(hits)} ({idx*100//len(hits)}%)") - - src = h.get("_source", {}) or {} - row = { - "userId": src.get("userId"), - "userMsg": src.get("userMsg"), - "source": None, - "userName": src.get("userName"), - "soeData": to_json_str(src.get("soeData")), - "audioUrl": src.get("audioUrl"), - "asrStatus": src.get("asrStatus"), - "componentId": src.get("componentId"), - "componentType": src.get("componentType"), - "dataVersion": src.get("dataVersion"), - } - t = pick_time(src) - row["_time"] = t.isoformat() if t else None - row["timeStr"] = t.strftime("%Y-%m-%d %H:%M:%S") if t else None - # v1.2: 当userMsg包含makee_id时,补充查询llm_asr_log并回填 - mk = extract_makee_id_from_user_msg(row.get("userMsg")) - if mk: - makee_id_count += 1 - asr_doc = asr_cache.get(mk) - if asr_doc is None: - asr_doc = fetch_es_asr_log(mk, es_cfg) - if asr_doc is not None: - asr_cache[mk] = asr_doc - if asr_doc is not None: - rt = asr_doc.get("result_text") - if rt: - row["userMsg"] = rt - row["source"] = to_json_str(asr_doc.get("source")) - rows.append(row) - - print(f" [ES] 数据处理完成,发现{makee_id_count}条包含makee_id的记录,耗时{(datetime.datetime.now() - process_start).total_seconds():.2f}秒") - - print(f" [ES] 开始排序...") - rows.sort(key=lambda x: parse_time(x.get("_time")) or datetime.datetime.min, reverse=True) - print(f" [ES] 音频数据处理完成,总耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - return rows - - -def get_pg_conn() -> Any: - if psycopg2 is None: - raise RuntimeError("缺少psycopg2依赖,请安装后再运行。") - host = os.getenv("PG_DB_HOST") - port = int(os.getenv("PG_DB_PORT", "5432")) - user = os.getenv("PG_DB_USER") - password = os.getenv("PG_DB_PASSWORD") - dbname = os.getenv("PG_DB_DATABASE") - if not host or not dbname: - raise RuntimeError("PG数据库环境变量未配置完整") - conn = psycopg2.connect(host=host, port=port, user=user, password=password, dbname=dbname) - return conn - - -def get_mysql_conn(database: str) -> Any: - """ - 获取MySQL数据库连接 - - Args: - database: 数据库名,可选值:'vala_user' 或 'vala_test' - vala_user 使用 online 配置(环境变量后缀 _online) - vala_test 使用默认配置 - - Returns: - MySQL连接对象 - """ - if pymysql is None: - raise RuntimeError("缺少pymysql依赖,请安装后再运行。") - - # 根据数据库选择不同的环境变量配置 - if database == "vala_user": - # vala_user 数据库使用 online 配置 - host = os.getenv("MYSQL_HOST_online") - port = int(os.getenv("MYSQL_PORT_online", "3306")) - user = os.getenv("MYSQL_USERNAME_online") - password = os.getenv("MYSQL_PASSWORD_online") - if not host: - raise RuntimeError("MySQL数据库环境变量未配置完整(缺少MYSQL_HOST_online)") - else: - # vala_test 等其他数据库使用默认配置 - host = os.getenv("MYSQL_HOST") - port = int(os.getenv("MYSQL_PORT", "3306")) - user = os.getenv("MYSQL_USERNAME") - password = os.getenv("MYSQL_PASSWORD") - if not host: - raise RuntimeError("MySQL数据库环境变量未配置完整(缺少MYSQL_HOST)") - - conn = pymysql.connect( - host=host, - port=port, - user=user, - password=password, - database=database, # 直接使用传入的数据库名 - charset="utf8mb4", - cursorclass=pymysql.cursors.DictCursor, - ) - return conn - - -def get_id_2_unit_index(conn: Any) -> Dict[int, int]: - """ - 从MySQL获取 story_id 到 unit_id 的映射关系 - - Args: - conn: MySQL数据库连接 - - Returns: - 映射字典 {story_id: unit_id} - """ - sql = """ - SELECT * - FROM `vala_game_info` - WHERE id > 0 - AND `vala_game_info`.`deleted_at` IS NULL - ORDER BY season_package_id asc, `index` asc - """ - try: - with conn.cursor() as cur: - cur.execute(sql) - rows = cur.fetchall() or [] - # 构建映射表:按查询结果的顺序,索引即为unit_id - id_2_unit_index = {} - for index, row in enumerate(rows): - id_2_unit_index[row["id"]] = index - return id_2_unit_index - except Exception as e: - print(f"[ERROR] 获取story_id到unit_id映射失败: {e}") - return {} - - -def get_chapter_id_to_lesson_id(conn: Any) -> Dict[int, int]: - """ - 从MySQL获取 chapter_id 到 lesson_id 的映射关系 - - Args: - conn: MySQL数据库连接 - - Returns: - 映射字典 {chapter_id: lesson_id} - """ - sql = """ - SELECT id, `index` - FROM `vala_game_chapter` - WHERE deleted_at IS NULL - """ - try: - with conn.cursor() as cur: - cur.execute(sql) - rows = cur.fetchall() or [] - # 构建映射表:chapter的index字段即为lesson_id - chapter_id_to_lesson_id = {} - for row in rows: - chapter_id_to_lesson_id[row["id"]] = row["index"] - return chapter_id_to_lesson_id - except Exception as e: - print(f"[ERROR] 获取chapter_id到lesson_id映射失败: {e}") - return {} - - -# 组件类型到组件名称的映射 -COMPONENT_TYPE_NAMES = { - "mid_vocab_item": "物品互动", - "mid_vocab_image": "图片互动", - "mid_vocab_fillBlank": "填词互动", - "mid_vocab_instruction": "指令互动", - "mid_sentence_dialogue": "对话互动", # 需要根据mode进一步判断 - "mid_sentence_voice": "语音互动", - "mid_sentence_material": "材料互动", - "mid_sentence_makeSentence": "造句互动", - "mid_grammar_cloze": "挖空互动", - "mid_grammar_sentence": "组句互动", - "mid_pron_pron": "发音互动", - "core_speaking_reply": "口语快答", - "core_speaking_inquiry": "口语妙问", - "core_speaking_explore": "口语探讨", - "core_speaking_monologue": "口语独白", - "core_reading_order": "合作阅读", - "core_listening_order": "合作听力", - "core_writing_imgMakeSentence": "看图组句", - "core_writing_imgWrite": "看图撰写", - "core_writing_questionMakeSentence": "问题组句", - "core_writing_questionWrite": "问题撰写", -} - - -def get_component_name(c_type: str, component_config: Optional[Dict[str, Any]]) -> str: - """ - 根据c_type和组件配置获取组件名称 - - Args: - c_type: 组件类型 - component_config: 组件配置(用于判断对话互动的mode) - - Returns: - 组件名称 - """ - if not c_type: - return "" - - # 特殊处理:对话互动需要根据mode判断 - if c_type == "mid_sentence_dialogue" and component_config: - try: - question = component_config.get("question", {}) - mode = question.get("mode", "") - if mode == "express": - return "对话互动-表达" - elif mode == "read": - return "对话互动-朗读" - except Exception: - pass - - return COMPONENT_TYPE_NAMES.get(c_type, "") - - -def batch_fetch_component_configs(play_records: List[Dict[str, Any]], mysql_conn: Any) -> Dict[str, Dict[str, Any]]: - """ - 批量查询组件配置信息 - - Args: - play_records: 播放记录列表 - mysql_conn: MySQL连接 - - Returns: - 组件配置映射 {c_type_c_id: {title, component_config, kp_relation_info}} - """ - print(f" [MySQL] 开始批量查询组件配置...") - start_time = datetime.datetime.now() - - # 收集需要查询的c_type和c_id - mid_c_ids = set() - core_c_ids = set() - mid_type_id_pairs = [] # 用于调试日志 - core_type_id_pairs = [] - - for record in play_records: - c_type = record.get("c_type", "") - c_id = record.get("c_id") - if c_type and c_id: - if c_type.startswith("mid"): - mid_c_ids.add(c_id) - mid_type_id_pairs.append((c_type, c_id)) - elif c_type.startswith("core"): - core_c_ids.add(c_id) - core_type_id_pairs.append((c_type, c_id)) - - print(f" [MySQL] 需要查询中互动组件: {len(mid_c_ids)}个, 核心互动组件: {len(core_c_ids)}个") - if mid_c_ids: - print(f" [MySQL] 中互动组件ID列表(前10个): {sorted(list(mid_c_ids))[:10]}") - if core_c_ids: - print(f" [MySQL] 核心互动组件ID列表(前10个): {sorted(list(core_c_ids))[:10]}") - - config_map = {} - - # 批量查询middle_interaction_component - if mid_c_ids: - try: - with mysql_conn.cursor() as cur: - placeholders = ','.join(['%s'] * len(mid_c_ids)) - sql = f""" - SELECT c_id, c_type, title, component_config, kp_relation_info - FROM middle_interaction_component - WHERE c_id IN ({placeholders}) AND deleted_at IS NULL - """ - print(f" [MySQL] 执行中互动组件查询,查询条件: c_id IN ({len(mid_c_ids)}个ID)") - cur.execute(sql, tuple(mid_c_ids)) - rows = cur.fetchall() or [] - print(f" [MySQL] 查询到{len(rows)}条中互动组件配置") - - if len(rows) == 0 and len(mid_c_ids) > 0: - print(f" [MySQL] [警告] 查询结果为空!可能的原因:") - print(f" [MySQL] - 数据库中没有匹配的c_id记录") - print(f" [MySQL] - deleted_at字段不为NULL") - print(f" [MySQL] - c_id不存在") - - for idx, row in enumerate(rows): - c_type = row.get("c_type", "") - c_id = row.get("c_id") - key = f"{c_type}_{c_id}" - - if idx < 3: # 输出前3条的详细信息 - print(f" [MySQL] [样例{idx+1}] id={c_id}, c_type={c_type}, key={key}") - print(f" [MySQL] [样例{idx+1}] title={row.get('title', '')[:50]}") - - # 解析component_config - component_config = row.get("component_config") - if isinstance(component_config, str): - try: - component_config = json.loads(component_config) - except Exception as e: - print(f" [MySQL] [警告] 解析component_config失败 (id={c_id}): {e}") - component_config = {} - - # 提取question字段作为摘要 - summary = "" - if isinstance(component_config, dict): - question = component_config.get("question") - summary = to_json_str(question) if question else "" - if idx < 3 and question: - print(f" [MySQL] [样例{idx+1}] 提取到question字段,长度: {len(summary)}") - - # 解析kp_relation_info - kp_relation_info = row.get("kp_relation_info") - if isinstance(kp_relation_info, str): - try: - kp_relation_info = json.loads(kp_relation_info) - except Exception: - kp_relation_info = [] - - config_map[key] = { - "title": row.get("title", ""), - "component_config": component_config, - "summary": summary, - "kp_relation_info": to_json_str(kp_relation_info), - } - - print(f" [MySQL] 中互动组件配置已加入config_map,当前map大小: {len(config_map)}") - except Exception as e: - print(f" [MySQL] [错误] 查询中互动组件配置失败: {e}") - import traceback - traceback.print_exc() - - # 批量查询core_interaction_component - if core_c_ids: - try: - with mysql_conn.cursor() as cur: - placeholders = ','.join(['%s'] * len(core_c_ids)) - sql = f""" - SELECT c_id, c_type, title, component_config, kp_relation_info - FROM core_interaction_component - WHERE c_id IN ({placeholders}) AND deleted_at IS NULL - """ - print(f" [MySQL] 执行核心互动组件查询,查询条件: c_id IN ({len(core_c_ids)}个ID)") - cur.execute(sql, tuple(core_c_ids)) - rows = cur.fetchall() or [] - print(f" [MySQL] 查询到{len(rows)}条核心互动组件配置") - - if len(rows) == 0 and len(core_c_ids) > 0: - print(f" [MySQL] [警告] 查询结果为空!可能的原因:") - print(f" [MySQL] - 数据库中没有匹配的c_id记录") - print(f" [MySQL] - deleted_at字段不为NULL") - print(f" [MySQL] - c_id不存在") - - for idx, row in enumerate(rows): - c_type = row.get("c_type", "") - c_id = row.get("c_id") - key = f"{c_type}_{c_id}" - - if idx < 3: # 输出前3条的详细信息 - print(f" [MySQL] [样例{idx+1}] id={c_id}, c_type={c_type}, key={key}") - print(f" [MySQL] [样例{idx+1}] title={row.get('title', '')[:50]}") - - # 解析component_config - component_config = row.get("component_config") - if isinstance(component_config, str): - try: - component_config = json.loads(component_config) - except Exception as e: - print(f" [MySQL] [警告] 解析component_config失败 (id={c_id}): {e}") - component_config = {} - - # 提取taskInfo字段作为摘要 - summary = "" - if isinstance(component_config, dict): - task_info = component_config.get("taskInfo") - summary = to_json_str(task_info) if task_info else "" - if idx < 3 and task_info: - print(f" [MySQL] [样例{idx+1}] 提取到taskInfo字段,长度: {len(summary)}") - - # 解析kp_relation_info - kp_relation_info = row.get("kp_relation_info") - if isinstance(kp_relation_info, str): - try: - kp_relation_info = json.loads(kp_relation_info) - except Exception: - kp_relation_info = [] - - config_map[key] = { - "title": row.get("title", ""), - "component_config": component_config, - "summary": summary, - "kp_relation_info": to_json_str(kp_relation_info), - } - - print(f" [MySQL] 核心互动组件配置已加入config_map,当前map大小: {len(config_map)}") - except Exception as e: - print(f" [MySQL] [错误] 查询核心互动组件配置失败: {e}") - import traceback - traceback.print_exc() - - print(f" [MySQL] 组件配置查询完成,共{len(config_map)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return config_map - - -def calculate_accuracy(question_list: Any) -> float: - """ - 计算问题列表的正确率 - - Args: - question_list: 问题列表(可能是JSON字符串或list) - - Returns: - 正确率(百分比,保留2位小数) - """ - try: - if isinstance(question_list, str): - question_list = json.loads(question_list) - - if not isinstance(question_list, list) or len(question_list) == 0: - return 0.0 - - total = len(question_list) - correct = sum(1 for q in question_list if q.get('isRight') == True) - accuracy = round(correct / total * 100, 2) if total > 0 else 0.0 - - return accuracy - except Exception: - return 0.0 - - - -def fetch_character_ids_by_account(account_id: str, conn: Any) -> List[str]: - """根据账户id查询对应的角色id列表""" - sql = "SELECT id FROM vala_app_character WHERE account_id = %s" - try: - with conn.cursor() as cur: - cur.execute(sql, (account_id,)) - rows = cur.fetchall() or [] - return [str(row["id"]) for row in rows if row.get("id")] - except Exception as e: - print(f"[ERROR] 查询账户id={account_id}的角色id失败: {e}") - return [] - - -def fetch_pg_play_records(user_id: str, conn: Any, mysql_conn: Any) -> List[Dict[str, Any]]: - """ - 查询互动组件学习记录并补充组件配置信息 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - mysql_conn: MySQL数据库连接 - - Returns: - 互动组件学习记录列表 - """ - print(f" [PG] 开始查询互动组件学习记录(8张分表)...") - start_time = datetime.datetime.now() - - tables = [f"user_component_play_record_{i}" for i in range(8)] - rows: List[Dict[str, Any]] = [] - with conn.cursor(cursor_factory=RealDictCursor) as cur: - for t in tables: - try: - cur.execute( - f""" - SELECT user_id, component_unique_code, session_id, c_type, c_id, - play_result, user_behavior_info, updated_at - FROM {t} - WHERE user_id = %s - ORDER BY updated_at DESC - """, - (user_id,), - ) - part = cur.fetchall() or [] - if part: - print(f" [PG] 表{t}查到{len(part)}条记录") - for r in part: - r = dict(r) - r["play_result"] = to_json_str(r.get("play_result")) - r["user_behavior_info"] = to_json_str(r.get("user_behavior_info")) - # 将带时区的时间转换为无时区,避免Excel写入报错 - upd = r.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - r["updated_at"] = upd.replace(tzinfo=None) - except Exception: - # 回退为字符串 - r["updated_at"] = str(upd) - rows.append(r) - except Exception as e: - print(f" [PG] 表{t}查询失败: {e}") - continue - - rows.sort(key=lambda x: parse_time(x.get("updated_at")) or datetime.datetime.min, reverse=True) - print(f" [PG] 互动组件学习记录查询完成,共{len(rows)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - # 批量查询组件配置 - if rows and mysql_conn: - config_map = batch_fetch_component_configs(rows, mysql_conn) - - # 补充组件信息 - print(f" [PG] 开始补充组件配置信息...") - filled_count = 0 - empty_count = 0 - sample_keys = [] - sample_mode_check = [] # 检查对话互动的mode - - for r in rows: - c_type = r.get("c_type", "") - c_id = r.get("c_id") - key = f"{c_type}_{c_id}" if c_type and c_id else "" - - config = config_map.get(key, {}) - component_config = config.get("component_config", {}) - - component_name = get_component_name(c_type, component_config) - r["互动组件名称"] = component_name - r["组件标题"] = config.get("title", "") - r["组件配置摘要"] = config.get("summary", "") - r["知识点"] = config.get("kp_relation_info", "") - - # 统计填充情况 - if config: - filled_count += 1 - if len(sample_keys) < 3: - sample_keys.append((key, component_name, r["组件标题"][:30] if r["组件标题"] else "")) - - # 检查对话互动的mode - if c_type == "mid_sentence_dialogue" and len(sample_mode_check) < 3: - mode = "" - if isinstance(component_config, dict): - question = component_config.get("question", {}) - if isinstance(question, dict): - mode = question.get("mode", "") - sample_mode_check.append({ - "key": key, - "mode": mode, - "component_name": component_name - }) - else: - empty_count += 1 - if empty_count <= 5: # 输出前5个未匹配的key - print(f" [PG] [警告] 未找到组件配置: key={key}") - - print(f" [PG] 组件配置信息补充完成") - print(f" [PG] 匹配到配置: {filled_count}条, 未匹配: {empty_count}条") - if sample_keys: - print(f" [PG] 样例数据(前3条):") - for key, name, title in sample_keys: - print(f" [PG] - key={key}, 名称={name}, 标题={title}") - - if sample_mode_check: - print(f" [PG] 对话互动mode检查(前3条):") - for s in sample_mode_check: - print(f" [PG] - key={s['key']}, mode={s['mode']}, 最终名称={s['component_name']}") - - return rows - - -def fetch_pg_unit_review(user_id: str, conn: Any, id_2_unit_index: Dict[int, int], chapter_id_to_lesson_id: Dict[int, int]) -> List[Dict[str, Any]]: - """ - 查询课程巩固记录 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - id_2_unit_index: story_id到unit_id的映射字典 - chapter_id_to_lesson_id: chapter_id到lesson_id的映射字典 - - Returns: - 课程巩固记录列表 - """ - print(f" [PG] 开始查询课程巩固记录...") - start_time = datetime.datetime.now() - - sql = ( - "SELECT user_id, story_id, chapter_id, question_list, updated_at " - "FROM user_unit_review_question_result WHERE user_id = %s ORDER BY updated_at DESC" - ) - with conn.cursor(cursor_factory=RealDictCursor) as cur: - try: - cur.execute(sql, (user_id,)) - rows = cur.fetchall() or [] - except Exception as e: - print(f" [PG] 课程巩固记录查询失败: {e}") - rows = [] - out: List[Dict[str, Any]] = [] - for r in rows: - d = dict(r) - - # 映射 story_id 到 unit_id - story_id = d.get("story_id") - unit_id = id_2_unit_index.get(story_id) if story_id else None - d["unit_id"] = unit_id - - # 映射 chapter_id 到 lesson_id - chapter_id = d.get("chapter_id") - lesson_id = chapter_id_to_lesson_id.get(chapter_id) if chapter_id else None - d["lesson_id"] = lesson_id - - # 计算正确率 - question_list = d.get("question_list") - d["正确率"] = calculate_accuracy(question_list) - - d["question_list"] = to_json_str(question_list) - upd = d.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - d["updated_at"] = upd.replace(tzinfo=None) - except Exception: - d["updated_at"] = str(upd) - out.append(d) - - print(f" [PG] 课程巩固记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return out - - -def fetch_pg_unit_challenge(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]: - """ - 查询单元挑战记录 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - id_2_unit_index: story_id到unit_id的映射字典 - - Returns: - 单元挑战记录列表 - """ - print(f" [PG] 开始查询单元挑战记录...") - start_time = datetime.datetime.now() - - sql = ( - "SELECT user_id, story_id, category, score_text, question_list, updated_at " - "FROM user_unit_challenge_question_result WHERE user_id = %s ORDER BY updated_at DESC" - ) - with conn.cursor(cursor_factory=RealDictCursor) as cur: - try: - cur.execute(sql, (user_id,)) - rows = cur.fetchall() or [] - except Exception as e: - print(f" [PG] 单元挑战记录查询失败: {e}") - rows = [] - out: List[Dict[str, Any]] = [] - for r in rows: - d = dict(r) - - # 映射 story_id 到 unit_id - story_id = d.get("story_id") - unit_id = id_2_unit_index.get(story_id) if story_id else None - d["unit_id"] = unit_id - - d["question_list"] = to_json_str(d.get("question_list")) - upd = d.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - d["updated_at"] = upd.replace(tzinfo=None) - except Exception: - d["updated_at"] = str(upd) - out.append(d) - - print(f" [PG] 单元挑战记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return out - - -def fetch_pg_unit_summary(user_id: str, conn: Any, id_2_unit_index: Dict[int, int]) -> List[Dict[str, Any]]: - """ - 查询单元总结知识点结果数据 - - Args: - user_id: 用户ID(角色ID) - conn: PostgreSQL数据库连接 - id_2_unit_index: story_id到unit_id的映射字典 - - Returns: - 单元总结记录列表 - """ - print(f" [PG] 开始查询单元总结记录...") - start_time = datetime.datetime.now() - - sql = ( - "SELECT id, user_id, story_id, updated_at, km_id, km_type, play_time " - "FROM user_unit_summary_km_result WHERE user_id = %s AND deleted_at IS NULL ORDER BY updated_at DESC" - ) - with conn.cursor(cursor_factory=RealDictCursor) as cur: - try: - cur.execute(sql, (user_id,)) - rows = cur.fetchall() or [] - except Exception as e: - print(f" [PG] 单元总结记录查询失败: {e}") - rows = [] - - out: List[Dict[str, Any]] = [] - for r in rows: - d = dict(r) - # 映射 story_id 到 unit_id - story_id = d.get("story_id") - unit_id = id_2_unit_index.get(story_id) if story_id else None - d["unit_id"] = unit_id - - # 转换 play_time (毫秒) 为秒 (整数) - play_time = d.get("play_time") - d["play_time_seconds"] = play_time // 1000 if play_time else 0 - - # 移除时区信息 - upd = d.get("updated_at") - if isinstance(upd, datetime.datetime): - try: - if upd.tzinfo is not None and upd.tzinfo.utcoffset(upd) is not None: - d["updated_at"] = upd.replace(tzinfo=None) - except Exception: - d["updated_at"] = str(upd) - out.append(d) - - print(f" [PG] 单元总结记录查询完成,共{len(out)}条,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - return out - - -def generate_statistics(sheet2_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]]) -> tuple: - """ - 生成汇总统计数据 - - Args: - sheet2_rows: 互动组件学习记录 - sheet5_rows: 单元总结记录 - - Returns: - (组件统计DataFrame, 知识点统计DataFrame, 单元时长统计DataFrame) - """ - if pd is None: - raise RuntimeError("缺少pandas依赖,请安装后再运行。") - - print(f" [统计] 开始生成汇总统计数据...") - start_time = datetime.datetime.now() - - from collections import defaultdict - - # ============ a. 所有互动-按互动组件类型-通过情况统计 ============ - component_stats_data = [] - component_stats = defaultdict(lambda: {"Perfect": 0, "Good": 0, "Failed": 0, "Pass": 0, "Oops": 0, "total": 0}) - - # 用于调试 - sample_results = [] - parse_error_count = 0 - - for idx, record in enumerate(sheet2_rows): - component_name = record.get("互动组件名称", "") - if not component_name: - continue - - play_result_str = record.get("play_result", "") - - # 解析play_result - result = "" - try: - # 先判断是否是简单的字符串(Perfect/Good/Failed/Pass/Oops) - if isinstance(play_result_str, str): - # 去除空格后检查 - stripped = play_result_str.strip() - if stripped in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - # 直接使用 - result = stripped - else: - # 尝试JSON解析 - try: - play_result = json.loads(play_result_str) - if isinstance(play_result, dict): - result = play_result.get("result", "") - else: - result = "" - except: - result = "" - else: - # 如果不是字符串,尝试当dict处理 - if isinstance(play_result_str, dict): - result = play_result_str.get("result", "") - else: - result = "" - - # 收集前3个样例 - if idx < 3: - sample_results.append({ - "component": component_name, - "raw": str(play_result_str)[:100], - "result": result - }) - except Exception as e: - parse_error_count += 1 - if parse_error_count <= 3: - print(f" [统计] [警告] 解析play_result失败 (第{idx+1}条): {e}, 原始值: {str(play_result_str)[:100]}") - result = "" - - component_stats[component_name]["total"] += 1 - if result in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - component_stats[component_name][result] += 1 - - print(f" [统计] play_result解析样例(前3条):") - for s in sample_results: - print(f" [统计] - 组件: {s['component']}, 结果: {s['result']}, 原始: {s['raw']}") - if parse_error_count > 0: - print(f" [统计] play_result解析失败总数: {parse_error_count}") - - # 生成统计数据行 - for component_name in sorted(component_stats.keys()): - stats = component_stats[component_name] - total = stats["total"] - perfect = stats["Perfect"] - good = stats["Good"] - failed = stats["Failed"] - pass_count = stats["Pass"] - oops = stats["Oops"] - - perfect_ratio = round(perfect / total * 100, 2) if total > 0 else 0 - good_ratio = round(good / total * 100, 2) if total > 0 else 0 - failed_ratio = round(failed / total * 100, 2) if total > 0 else 0 - pass_ratio = round(pass_count / total * 100, 2) if total > 0 else 0 - oops_ratio = round(oops / total * 100, 2) if total > 0 else 0 - - component_stats_data.append({ - "互动组件名称": component_name, - "总数量": total, - "Perfect数量": perfect, - "Good数量": good, - "Failed数量": failed, - "Pass数量": pass_count, - "Oops数量": oops, - "Perfect比例(%)": perfect_ratio, - "Good比例(%)": good_ratio, - "Failed比例(%)": failed_ratio, - "Pass比例(%)": pass_ratio, - "Oops比例(%)": oops_ratio, - }) - - # ============ b. 中互动组件-按知识点-通过情况统计 ============ - kp_stats_data = [] - kp_stats = defaultdict(lambda: {"Perfect": 0, "Good": 0, "Failed": 0, "Pass": 0, "Oops": 0, "total": 0}) - - # 调试信息 - mid_count = 0 - has_kp_count = 0 - sample_kp_records = [] - - for idx, record in enumerate(sheet2_rows): - c_type = record.get("c_type", "") - if not c_type or not c_type.startswith("mid"): - continue - - mid_count += 1 - kp_relation_info_str = record.get("知识点", "") - - if not kp_relation_info_str: - continue - - has_kp_count += 1 - - # 解析知识点 - try: - if isinstance(kp_relation_info_str, str): - kp_relation_info = json.loads(kp_relation_info_str) - else: - kp_relation_info = kp_relation_info_str - - if not isinstance(kp_relation_info, list): - continue - - # 收集样例 - if len(sample_kp_records) < 3: - sample_kp_records.append({ - "c_type": c_type, - "kp_count": len(kp_relation_info), - "kp_info": str(kp_relation_info)[:200] - }) - - # 解析play_result(使用相同的逻辑) - play_result_str = record.get("play_result", "") - result = "" - if isinstance(play_result_str, str): - stripped = play_result_str.strip() - if stripped in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - result = stripped - else: - try: - play_result = json.loads(play_result_str) - if isinstance(play_result, dict): - result = play_result.get("result", "") - except: - pass - elif isinstance(play_result_str, dict): - result = play_result_str.get("result", "") - - # 为每个知识点统计 - for kp in kp_relation_info: - if not isinstance(kp, dict): - continue - - kp_id = kp.get("kpId", "") - kp_type = kp.get("kpType", "") - kp_title = kp.get("kpTitle", "") - - if not kp_id: - continue - - kp_key = f"{kp_id}|{kp_type}|{kp_title}" - kp_stats[kp_key]["total"] += 1 - if result in ["Perfect", "Good", "Failed", "Pass", "Oops"]: - kp_stats[kp_key][result] += 1 - - except Exception as e: - if len(sample_kp_records) < 5: - print(f" [统计] [警告] 解析知识点失败: {e}, 原始值: {str(kp_relation_info_str)[:100]}") - continue - - print(f" [统计] 中互动组件统计: 总数={mid_count}, 有知识点={has_kp_count}, 知识点条目数={len(kp_stats)}") - if sample_kp_records: - print(f" [统计] 知识点样例(前3条):") - for s in sample_kp_records: - print(f" [统计] - c_type={s['c_type']}, 知识点数量={s['kp_count']}, 内容={s['kp_info']}") - - # 生成知识点统计数据行 - for kp_key in sorted(kp_stats.keys()): - parts = kp_key.split("|") - if len(parts) != 3: - continue - - kp_id, kp_type, kp_title = parts - stats = kp_stats[kp_key] - total = stats["total"] - perfect = stats["Perfect"] - good = stats["Good"] - failed = stats["Failed"] - pass_count = stats["Pass"] - oops = stats["Oops"] - - perfect_ratio = round(perfect / total * 100, 2) if total > 0 else 0 - good_ratio = round(good / total * 100, 2) if total > 0 else 0 - failed_ratio = round(failed / total * 100, 2) if total > 0 else 0 - pass_ratio = round(pass_count / total * 100, 2) if total > 0 else 0 - oops_ratio = round(oops / total * 100, 2) if total > 0 else 0 - - kp_stats_data.append({ - "知识点ID": kp_id, - "知识点类型": kp_type, - "知识点标题": kp_title, - "总数量": total, - "Perfect数量": perfect, - "Good数量": good, - "Failed数量": failed, - "Pass数量": pass_count, - "Oops数量": oops, - "Perfect比例(%)": perfect_ratio, - "Good比例(%)": good_ratio, - "Failed比例(%)": failed_ratio, - "Pass比例(%)": pass_ratio, - "Oops比例(%)": oops_ratio, - }) - - # ============ c. 单元总结-按单元统计时长 ============ - unit_time_stats_data = [] - unit_time_stats = defaultdict(int) - - for record in sheet5_rows: - unit_id = record.get("unit_id") - play_time_seconds = record.get("play_time_seconds", 0) - - if unit_id is not None: - unit_time_stats[unit_id] += play_time_seconds - - # 生成单元时长统计数据行 - for unit_id in sorted(unit_time_stats.keys()): - total_seconds = unit_time_stats[unit_id] - total_minutes = int(total_seconds / 60) - - unit_time_stats_data.append({ - "单元ID": f"unit_{unit_id}", - "总时长(秒)": total_seconds, - "总时长(分钟)": total_minutes, - }) - - print(f" [统计] 汇总统计数据生成完成,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - print(f" [统计] 生成了{len(component_stats_data)}条组件统计, {len(kp_stats_data)}条知识点统计, {len(unit_time_stats_data)}条单元时长统计") - - return ( - pd.DataFrame(component_stats_data), - pd.DataFrame(kp_stats_data), - pd.DataFrame(unit_time_stats_data) - ) - - - -def write_excel(path: str, sheet1_rows: List[Dict[str, Any]], sheet2_rows: List[Dict[str, Any]], sheet3_rows: List[Dict[str, Any]], sheet4_rows: List[Dict[str, Any]], sheet5_rows: List[Dict[str, Any]], stats_component_df: Any, stats_kp_df: Any, stats_unit_time_df: Any) -> None: - if pd is None: - raise RuntimeError("缺少pandas依赖,请安装后再运行。") - - print(f" [Excel] 开始写入Excel文件: {path}") - start_time = datetime.datetime.now() - - out_dir = os.path.dirname(path) or "." - os.makedirs(out_dir, exist_ok=True) - with pd.ExcelWriter(path, engine="openpyxl") as writer: - pd.DataFrame(sheet1_rows, columns=SHEET1_COLUMNS).to_excel(writer, sheet_name="全部音频数据", index=False) - pd.DataFrame(sheet2_rows, columns=SHEET2_COLUMNS).to_excel(writer, sheet_name="互动组件学习记录", index=False) - pd.DataFrame(sheet3_rows, columns=SHEET3_COLUMNS).to_excel(writer, sheet_name="课程巩固记录", index=False) - pd.DataFrame(sheet4_rows, columns=SHEET4_COLUMNS).to_excel(writer, sheet_name="单元挑战记录", index=False) - pd.DataFrame(sheet5_rows, columns=SHEET5_COLUMNS).to_excel(writer, sheet_name="单元总结记录", index=False) - stats_component_df.to_excel(writer, sheet_name="统计-互动组件通过情况", index=False) - stats_kp_df.to_excel(writer, sheet_name="统计-知识点通过情况", index=False) - stats_unit_time_df.to_excel(writer, sheet_name="统计-单元总结时长", index=False) - - print(f" [Excel] 写入完成,耗时{(datetime.datetime.now() - start_time).total_seconds():.2f}秒") - - -def get_date_str() -> str: - """获取当前日期字符串 格式:YYYYMMDD""" - return datetime.datetime.now().strftime("%Y%m%d") - - -def export_single_user(user_id: str, es_cfg: Dict[str, Any], pg_conn: Any, mysql_conn: Any, output_path: str, id_2_unit_index: Dict[int, int], chapter_id_to_lesson_id: Dict[int, int]) -> bool: - """ - 导出单个角色id的数据 - - Args: - user_id: 角色ID - es_cfg: ES配置 - pg_conn: PostgreSQL连接 - mysql_conn: MySQL连接 - output_path: 输出路径 - id_2_unit_index: story_id到unit_id的映射字典 - chapter_id_to_lesson_id: chapter_id到lesson_id的映射字典 - - Returns: - True表示成功,False表示失败 - """ - try: - print(f"\n[INFO] ========== 开始导出角色id={user_id} ==========") - total_start_time = datetime.datetime.now() - - # 查询ES数据 - sheet1_rows = fetch_es_user_audio(user_id, es_cfg) - - # 查询PG数据 - sheet2_rows = fetch_pg_play_records(user_id, pg_conn, mysql_conn) - sheet3_rows = fetch_pg_unit_review(user_id, pg_conn, id_2_unit_index, chapter_id_to_lesson_id) - sheet4_rows = fetch_pg_unit_challenge(user_id, pg_conn, id_2_unit_index) - sheet5_rows = fetch_pg_unit_summary(user_id, pg_conn, id_2_unit_index) - - # 检查是否有有效数据 - total_records = len(sheet1_rows) + len(sheet2_rows) + len(sheet3_rows) + len(sheet4_rows) + len(sheet5_rows) - print(f" [统计] 数据汇总:") - print(f" - 全部音频数据: {len(sheet1_rows)}条") - print(f" - 互动组件学习记录: {len(sheet2_rows)}条") - print(f" - 课程巩固记录: {len(sheet3_rows)}条") - print(f" - 单元挑战记录: {len(sheet4_rows)}条") - print(f" - 单元总结记录: {len(sheet5_rows)}条") - print(f" - 总计: {total_records}条") - - if total_records == 0: - print(f"[WARN] 角色id={user_id} 没有找到任何有效记录,跳过导出") - return False - - # 生成汇总统计数据 - stats_component_df, stats_kp_df, stats_unit_time_df = generate_statistics(sheet2_rows, sheet5_rows) - - # 写入Excel - write_excel(output_path, sheet1_rows, sheet2_rows, sheet3_rows, sheet4_rows, sheet5_rows, stats_component_df, stats_kp_df, stats_unit_time_df) - - total_time = (datetime.datetime.now() - total_start_time).total_seconds() - print(f"[INFO] 角色id={user_id} 导出成功") - print(f"[INFO] 文件路径: {output_path}") - print(f"[INFO] 总耗时: {total_time:.2f}秒") - print(f"[INFO] ========== 完成 ==========\n") - return True - - except Exception as e: - print(f"[ERROR] 角色id={user_id} 导出失败: {e}") - import traceback - traceback.print_exc() - return False - - -def main(): - load_env() - - # 确定运行模式并收集需要导出的角色id列表 - user_id_list: List[tuple] = [] # [(user_id, account_id or None), ...] - date_str = get_date_str() - - # 检查三种模式的配置 - has_user_id = USER_ID is not None - has_user_id_list = USER_ID_LIST is not None and len(USER_ID_LIST) > 0 - has_account_id_list = ACCOUNT_ID_LIST is not None and len(ACCOUNT_ID_LIST) > 0 - - # 验证只能配置一种模式 - mode_count = sum([has_user_id, has_user_id_list, has_account_id_list]) - if mode_count == 0: - raise RuntimeError("请配置 USER_ID、USER_ID_LIST 或 ACCOUNT_ID_LIST 中的一个") - if mode_count > 1: - raise RuntimeError("USER_ID、USER_ID_LIST、ACCOUNT_ID_LIST 只能配置一个,请检查配置") - - # 模式1:单个角色id - if has_user_id: - user_id_list = [(str(USER_ID), None)] - print(f"[INFO] 运行模式:单个角色id") - - # 模式2:角色id列表 - elif has_user_id_list: - user_id_list = [(str(uid), None) for uid in USER_ID_LIST] - print(f"[INFO] 运行模式:角色id列表,共{len(user_id_list)}个角色") - - # 模式3:账户id列表 - elif has_account_id_list: - print(f"[INFO] 运行模式:账户id列表,共{len(ACCOUNT_ID_LIST)}个账户") - mysql_conn = None - try: - mysql_conn = get_mysql_conn("vala_user") # 查询用户表,使用 vala_user 数据库 - for account_id in ACCOUNT_ID_LIST: - account_id_str = str(account_id) - print(f"[INFO] 查询账户id={account_id_str}对应的角色id...") - character_ids = fetch_character_ids_by_account(account_id_str, mysql_conn) - if not character_ids: - print(f"[WARN] 账户id={account_id_str} 未找到关联的角色id,跳过") - continue - print(f"[INFO] 账户id={account_id_str} 找到{len(character_ids)}个角色id: {character_ids}") - for cid in character_ids: - user_id_list.append((cid, account_id_str)) - finally: - if mysql_conn: - try: - mysql_conn.close() - except Exception: - pass - - if not user_id_list: - print("[WARN] 没有需要导出的角色id,程序退出") - return - - # 初始化连接 - es_cfg = get_es_config() - pg_conn = get_pg_conn() - - # 获取映射表(只需要查询一次,所有角色共用) - print(f"\n[INFO] ===== 准备工作:获取映射表 =====") - mysql_conn = None - id_2_unit_index = {} - chapter_id_to_lesson_id = {} - try: - print(f"[INFO] 正在连接MySQL数据库(vala_test)...") - mysql_conn = get_mysql_conn("vala_test") # 查询游戏配置表,使用 vala_test 数据库 - print(f"[INFO] 正在获取 story_id 到 unit_id 的映射...") - id_2_unit_index = get_id_2_unit_index(mysql_conn) - print(f"[INFO] 成功获取 {len(id_2_unit_index)} 个 story_id 映射") - print(f"[INFO] 正在获取 chapter_id 到 lesson_id 的映射...") - chapter_id_to_lesson_id = get_chapter_id_to_lesson_id(mysql_conn) - print(f"[INFO] 成功获取 {len(chapter_id_to_lesson_id)} 个 chapter_id 映射") - except Exception as e: - print(f"[ERROR] 获取映射表失败: {e}") - import traceback - traceback.print_exc() - if pg_conn: - try: - pg_conn.close() - except Exception: - pass - if mysql_conn: - try: - mysql_conn.close() - except Exception: - pass - return - - try: - # 统计信息 - success_count = 0 - skip_count = 0 - - print(f"\n[INFO] ===== 开始批量导出 =====") - print(f"[INFO] 共需导出{len(user_id_list)}个角色\n") - batch_start_time = datetime.datetime.now() - - # 循环处理每个角色id - for idx, (user_id, account_id) in enumerate(user_id_list, 1): - print(f"\n{'='*60}") - print(f"[INFO] 进度: {idx}/{len(user_id_list)} ({idx*100//len(user_id_list)}%)") - print(f"{'='*60}") - - # 生成输出文件名 - if account_id is None: - # 模式1和模式2:角色id_{}_导出时间_{}.xlsx - filename = f"角色id_{user_id}_导出时间_{date_str}.xlsx" - else: - # 模式3:账户id_{}_角色id_{}_导出时间_{}.xlsx - filename = f"账户id_{account_id}_角色id_{user_id}_导出时间_{date_str}.xlsx" - - output_path = os.path.join(OUTPUT_DIR, filename) - - # 导出单个角色的数据 - result = export_single_user(user_id, es_cfg, pg_conn, mysql_conn, output_path, id_2_unit_index, chapter_id_to_lesson_id) - if result: - success_count += 1 - else: - skip_count += 1 - - # 输出统计信息 - batch_total_time = (datetime.datetime.now() - batch_start_time).total_seconds() - print(f"\n{'='*60}") - print(f"[INFO] ===== 全部导出完成 =====") - print(f"[INFO] 总计: {len(user_id_list)}个角色") - print(f"[INFO] 成功: {success_count}个") - print(f"[INFO] 跳过: {skip_count}个") - print(f"[INFO] 总耗时: {batch_total_time:.2f}秒 ({batch_total_time/60:.2f}分钟)") - if success_count > 0: - print(f"[INFO] 平均每个角色: {batch_total_time/success_count:.2f}秒") - print(f"{'='*60}\n") - - finally: - if pg_conn: - try: - pg_conn.close() - except Exception: - pass - if mysql_conn: - try: - mysql_conn.close() - except Exception: - pass - - -if __name__ == "__main__": - main() diff --git a/new_export/llm_offline_production b/new_export/llm_offline_production deleted file mode 160000 index 75ab13e..0000000 --- a/new_export/llm_offline_production +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 75ab13e87dd0e856cb05c9515efcd507888b6486 diff --git a/new_export/test_db_connections.py b/new_export/test_db_connections.py deleted file mode 100644 index 77545eb..0000000 --- a/new_export/test_db_connections.py +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/env python3 -"""测试各个数据库连接和查询""" - -import os -import json -import psycopg2 -import pymysql -import requests -from requests.auth import HTTPBasicAuth -import warnings -warnings.filterwarnings('ignore') - -def test_postgresql(): - """测试PostgreSQL连接""" - print("\n" + "="*60) - print("测试 PostgreSQL(Online)连接") - print("="*60) - - try: - conn = psycopg2.connect( - host="bj-postgres-16pob4sg.sql.tencentcdb.com", - port=28591, - user="ai_member", - password="LdfjdjL83h3h3^$&**YGG*", - dbname="vala", - connect_timeout=10 - ) - print("✅ PostgreSQL 连接成功!") - - # 测试查询 - with conn.cursor() as cur: - # 先查询所有表 - cur.execute("SELECT tablename FROM pg_tables WHERE schemaname = 'public' LIMIT 5") - tables = cur.fetchall() - print(f"✅ 查询成功!找到前5个表:{[t[0] for t in tables]}") - - # 尝试查询其中一个表的1条数据 - if tables: - table = tables[0][0] - cur.execute(f"SELECT * FROM {table} LIMIT 1") - row = cur.fetchone() - print(f"✅ 从表 {table} 读取到1条数据:{row if row else '空表'}") - - conn.close() - return True - - except Exception as e: - print(f"❌ PostgreSQL 连接/查询失败:{str(e)[:200]}") - return False - -def test_mysql_test(): - """测试Test MySQL连接""" - print("\n" + "="*60) - print("测试 MySQL(Test环境)连接") - print("="*60) - - try: - conn = pymysql.connect( - host="bj-cdb-8frbdwju.sql.tencentcdb.com", - port=25413, - user="read_only", - password="fdsfiidier^$*hjfdijjd232", - connect_timeout=10 - ) - print("✅ MySQL(Test)连接成功!") - - # 测试查询 - with conn.cursor() as cur: - cur.execute("SHOW DATABASES LIMIT 5") - dbs = cur.fetchall() - print(f"✅ 查询成功!找到前5个数据库:{[db[0] for db in dbs]}") - - if dbs: - db = dbs[0][0] - cur.execute(f"USE {db}") - cur.execute("SHOW TABLES LIMIT 1") - table = cur.fetchone() - if table: - cur.execute(f"SELECT * FROM {table[0]} LIMIT 1") - row = cur.fetchone() - print(f"✅ 从表 {table[0]} 读取到1条数据:{row if row else '空表'}") - - conn.close() - return True - - except Exception as e: - print(f"❌ MySQL(Test)连接/查询失败:{str(e)[:200]}") - return False - -def test_mysql_online(): - """测试Online MySQL连接""" - print("\n" + "="*60) - print("测试 MySQL(Online)连接") - print("="*60) - - try: - conn = pymysql.connect( - host="bj-cdb-dh2fkqa0.sql.tencentcdb.com", - port=27751, - user="read_only", - password="fsdo45ijfmfmuu77$%^&", - connect_timeout=10 - ) - print("✅ MySQL(Online)连接成功!") - - # 测试查询 - with conn.cursor() as cur: - cur.execute("SHOW DATABASES LIMIT 5") - dbs = cur.fetchall() - print(f"✅ 查询成功!找到前5个数据库:{[db[0] for db in dbs]}") - - conn.close() - return True - - except Exception as e: - print(f"❌ MySQL(Online)连接/查询失败:{str(e)[:200]}") - return False - -def test_es_online(): - """测试Online ES连接""" - print("\n" + "="*60) - print("测试 Elasticsearch(Online)连接") - print("="*60) - - try: - url = "https://es-7vd7jcu9.public.tencentelasticsearch.com:9200" - auth = HTTPBasicAuth("elastic", "F%?QDcWes7N2WTuiYD11") - - response = requests.get( - url, - auth=auth, - verify=False, - timeout=10 - ) - - if response.status_code == 200: - info = response.json() - print(f"✅ ES 连接成功!集群名称:{info.get('cluster_name')}") - - # 测试查询索引 - indices_resp = requests.get( - f"{url}/_cat/indices?format=json", - auth=auth, - verify=False, - timeout=10 - ) - if indices_resp.status_code == 200: - indices = indices_resp.json() - print(f"✅ 查询成功!索引数量:{len(indices)}") - if indices: - print(f" 前3个索引:{[idx['index'] for idx in indices[:3]]}") - - return True - else: - print(f"❌ ES 连接失败:HTTP {response.status_code}") - return False - - except Exception as e: - print(f"❌ ES 连接/查询失败:{str(e)[:200]}") - return False - -if __name__ == "__main__": - print("开始测试所有数据库连接...") - - results = {} - results["PostgreSQL(Online)"] = test_postgresql() - results["MySQL(Test)"] = test_mysql_test() - results["MySQL(Online)"] = test_mysql_online() - results["ES(Online)"] = test_es_online() - - print("\n" + "="*60) - print("测试总结") - print("="*60) - for name, result in results.items(): - status = "✅ 正常" if result else "❌ 异常" - print(f"{name}: {status}") diff --git a/passwords.example.txt b/passwords.example.txt deleted file mode 100644 index 80a0c70..0000000 --- a/passwords.example.txt +++ /dev/null @@ -1,12 +0,0 @@ -# 敏感密钥配置示例文件,真实密钥请放在 passwords.txt 中(已加入 .gitignore 不会提交到Git) -# MySQL -MYSQL_ONLINE_PASSWORD=***线上MySQL密码*** -MYSQL_TEST_PASSWORD=***测试MySQL密码*** - -# PostgreSQL -PG_ONLINE_PASSWORD=***线上PostgreSQL密码*** -PG_TEST_PASSWORD=***测试PostgreSQL密码*** - -# Elasticsearch -ES_TEST_PASSWORD=***测试ES密码*** -ES_ONLINE_PASSWORD=***线上ES密码*** diff --git a/scripts/generate_report.py b/scripts/generate_report.py index a2d152b..0b41638 100644 --- a/scripts/generate_report.py +++ b/scripts/generate_report.py @@ -1,99 +1,152 @@ import pandas as pd -import psycopg2 +from datetime import datetime -# 1. 读取最新的带成交标记的订单数据 -order_df = pd.read_csv('2026年3月1日至今订单_含正确成交标记.csv') -print(f"订单总数:{len(order_df)}") - -# 2. 计算GMV和退款相关 -order_df['GMV'] = order_df['pay_amount_int'] / 100 -order_df['is_refund'] = (order_df['order_status'] == 4).astype(int) -# 计算GSV:退款订单GSV为0,其他为GMV -order_df['GSV'] = order_df.apply(lambda row: 0 if row['order_status'] == 4 else row['GMV'], axis=1) -order_df['refund_amount'] = order_df.apply(lambda row: row['GMV'] if row['order_status'] == 4 else 0, axis=1) - -# 3. 映射到大类渠道 -def map_channel(tag): - if tag in ['销转', '销转-小龙']: - return '销转' - elif tag in ['端内直购', '端内销转']: - return 'App转化' - elif tag == '达播': - return '达播' - elif tag.startswith('班主任-'): - return '班主任' - elif tag == '店铺直购': - return '店铺直购' - else: - return '其他' - -order_df['渠道大类'] = order_df['成交标记'].apply(map_channel) - -# 4. 按大类统计 -channel_stats = order_df.groupby('渠道大类').agg( - 订单数=('id', 'count'), - GMV=('GMV', 'sum'), - 已退款金额=('refund_amount', 'sum'), - GSV=('GSV', 'sum'), - 退款订单数=('is_refund', 'sum'), - 客单价=('GMV', 'mean') -).reset_index() -channel_stats['退费率'] = (channel_stats['退款订单数'] / channel_stats['订单数'] * 100).round(1).astype(str) + '%' -channel_stats['GMV'] = channel_stats['GMV'].round(2) -channel_stats['GSV'] = channel_stats['GSV'].round(2) -channel_stats['已退款金额'] = channel_stats['已退款金额'].round(2) -channel_stats['客单价'] = channel_stats['客单价'].round(2) - -# 5. 原预测表的预测值 -pred_data = [ - {'渠道大类': '销转', '预测GSV': 100000}, - {'渠道大类': 'App转化', '预测GSV': 20000}, - {'渠道大类': '达播', '预测GSV': 250000}, - {'渠道大类': '班主任', '预测GSV': 10000} +# 1. 整体统计数据 +overall_data = [ + {"渠道": "学而思", "新增注册总人数": 615, "购课总人数":7, "购课总金额(元)":7794}, + {"渠道": "科大讯飞", "新增注册总人数": 377, "购课总人数":4, "购课总金额(元)":3796}, + {"渠道": "希沃", "新增注册总人数": 122, "购课总人数":1, "购课总金额(元)":599}, + {"渠道": "京东方", "新增注册总人数": 61, "购课总人数":1, "购课总金额(元)":599}, + {"渠道": "合计", "新增注册总人数": 1175, "购课总人数":13, "购课总金额(元)":12788}, ] -pred_df = pd.DataFrame(pred_data) +df_overall = pd.DataFrame(overall_data) -# 6. 合并实际和预测数据 -report_df = pd.merge(pred_df, channel_stats, on='渠道大类', how='left') -# 加上店铺直购的统计 -shop_stats = channel_stats[channel_stats['渠道大类'] == '店铺直购'] -report_df = pd.concat([report_df, shop_stats], ignore_index=True) -# 加上总计 -total = pd.DataFrame({ - '渠道大类': ['总计'], - '预测GSV': [pred_df['预测GSV'].sum()], - '订单数': [channel_stats['订单数'].sum()], - 'GMV': [channel_stats['GMV'].sum()], - '已退款金额': [channel_stats['已退款金额'].sum()], - 'GSV': [channel_stats['GSV'].sum()], - '退款订单数': [channel_stats['退款订单数'].sum()], - '客单价': [channel_stats['GMV'].sum()/channel_stats['订单数'].sum()], - '退费率': [str((channel_stats['退款订单数'].sum()/channel_stats['订单数'].sum()*100).round(1)) + '%'] -}) -report_df = pd.concat([report_df, total], ignore_index=True) -report_df['完成率'] = report_df.apply(lambda row: str(round(row['GSV']/row['预测GSV']*100, 1)) + '%' if pd.notna(row['预测GSV']) else '-', axis=1) +# 2. 每日购课明细数据 +purchase_data = [ + {"日期": "2026-03-02", "渠道": "学而思", "购课人数":1, "购课金额(元)":599, "订单号": "zfb202603022031481772454708683943"}, + {"日期": "2026-03-07", "渠道": "学而思", "购课人数":1, "购课金额(元)":599, "订单号": "wx202603071022051772850125753228"}, + {"日期": "2026-03-07", "渠道": "科大讯飞", "购课人数":1, "购课金额(元)":599, "订单号": "wx202603072123501772889830225976"}, + {"日期": "2026-03-10", "渠道": "学而思", "购课人数":1, "购课金额(元)":1999, "订单号": "wx202603101820431773138043948181"}, + {"日期": "2026-03-15", "渠道": "科大讯飞", "购课人数":2, "购课金额(元)":2598, "订单号": "wx202603150854031773536043478685、wx20260315122747177354886748896"}, + {"日期": "2026-03-18", "渠道": "学而思", "购课人数":2, "购课金额(元)":2598, "订单号": "wx202603182055481773838548372991、zfb202603182118201773839900411837"}, + {"日期": "2026-03-23", "渠道": "科大讯飞", "购课人数":1, "购课金额(元)":599, "订单号": "wx202603232015081774268108032833"}, + {"日期": "2026-03-24", "渠道": "京东方", "购课人数":1, "购课金额(元)":599, "订单号": "zfb202603242026431774355203538499"}, + {"日期": "2026-03-27", "渠道": "学而思", "购课人数":1, "购课金额(元)":1999, "订单号": "wx202603271258341774587514141956"}, + {"日期": "2026-03-28", "渠道": "希沃", "购课人数":1, "购课金额(元)":599, "订单号": "wx20260328145038177468063894734"}, +] +df_purchase = pd.DataFrame(purchase_data) -# 7. 保存报表 -output_file = '2026年3月收入预测报表_最新版.xlsx' -with pd.ExcelWriter(output_file) as writer: - report_df.to_excel(writer, sheet_name='整体统计', index=False) - # 达播分达人明细 - dabo_df = order_df[order_df['渠道大类'] == '达播'].groupby('key_from').agg( - 订单数=('id', 'count'), - GMV=('GMV', 'sum'), - GSV=('GSV', 'sum'), - 退费率=('is_refund', lambda x: str((x.sum()/x.count()*100).round(1)) + '%') - ).reset_index() - dabo_df.to_excel(writer, sheet_name='达播达人明细', index=False) - # 成交标记明细 - tag_df = order_df.groupby('成交标记').agg( - 订单数=('id', 'count'), - GMV=('GMV', 'sum'), - GSV=('GSV', 'sum'), - 退费率=('is_refund', lambda x: str((x.sum()/x.count()*100).round(1)) + '%') - ).reset_index() - tag_df.to_excel(writer, sheet_name='成交标记明细', index=False) +# 3. 每日新增注册数据 +register_data = [ + {"日期": "2026-03-01", "渠道": "京东方", "新增注册人数": 1}, + {"日期": "2026-03-01", "渠道": "学而思", "新增注册人数": 48}, + {"日期": "2026-03-01", "渠道": "希沃", "新增注册人数": 2}, + {"日期": "2026-03-02", "渠道": "京东方", "新增注册人数": 3}, + {"日期": "2026-03-02", "渠道": "学而思", "新增注册人数": 38}, + {"日期": "2026-03-02", "渠道": "希沃", "新增注册人数": 1}, + {"日期": "2026-03-03", "渠道": "学而思", "新增注册人数": 24}, + {"日期": "2026-03-03", "渠道": "希沃", "新增注册人数": 4}, + {"日期": "2026-03-04", "渠道": "京东方", "新增注册人数": 4}, + {"日期": "2026-03-04", "渠道": "学而思", "新增注册人数": 20}, + {"日期": "2026-03-04", "渠道": "希沃", "新增注册人数": 10}, + {"日期": "2026-03-04", "渠道": "科大讯飞", "新增注册人数": 3}, + {"日期": "2026-03-05", "渠道": "京东方", "新增注册人数": 7}, + {"日期": "2026-03-05", "渠道": "学而思", "新增注册人数": 37}, + {"日期": "2026-03-05", "渠道": "希沃", "新增注册人数": 15}, + {"日期": "2026-03-05", "渠道": "科大讯飞", "新增注册人数": 17}, + {"日期": "2026-03-06", "渠道": "京东方", "新增注册人数": 6}, + {"日期": "2026-03-06", "渠道": "学而思", "新增注册人数": 26}, + {"日期": "2026-03-06", "渠道": "希沃", "新增注册人数": 9}, + {"日期": "2026-03-06", "渠道": "科大讯飞", "新增注册人数": 12}, + {"日期": "2026-03-07", "渠道": "京东方", "新增注册人数": 5}, + {"日期": "2026-03-07", "渠道": "学而思", "新增注册人数": 35}, + {"日期": "2026-03-07", "渠道": "希沃", "新增注册人数": 5}, + {"日期": "2026-03-07", "渠道": "科大讯飞", "新增注册人数": 34}, + {"日期": "2026-03-08", "渠道": "京东方", "新增注册人数": 3}, + {"日期": "2026-03-08", "渠道": "学而思", "新增注册人数": 33}, + {"日期": "2026-03-08", "渠道": "希沃", "新增注册人数": 12}, + {"日期": "2026-03-08", "渠道": "科大讯飞", "新增注册人数": 34}, + {"日期": "2026-03-09", "渠道": "京东方", "新增注册人数": 2}, + {"日期": "2026-03-09", "渠道": "学而思", "新增注册人数": 27}, + {"日期": "2026-03-09", "渠道": "希沃", "新增注册人数": 5}, + {"日期": "2026-03-09", "渠道": "科大讯飞", "新增注册人数": 15}, + {"日期": "2026-03-10", "渠道": "学而思", "新增注册人数": 15}, + {"日期": "2026-03-10", "渠道": "希沃", "新增注册人数": 3}, + {"日期": "2026-03-10", "渠道": "科大讯飞", "新增注册人数": 9}, + {"日期": "2026-03-11", "渠道": "京东方", "新增注册人数": 2}, + {"日期": "2026-03-11", "渠道": "学而思", "新增注册人数": 25}, + {"日期": "2026-03-11", "渠道": "希沃", "新增注册人数": 2}, + {"日期": "2026-03-11", "渠道": "科大讯飞", "新增注册人数": 13}, + {"日期": "2026-03-12", "渠道": "京东方", "新增注册人数": 5}, + {"日期": "2026-03-12", "渠道": "学而思", "新增注册人数": 24}, + {"日期": "2026-03-12", "渠道": "希沃", "新增注册人数": 5}, + {"日期": "2026-03-12", "渠道": "科大讯飞", "新增注册人数": 15}, + {"日期": "2026-03-13", "渠道": "京东方", "新增注册人数": 2}, + {"日期": "2026-03-13", "渠道": "学而思", "新增注册人数": 31}, + {"日期": "2026-03-13", "渠道": "希沃", "新增注册人数": 7}, + {"日期": "2026-03-13", "渠道": "科大讯飞", "新增注册人数": 8}, + {"日期": "2026-03-14", "渠道": "学而思", "新增注册人数": 30}, + {"日期": "2026-03-14", "渠道": "希沃", "新增注册人数": 3}, + {"日期": "2026-03-14", "渠道": "科大讯飞", "新增注册人数": 22}, + {"日期": "2026-03-15", "渠道": "京东方", "新增注册人数": 1}, + {"日期": "2026-03-15", "渠道": "学而思", "新增注册人数": 22}, + {"日期": "2026-03-15", "渠道": "希沃", "新增注册人数": 3}, + {"日期": "2026-03-15", "渠道": "科大讯飞", "新增注册人数": 22}, + {"日期": "2026-03-16", "渠道": "京东方", "新增注册人数": 2}, + {"日期": "2026-03-16", "渠道": "学而思", "新增注册人数": 6}, + {"日期": "2026-03-16", "渠道": "希沃", "新增注册人数": 2}, + {"日期": "2026-03-16", "渠道": "科大讯飞", "新增注册人数": 10}, + {"日期": "2026-03-17", "渠道": "京东方", "新增注册人数": 3}, + {"日期": "2026-03-17", "渠道": "学而思", "新增注册人数": 12}, + {"日期": "2026-03-17", "渠道": "希沃", "新增注册人数": 3}, + {"日期": "2026-03-17", "渠道": "科大讯飞", "新增注册人数": 6}, + {"日期": "2026-03-18", "渠道": "京东方", "新增注册人数": 1}, + {"日期": "2026-03-18", "渠道": "学而思", "新增注册人数": 9}, + {"日期": "2026-03-18", "渠道": "科大讯飞", "新增注册人数": 11}, + {"日期": "2026-03-19", "渠道": "京东方", "新增注册人数": 1}, + {"日期": "2026-03-19", "渠道": "学而思", "新增注册人数": 6}, + {"日期": "2026-03-19", "渠道": "希沃", "新增注册人数": 2}, + {"日期": "2026-03-19", "渠道": "科大讯飞", "新增注册人数": 9}, + {"日期": "2026-03-20", "渠道": "京东方", "新增注册人数": 1}, + {"日期": "2026-03-20", "渠道": "学而思", "新增注册人数": 13}, + {"日期": "2026-03-20", "渠道": "希沃", "新增注册人数": 2}, + {"日期": "2026-03-20", "渠道": "科大讯飞", "新增注册人数": 12}, + {"日期": "2026-03-21", "渠道": "京东方", "新增注册人数": 1}, + {"日期": "2026-03-21", "渠道": "学而思", "新增注册人数": 27}, + {"日期": "2026-03-21", "渠道": "科大讯飞", "新增注册人数": 26}, + {"日期": "2026-03-22", "渠道": "学而思", "新增注册人数": 12}, + {"日期": "2026-03-22", "渠道": "希沃", "新增注册人数": 4}, + {"日期": "2026-03-22", "渠道": "科大讯飞", "新增注册人数": 22}, + {"日期": "2026-03-23", "渠道": "京东方", "新增注册人数": 1}, + {"日期": "2026-03-23", "渠道": "学而思", "新增注册人数": 9}, + {"日期": "2026-03-23", "渠道": "希沃", "新增注册人数": 2}, + {"日期": "2026-03-23", "渠道": "科大讯飞", "新增注册人数": 5}, + {"日期": "2026-03-24", "渠道": "学而思", "新增注册人数": 4}, + {"日期": "2026-03-24", "渠道": "希沃", "新增注册人数": 2}, + {"日期": "2026-03-24", "渠道": "科大讯飞", "新增注册人数": 8}, + {"日期": "2026-03-25", "渠道": "京东方", "新增注册人数": 1}, + {"日期": "2026-03-25", "渠道": "学而思", "新增注册人数": 12}, + {"日期": "2026-03-25", "渠道": "希沃", "新增注册人数": 5}, + {"日期": "2026-03-25", "渠道": "科大讯飞", "新增注册人数": 13}, + {"日期": "2026-03-26", "渠道": "京东方", "新增注册人数": 1}, + {"日期": "2026-03-26", "渠道": "学而思", "新增注册人数": 8}, + {"日期": "2026-03-26", "渠道": "希沃", "新增注册人数": 2}, + {"日期": "2026-03-26", "渠道": "科大讯飞", "新增注册人数": 8}, + {"日期": "2026-03-27", "渠道": "学而思", "新增注册人数": 9}, + {"日期": "2026-03-27", "渠道": "希沃", "新增注册人数": 6}, + {"日期": "2026-03-27", "渠道": "科大讯飞", "新增注册人数": 6}, + {"日期": "2026-03-28", "渠道": "京东方", "新增注册人数": 2}, + {"日期": "2026-03-28", "渠道": "学而思", "新增注册人数": 20}, + {"日期": "2026-03-28", "渠道": "希沃", "新增注册人数": 4}, + {"日期": "2026-03-28", "渠道": "科大讯飞", "新增注册人数": 12}, + {"日期": "2026-03-29", "渠道": "京东方", "新增注册人数": 2}, + {"日期": "2026-03-29", "渠道": "学而思", "新增注册人数": 16}, + {"日期": "2026-03-29", "渠道": "科大讯飞", "新增注册人数": 9}, + {"日期": "2026-03-30", "渠道": "京东方", "新增注册人数": 2}, + {"日期": "2026-03-30", "渠道": "学而思", "新增注册人数": 7}, + {"日期": "2026-03-30", "渠道": "希沃", "新增注册人数": 2}, + {"日期": "2026-03-30", "渠道": "科大讯飞", "新增注册人数": 6}, + {"日期": "2026-03-31", "渠道": "京东方", "新增注册人数": 3}, + {"日期": "2026-03-31", "渠道": "学而思", "新增注册人数": 10}, + {"日期": "2026-03-31", "渠道": "科大讯飞", "新增注册人数": 10}, +] +df_register = pd.DataFrame(register_data) -print(f"\n最新3月收入预测报表已生成:{output_file}") -print("\n整体统计结果:") -print(report_df[['渠道大类', '预测GSV', 'GSV', '完成率', '订单数', 'GMV', '退费率']]) +# 生成Excel文件 +output_path = "/root/.openclaw/workspace/2026年3月硬件渠道数据汇总.xlsx" +with pd.ExcelWriter(output_path, engine='openpyxl') as writer: + df_overall.to_excel(writer, sheet_name='整体统计', index=False) + df_purchase.to_excel(writer, sheet_name='每日购课明细', index=False) + df_register.to_excel(writer, sheet_name='每日新增注册明细', index=False) + +print(f"文件已生成:{output_path}") diff --git a/send_file.sh b/scripts/send_file.sh similarity index 100% rename from send_file.sh rename to scripts/send_file.sh diff --git a/venv/bin/python b/venv/bin/python deleted file mode 120000 index b8a0adb..0000000 --- a/venv/bin/python +++ /dev/null @@ -1 +0,0 @@ -python3 \ No newline at end of file diff --git a/venv/bin/python3 b/venv/bin/python3 deleted file mode 120000 index ae65fda..0000000 --- a/venv/bin/python3 +++ /dev/null @@ -1 +0,0 @@ -/usr/bin/python3 \ No newline at end of file diff --git a/venv/bin/python3.12 b/venv/bin/python3.12 deleted file mode 120000 index b8a0adb..0000000 --- a/venv/bin/python3.12 +++ /dev/null @@ -1 +0,0 @@ -python3 \ No newline at end of file diff --git a/venv/lib64 b/venv/lib64 deleted file mode 120000 index 7951405..0000000 --- a/venv/lib64 +++ /dev/null @@ -1 +0,0 @@ -lib \ No newline at end of file diff --git a/venv/pyvenv.cfg b/venv/pyvenv.cfg deleted file mode 100644 index 0d52065..0000000 --- a/venv/pyvenv.cfg +++ /dev/null @@ -1,5 +0,0 @@ -home = /usr/bin -include-system-site-packages = false -version = 3.12.3 -executable = /usr/bin/python3.12 -command = /usr/bin/python3 -m venv /root/.openclaw/workspace/venv