ai_member_xiaoban/scripts/extract_course_outline.py
2026-06-04 08:00:01 +08:00

96 lines
3.2 KiB
Python

#!/usr/bin/env python3
"""Extract course outline (core words, sentence patterns) from vala_bak database."""
import json
import os
import re

import pymysql
# Connection settings are read from the environment so that no credential is
# committed to source control. Only the password is mandatory; the remaining
# values fall back to the instance, account and schema this script was
# written against.
# NOTE(review): the password that used to be hard-coded here has been exposed
# in version history and should be rotated.
_db_password = os.environ.get('VALA_DB_PASSWORD')
if not _db_password:
    raise SystemExit(
        'VALA_DB_PASSWORD is not set; export the database password '
        'before running this script.'
    )

conn = pymysql.connect(
    host=os.environ.get('VALA_DB_HOST', 'bj-cdb-8frbdwju.sql.tencentcdb.com'),
    port=int(os.environ.get('VALA_DB_PORT', '25413')),
    user=os.environ.get('VALA_DB_USER', 'read_only'),
    password=_db_password,
    database=os.environ.get('VALA_DB_NAME', 'vala_bak'),
    charset='utf8mb4',
)
# Game ids covered by the outline:
#   L2-S1: 12, 10, 16, 18-26
#   L2-S2: 27-38
#   L2-S3: 39-43
# NOTE(review): the query below sorts by gi.id, so the order of this list
# (12 before 10) has no effect on the output order — confirm that is intended.
all_game_ids = [
    12, 10, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26,
    27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
    39, 40, 41, 42, 43,
]
cursor = conn.cursor()

# Fetch the chapters with lesson_type = 1 for every listed game. Only '%s'
# markers are interpolated into the SQL text; the ids themselves are passed
# as bound parameters.
placeholders = ','.join(['%s'] * len(all_game_ids))
cursor.execute(f"""
    SELECT gi.id as game_id, gi.cn_name as unit_name, gi.game_code,
           gc.id as chapter_id, gc.cn_name as lesson_name, gc.index as lesson_idx
    FROM vala_game_info gi
    JOIN vala_game_chapter gc ON gc.game_id = gi.id
    WHERE gi.id IN ({placeholders}) AND gc.lesson_type = 1
    ORDER BY gi.id, gc.index
""", all_game_ids)
chapters = cursor.fetchall()

# chapter_id -> raw text_parse payload. Bound before the conditional so the
# name always exists, even when the chapter query matched nothing (an empty
# IN () list would be invalid SQL, hence the guard).
text_parses = {}
chapter_ids = [c[3] for c in chapters]
if chapter_ids:
    placeholders2 = ','.join(['%s'] * len(chapter_ids))
    cursor.execute(f"""
        SELECT chapter_id, text_parse FROM unit_chapter_text_parse WHERE chapter_id IN ({placeholders2})
    """, chapter_ids)
    text_parses = {row[0]: row[1] for row in cursor.fetchall()}
def extract_from_text_parse(tp_str):
    """Extract core words and sentence patterns from a text_parse JSON string.

    The payload is expected to be a JSON array of objects. Each object may
    carry a ``textParse`` list whose entries hold ``slices`` and
    ``keySlices`` lists:

    * a slice with ``type == 1`` and a ``meaning`` key is a core word;
    * a slice with ``type == 2`` and a ``meaning`` key is a sentence pattern;
    * every ``keySlices`` entry with a non-empty ``slice`` is a core word.

    Args:
        tp_str: The raw JSON text, or an empty/None value.

    Returns:
        A ``(words, sentences)`` tuple of lists, each de-duplicated and in
        first-seen order. Both lists are empty when the input is empty, is
        not valid JSON, or does not have the expected shape.
    """
    if not tp_str:
        return [], []
    try:
        data = json.loads(tp_str)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers input that
        # is not str/bytes. Anything else (e.g. KeyboardInterrupt) propagates.
        return [], []
    if not isinstance(data, list):
        return [], []

    # Dicts are used purely as insertion-ordered sets; only keys are returned.
    words = {}
    sentences = {}
    for item in data:
        if not isinstance(item, dict):
            continue
        # `or []` tolerates keys that are present but hold JSON null.
        for tp in item.get('textParse') or []:
            if not isinstance(tp, dict):
                continue
            for sl in tp.get('slices') or []:
                if not isinstance(sl, dict) or 'meaning' not in sl:
                    continue
                text = sl.get('slice', '')
                if not text:
                    continue
                slice_type = sl.get('type')
                if slice_type == 1:
                    words.setdefault(text, None)
                elif slice_type == 2:
                    sentences.setdefault(text, None)
            # Key slices count as words regardless of type or meaning.
            for ks in tp.get('keySlices') or []:
                if not isinstance(ks, dict):
                    continue
                key_text = ks.get('slice', '')
                if key_text:
                    words.setdefault(key_text, None)
    return list(words), list(sentences)
# Emit one pipe-delimited line per chapter. The cursor and connection are
# released in `finally` so they are closed even if processing or printing a
# row raises.
try:
    for game_id, unit_name, game_code, chapter_id, lesson_name, lesson_idx in chapters:
        tp = text_parses.get(chapter_id, '')
        words, sentences = extract_from_text_parse(tp)
        # Keep only the first 8 words and 4 sentence patterns; joining an
        # empty list already yields '', so no separate empty check is needed.
        words_str = ', '.join(words[:8])
        sentences_str = ', '.join(sentences[:4])
        print(f"GAME={game_id}|UNIT={unit_name}|CODE={game_code}|CH={chapter_id}|L{lesson_idx}|NAME={lesson_name}|WORDS={words_str}|SENT={sentences_str}")
finally:
    try:
        cursor.close()
    finally:
        conn.close()