# ai_member_xiaoban/scripts/extract_course_outline.py

#!/usr/bin/env python3
"""Extract course outline (core words, sentence patterns) from vala_bak database."""
import json
import os
import re

import pymysql

# Open the connection to the `vala_bak` database as the `read_only` user.
# Every setting can be overridden through an environment variable so that
# credentials do not have to live in source control; the literals are kept
# only as backward-compatible defaults.
# SECURITY NOTE(review): a password committed to a repository should be
# rotated, and the literal fallback removed once VALA_DB_PASSWORD is set
# wherever this script runs.
conn = pymysql.connect(
    host=os.environ.get('VALA_DB_HOST', 'bj-cdb-8frbdwju.sql.tencentcdb.com'),
    port=int(os.environ.get('VALA_DB_PORT', '25413')),
    user=os.environ.get('VALA_DB_USER', 'read_only'),
    password=os.environ.get('VALA_DB_PASSWORD', 'fdsfiidier^$*hjfdijjd232'),
    database=os.environ.get('VALA_DB_NAME', 'vala_bak'),
    charset='utf8mb4',
)

# Game IDs covered by the outline, grouped by course stage.
# L2-S1 chapters: game_ids 12,10,16,18,19,20,21,22,23,24,25,26
# L2-S2 chapters: game_ids 27-38
# L2-S3 chapters: game_ids 39-43
l2_s1_game_ids = [12, 10, 16, *range(18, 27)]
l2_s2_game_ids = list(range(27, 39))
l2_s3_game_ids = list(range(39, 44))

# Order here only affects the bound parameters; the query sorts its own output.
all_game_ids = l2_s1_game_ids + l2_s2_game_ids + l2_s3_game_ids

cursor = conn.cursor()

# Get chapters: one row per chapter with lesson_type = 1, joined to its game.
# The IN list is built from '%s' placeholders so the IDs are bound by the
# driver rather than interpolated into the SQL text.
placeholders = ','.join(['%s'] * len(all_game_ids))
cursor.execute(f"""
SELECT gi.id as game_id, gi.cn_name as unit_name, gi.game_code,
       gc.id as chapter_id, gc.cn_name as lesson_name, gc.index as lesson_idx
FROM vala_game_info gi
JOIN vala_game_chapter gc ON gc.game_id = gi.id
WHERE gi.id IN ({placeholders}) AND gc.lesson_type = 1
ORDER BY gi.id, gc.index
""", all_game_ids)

chapters = cursor.fetchall()

# Get text_parse for all chapters.
# Start from an empty mapping so `text_parses` is always defined, even when
# the first query matched nothing and the second query is skipped (an empty
# IN () list would be a SQL syntax error).
text_parses = {}
chapter_ids = [c[3] for c in chapters]  # column 3 is chapter_id
if chapter_ids:
    placeholders2 = ','.join(['%s'] * len(chapter_ids))
    cursor.execute(f"""
    SELECT chapter_id, text_parse FROM unit_chapter_text_parse WHERE chapter_id IN ({placeholders2})
    """, chapter_ids)
    # chapter_id -> raw text_parse value; a later row for the same chapter
    # overwrites an earlier one.
    text_parses = {row[0]: row[1] for row in cursor.fetchall()}

def _as_list(value):
    """Return *value* if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def extract_from_text_parse(tp_str):
    """Extract core words and sentence patterns from text_parse JSON.

    The JSON is expected to be a list of objects, each with an optional
    ``textParse`` list. Every ``textParse`` entry may carry:

    * ``slices``: entries with a ``meaning`` key are collected as words when
      ``type == 1`` and as sentence patterns when ``type == 2``;
    * ``keySlices``: every entry with a non-empty ``slice`` is collected as
      a word.

    Args:
        tp_str: The raw JSON text (or ``None`` / empty).

    Returns:
        A ``(words, sentences)`` tuple of lists, each de-duplicated and in
        first-seen order. Both lists are empty when the input is empty,
        is not valid JSON, or does not have the expected list-of-objects
        shape.
    """
    if not tp_str:
        return [], []

    try:
        data = json.loads(tp_str)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError and undecodable bytes;
        # TypeError covers inputs that are not str/bytes/bytearray.
        return [], []

    # Dicts are used as insertion-ordered sets; the stored meanings are not
    # returned, only the keys.
    words = {}  # word -> meaning
    sentences = {}  # pattern -> meaning

    for item in _as_list(data):
        if not isinstance(item, dict):
            continue
        for tp in _as_list(item.get('textParse')):
            if not isinstance(tp, dict):
                continue

            for sl in _as_list(tp.get('slices')):
                if not isinstance(sl, dict) or 'meaning' not in sl:
                    continue
                text = sl.get('slice', '')
                if not text:
                    continue
                kind = sl.get('type')
                if kind == 1:
                    # setdefault keeps the first meaning seen for a word.
                    words.setdefault(text, sl.get('meaning', ''))
                elif kind == 2:
                    sentences.setdefault(text, sl.get('meaning', ''))

            # Key slices count as words regardless of any type field.
            for ks in _as_list(tp.get('keySlices')):
                if not isinstance(ks, dict):
                    continue
                key_text = ks.get('slice', '')
                if key_text:
                    words.setdefault(key_text, ks.get('meaning', ''))

    return list(words), list(sentences)

# Emit one pipe-delimited summary line per chapter row fetched above.
for game_id, unit_name, game_code, chapter_id, lesson_name, lesson_idx in chapters:
    # Chapters without a text_parse row fall back to '', which yields no
    # words and no sentence patterns.
    raw_parse = text_parses.get(chapter_id, '')
    words, sentences = extract_from_text_parse(raw_parse)

    # Cap the output at the first 8 words and first 4 patterns; joining an
    # empty list already produces '', so no separate empty check is needed.
    words_str = ', '.join(words[:8])
    sentences_str = ', '.join(sentences[:4])

    print(f"GAME={game_id}|UNIT={unit_name}|CODE={game_code}|CH={chapter_id}|L{lesson_idx}|NAME={lesson_name}|WORDS={words_str}|SENT={sentences_str}")

# Release the cursor first, then the connection it belongs to.
cursor.close()
conn.close()