#!/usr/bin/env python3 """Extract course outline (core words, sentence patterns) from vala_bak database.""" import json import pymysql import re conn = pymysql.connect( host='bj-cdb-8frbdwju.sql.tencentcdb.com', port=25413, user='read_only', password='fdsfiidier^$*hjfdijjd232', database='vala_bak', charset='utf8mb4' ) # L2-S1 chapters: game_ids 12,10,16,18,19,20,21,22,23,24,25,26 # L2-S2 chapters: game_ids 27-38 # L2-S3 chapters: game_ids 39-43 all_game_ids = [12,10,16,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43] cursor = conn.cursor() # Get chapters placeholders = ','.join(['%s'] * len(all_game_ids)) cursor.execute(f""" SELECT gi.id as game_id, gi.cn_name as unit_name, gi.game_code, gc.id as chapter_id, gc.cn_name as lesson_name, gc.index as lesson_idx FROM vala_game_info gi JOIN vala_game_chapter gc ON gc.game_id = gi.id WHERE gi.id IN ({placeholders}) AND gc.lesson_type = 1 ORDER BY gi.id, gc.index """, all_game_ids) chapters = cursor.fetchall() # Get text_parse for all chapters chapter_ids = [c[3] for c in chapters] if chapter_ids: placeholders2 = ','.join(['%s'] * len(chapter_ids)) cursor.execute(f""" SELECT chapter_id, text_parse FROM unit_chapter_text_parse WHERE chapter_id IN ({placeholders2}) """, chapter_ids) text_parses = {row[0]: row[1] for row in cursor.fetchall()} def extract_from_text_parse(tp_str): """Extract core words and sentence patterns from text_parse JSON.""" if not tp_str: return [], [] words = {} # word -> meaning sentences = {} # pattern -> title try: data = json.loads(tp_str) except: return [], [] for item in data: # Extract from textParse for tp in item.get('textParse', []): for sl in tp.get('slices', []): if sl.get('type') == 1 and 'meaning' in sl: word = sl.get('slice', '') meaning = sl.get('meaning', '') if word and word not in words: words[word] = meaning elif sl.get('type') == 2 and 'meaning' in sl: # sentence pattern pattern = sl.get('slice', '') if pattern and pattern not in sentences: sentences[pattern] = sl.get('meaning', '') # Also check keySlices for ks in tp.get('keySlices', []): sl = ks.get('slice', '') meaning = ks.get('meaning', '') if sl and sl not in words: words[sl] = meaning return list(words.keys()), list(sentences.keys()) # Process each chapter for game_id, unit_name, game_code, chapter_id, lesson_name, lesson_idx in chapters: tp = text_parses.get(chapter_id, '') words, sentences = extract_from_text_parse(tp) # Deduplicate and limit words_str = ', '.join(words[:8]) if words else '' sentences_str = ', '.join(sentences[:4]) if sentences else '' print(f"GAME={game_id}|UNIT={unit_name}|CODE={game_code}|CH={chapter_id}|L{lesson_idx}|NAME={lesson_name}|WORDS={words_str}|SENT={sentences_str}") cursor.close() conn.close()