ai_member_xiaobian/scripts/allocate_words.py
2026-06-12 08:10:01 +08:00

137 lines
5.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
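U28-U36 word selection and allocation: first settle the 20-word pool of each Unit.
Rules:
1. U29 has 11 locked words plus 9 supplementary words.
2. Roughly 2 function words per Unit: 18 function words in total, spread over 9 Units.
3. Words hinted at in the syllabus outline are allocated first.
4. Each word may appear in only one Unit.
5. A word with several parts of speech counts as separate entries, but those entries must not be placed in the same Lesson.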
"""
# Load the word bank: one entry per line, columns separated by tabs
# (word, part of speech, meaning, theme).
words = []
# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding, so the result does not depend on where the script is run.
# NOTE(review): assumes the TSV is stored as UTF-8 -- confirm against the
# tool that writes /tmp/wordbank_all.tsv.
with open('/tmp/wordbank_all.tsv', 'r', encoding='utf-8') as f:
    for line in f:
        # strip() removes surrounding whitespace, tabs included, so a row
        # whose first or last column is empty loses that column here.
        parts = line.strip().split('\t')
        # Rows with fewer than four columns are skipped; columns beyond the
        # fourth are ignored.
        if len(parts) >= 4:
            words.append({
                'word': parts[0],
                'pos': parts[1],
                'meaning': parts[2],
                'theme': parts[3],
                # Unique identifier built as "word(pos)".
                'id': f"{parts[0]}({parts[1]})",
            })
# Words locked to U29; entries for these words are kept out of the
# general available pool.
u29_locked_words = [
    'star', 'moon', 'forest', 'rainbow', 'hill',
    'waterfall', 'jungle', 'leaf/leaves', 'lake', 'snow', 'ice'
]

# Split the word bank in a single pass: entries whose word is locked go to
# the U29 pool, every other entry goes to the available pool.
available, u29_pool = [], []
for entry in words:
    target = u29_pool if entry['word'] in u29_locked_words else available
    target.append(entry)

print(f"可用池: {len(available)} 词条")
print(f"U29锁定: {len(u29_pool)} 词条")
# ========== Allocate 20 words per Unit ==========
# Strategy: first flag the words that are strongly tied to each Unit.
# Maps each Unit id to a description ('desc'), a list of theme labels
# ('strong_themes') and a list of keywords to look up in the word bank.
# NOTE(review): 'strong_themes' presumably corresponds to the word bank's
# 'theme' column -- confirm where it is consumed.
# NOTE(review): some keywords are listed under more than one Unit (for
# example 'river', 'mountain', 'hurry', 'lovely'); confirm that a later
# allocation step resolves these overlaps.
unit_themes = {
    'U28': dict(
        desc='乡村奥德赛-社区建筑',
        strong_themes=['地点与建筑', '交通与出行', '职业'],
        keywords=[
            'building', 'café', 'farm', 'field',
            'village', 'countryside', 'town', 'square',
            'elevator', 'lift', 'car park', 'pool',
            'map', 'trip', 'travel', 'ride',
            'station', 'bus station', 'ticket', 'tractor',
            'farmer', 'worker', 'driver', 'address',
            'centre', 'town centre',
        ],
    ),
    'U29': dict(
        desc='追逐星星-四季岛自然',
        strong_themes=['自然世界', '天气'],
        keywords=[
            'cloud', 'sky', 'river', 'wave',
            'weather', 'wind', 'rain', 'cloudy',
            'island',
        ],
    ),
    'U30': dict(
        desc='马戏巡游',
        strong_themes=['爱好与休闲', '动物'],
        keywords=[
            'circus', 'lion', 'cage', 'clown',
            'parrot', 'fire', 'mistake', 'band',
            'net', 'climb', 'frightened', 'naughty',
            'skip', 'quickly', 'exciting', 'brave',
            'careful', 'drop', 'hurry', 'loud',
        ],
    ),
    'U31': dict(
        desc='家与家人-离别',
        strong_themes=['家庭与朋友', '家居与房屋'],
        keywords=[
            'parent', 'aunt', 'uncle', 'grandparent',
            'grandson', 'granddaughter', 'roof', 'stairs',
            'floor', 'balcony', 'basement', 'wish',
            'dream', 'surprised', 'everyone', 'quiet',
            'think', 'different', 'only', 'share',
        ],
    ),
    'U32': dict(
        desc='小小园地-打理生态球',
        strong_themes=['自然世界', '天气', '动物'],
        keywords=[
            'plant', 'grow', 'ground', 'river',
            'cloud', 'sky', 'mountain', 'light',
            'weather', 'temperature', 'wind', 'rain',
            'kitten', 'puppy', 'feed', 'water',
            'build', 'fix', 'tidy', 'lovely',
        ],
    ),
    'U33': dict(
        desc='森林派对-孤独',
        strong_themes=['沟通与社交', '食物与饮品', '家居物品'],
        keywords=[
            'party', 'invite', 'send', 'message',
            'breakfast', 'lunch', 'dinner', 'pancake',
            'salad', 'cup', 'bowl', 'plate',
            'glass', 'laugh', 'boring', 'noise',
            'sometimes', 'nothing', 'picnic', 'walk',
        ],
    ),
    'U34': dict(
        desc='孤独空谷-情绪崩溃',
        strong_themes=['感受与情绪', '程度', '动作与行为'],
        keywords=[
            'afraid', 'cry', 'bad', 'terrible',
            'wrong', 'mountain', 'shout', 'opposite',
            'fall', 'lose', 'change', 'weak',
            'difficult', 'dark', 'asleep', 'badly',
            'worse', 'worst', 'never', 'move',
        ],
    ),
    'U35': dict(
        desc='永不离弃-解心结',
        strong_themes=['动作与行为', '沟通与社交', '状态与描述'],
        keywords=[
            'bring', 'carry', 'hurry', 'wait',
            'call', 'need', 'should', 'must',
            'idea', 'mean', 'safe', 'sure',
            'strong', 'together', 'someone', 'something',
            'then', 'find', 'turn', 'open',
        ],
    ),
    'U36': dict(
        desc='冬假快乐-冬季活动',
        strong_themes=['运动', '衣物与配饰', '程度'],
        keywords=[
            'ice skates', 'ice skating', 'skate', 'coat',
            'scarf', 'sweater', 'wonderful', 'lovely',
            'better', 'best', 'well', 'free',
            'happy', 'swim', 'roller skates', 'hop',
            'score', 'win', 'cold', 'blanket',
        ],
    ),
}
# Report, for every Unit, how many of its keywords have entries in the
# available pool and which keywords have none.
print("\n=== 各Unit关键词在词库中的匹配情况 ===\n")
for unit_id, spec in unit_themes.items():
    matched, absent = [], []
    for keyword in spec['keywords']:
        # A keyword can match several entries (same word, different part of
        # speech); each matching entry is counted.
        hits = [entry for entry in available if entry['word'] == keyword]
        if not hits:
            absent.append(keyword)
            continue
        matched += hits
    print(f"{unit_id} ({spec['desc']}): 匹配{len(matched)}词, 缺失{len(absent)}")
    if absent:
        # Only the first ten missing keywords are shown.
        print(f" 缺失: {absent[:10]}")
    # Blank line between Units.
    print()