ai_member_xiaoyan/output/S3_U30_L3_审校分析.py

#!/usr/bin/env python3
"""L2剧本审校 - S3-U30-L3 霹雳飞船"""
import re, json, sys

# Read the exported document markdown.
# The document contains Chinese text, so decode it explicitly instead of
# relying on the platform's locale default.
# NOTE(review): assumes the export is UTF-8 encoded — confirm.
with open("/tmp/l2_script_review.md", "r", encoding="utf-8") as f:
    content = f.read()

# ========== Parse the script table ==========
# Work on the text that follows the "### 剧本" heading; when the heading is
# missing, fall back to scanning the whole document.
if "### 剧本" in content:
    script_section = content.split("### 剧本")[1]
else:
    script_section = content

# Markup removed from every cell, applied in this order: HTML tags,
# markdown bold markers, then the {align="..."} / {color="..."} attributes.
cell_markup = (
    r'<[^>]+>',
    r'\*\*',
    r'\{align="[^"]*"\}',
    r'\{color="[^"]*"\}',
)

# Build one list of cleaned cell strings per <lark-tr> row.
rows = []
table_match = re.findall(r'<lark-tr>(.*?)</lark-tr>', script_section, re.DOTALL)
for tr_body in table_match:
    cleaned = []
    for cell in re.findall(r'<lark-td[^>]*>(.*?)</lark-td>', tr_body, re.DOTALL):
        for pattern in cell_markup:
            cell = re.sub(pattern, '', cell)
        cleaned.append(cell.strip())
    # Rows with fewer than six cells are dropped.
    if len(cleaned) >= 6:
        rows.append(cleaned)

# The first surviving row is treated as the table header.
header = rows[0] if rows else []
print(f"表头: {header}")
print(f"总行数(含表头): {len(rows)}")

# Everything after the header row is script data.
data_rows = rows[1:]

# Column mapping: 0=type, 1=plot content, 2=Chinese dialogue,
# 3=knowledge point, 4=translation, 5=config info
print("\n" + "="*80)
print("一、台词字数统计")
print("="*80)

total_cn_chars = 0
total_en_words = 0
sentence_stats = []

for i, row in enumerate(data_rows):
    row_type = row[0].strip() if len(row) > 0 else ""
    cn_text = row[2].strip() if len(row) > 2 else ""
    en_text = row[4].strip() if len(row) > 4 else ""

    # Chinese length = number of non-whitespace characters;
    # English length = number of whitespace-separated tokens.
    cn_count = len(re.sub(r'\s', '', cn_text))
    en_words = len(en_text.split()) if en_text else 0

    total_cn_chars += cn_count
    total_en_words += en_words

    # Only rows that carry an English translation take part in the
    # per-sentence check; 'row' is the 1-based table row (header is row 1).
    if en_words > 0:
        sentence_stats.append({
            'row': i+2, 'type': row_type, 'cn_chars': cn_count, 'en_words': en_words,
            'en_text_preview': en_text[:80]
        })

print(f"中文对白总字数: {total_cn_chars}")
print(f"英文翻译总词数: {total_en_words}")
print(f"规范: 中文1500-2000字, 英文1000-1300词")
print(f"中文判定: {'超标' if total_cn_chars > 2300 else ('偏多' if total_cn_chars > 2000 else '合规')}")
# The original compared against 1300 twice, which made the "偏多" tier
# unreachable. The hard limit now mirrors the ~15% tolerance used for Chinese
# (2000 -> 2300), i.e. 1300 -> 1500.
# NOTE(review): confirm the intended English hard limit.
print(f"英文判定: {'超标' if total_en_words > 1500 else ('偏多' if total_en_words > 1300 else '合规')}")

# Per-sentence English word-count check
print("\n单句词数检查:")

# Ordered (keywords, limit) rules: the first rule with a keyword contained in
# the row type decides the limit; types matching no rule use the default.
limit_rules = (
    (('TL',), 18),
    (('朗读', '挖空', '组句', '选读', '表达', '选择'), 15),
    (('阅读',), 22),
    (('听力', '听'), 15),
)
default_limit = 18

for s in sentence_stats:
    w = s['en_words']
    t = s['type']
    limit = default_limit
    for keywords, cap in limit_rules:
        if any(k in t for k in keywords):
            limit = cap
            break

    # Report only the sentences that exceed their limit.
    if w > limit:
        print(f"  ⚠️ 行{s['row']} [{t}] {w}词 > {limit}词上限: {s['en_text_preview']}...")

print("\n" + "="*80)
print("二、互动量统计")
print("="*80)

# A row type containing any of these keywords marks a core interaction.
core_keywords = ('阅读', '任务对话', '口语妙问', '口语独白', '合作听力', '邮件', '写作')

interaction_types = []
core_interactions = []

for i, row in enumerate(data_rows):
    row_type = row[0].strip() if len(row) > 0 else ""
    # Rows with an empty type or a type containing 'TL' are not interactions.
    if not row_type or 'TL' in row_type:
        continue
    interaction_types.append(row_type)
    if any(k in row_type for k in core_keywords):
        core_interactions.append({'row': i+2, 'type': row_type})

# Every interaction row contributes exactly one entry to interaction_types.
interaction_count = len(interaction_types)

print(f"总互动量: {interaction_count} (规范: 22-26)")
print(f"核心互动数: {len(core_interactions)} (规范: 2个/lesson)")

# Distribution of interaction types, most frequent first
from collections import Counter
type_dist = Counter(interaction_types)
print("\n互动类型分布:")
for t, c in type_dist.most_common():
    print(f"  {t}: {c}次")

print("\n核心互动明细:")
for ci in core_interactions:
    print(f"  行{ci['row']}: {ci['type']}")

print("\n" + "="*80)
print("三、知识点覆盖统计")
print("="*80)

# Occurrence count per knowledge point, plus the table rows where each appears
kp_counter = Counter()
kp_rows = {}

for i, row in enumerate(data_rows):
    kp_text = row[3].strip() if len(row) > 3 else ""
    if not kp_text:
        continue
    # A cell may list several knowledge points separated by newlines,
    # ASCII/full-width commas or the enumeration comma.
    for piece in re.split(r'[\n,，、]+', kp_text):
        kp = piece.strip()
        if not kp:
            continue
        kp_counter[kp] += 1
        kp_rows.setdefault(kp, []).append(i+2)

print("知识点出现次数:")
for kp, count in kp_counter.most_common():
    # 2-3 occurrences is the accepted range; more is over, fewer is under.
    if 2 <= count <= 3:
        status = "✅"
    elif count > 3:
        status = "⚠️超标"
    else:
        status = "⚠️不足"
    print(f"  {kp}: {count}次 {status} (行: {kp_rows[kp]})")

print("\n" + "="*80)
print("四、核心互动类型分布")
print("="*80)

# Keyword -> skill bucket (读 read / 说 speak / 听 listen / 写 write).
# Lookup walks the dict in insertion order and stops at the first keyword
# contained in the interaction type.
type_map = {
    '阅读理解': '读', '阅读': '读',
    '任务对话': '说', '口语妙问': '说', '口语独白': '说', '口语': '说',
    '合作听力': '听', '听力': '听',
    '邮件撰写': '写', '写作回复': '写', '写作': '写',
}

core_type_count = Counter()
for ci in core_interactions:
    mapped = next((skill for key, skill in type_map.items() if key in ci['type']), None)
    # Types matching no keyword are tallied under '其他'.
    core_type_count[mapped if mapped else '其他'] += 1

print(f"核心互动类型分布: 读{core_type_count.get('读',0)} : 说{core_type_count.get('说',0)} : 听{core_type_count.get('听',0)} : 写{core_type_count.get('写',0)}")
print(f"规范比例: 读3 : 说3 : 听2 : 写2")

print("\n" + "="*80)
print("五、拼写/语法检查")
print("="*80)

spelling_errors = []
grammar_errors = []

# Common misspellings -> correct spelling
common_misspellings = {
    'recieve': 'receive', 'seperate': 'separate', 'occured': 'occurred',
    'accomodate': 'accommodate', 'acheive': 'achieve', 'beleive': 'believe',
    'calender': 'calendar', 'definately': 'definitely', 'embarass': 'embarrass',
    'goverment': 'government', 'neccessary': 'necessary', 'occassion': 'occasion',
    'paralel': 'parallel', 'priviledge': 'privilege', 'recomend': 'recommend',
    'tommorow': 'tomorrow', 'untill': 'until', 'wether': 'whether',
}

# Words after which "its" is accepted as a correct possessive.
possessive_followers = {'own', 'name', 'color', 'size', 'shape', 'way'}

for i, row in enumerate(data_rows):
    en_text = row[4].strip() if len(row) > 4 else ""
    if not en_text:
        continue

    # Spelling check: case-insensitive substring match, so inflected forms
    # such as "recieved" are caught as well.
    words_lower = en_text.lower()
    for wrong, correct in common_misspellings.items():
        if wrong in words_lower:
            spelling_errors.append({
                'row': i+2, 'type': '拼写错误', 'original': wrong, 'suggestion': correct,
                'context': en_text[:100]
            })

    # Repeated-word check: two adjacent whitespace-separated tokens that are
    # equal ignoring case.
    words = en_text.split()
    for first, second in zip(words, words[1:]):
        if first.lower() == second.lower():
            grammar_errors.append({
                'row': i+2, 'type': '重复词', 'original': f"{first} {second}",
                'suggestion': first, 'context': en_text[:100]
            })

    # its/it's check (heuristic): a lowercase "its" directly followed by a
    # word is flagged unless that word is a known possessive follower.
    for m in re.finditer(r'\bits\b', en_text):
        pos = m.start()
        after = en_text[pos+3:].strip()
        if not after or not after[0].isalpha():
            continue
        # Normalise the following word so trailing punctuation or
        # capitalisation ("its own.", "its Name") does not defeat the whitelist.
        next_word = re.split(r'\W', after, maxsplit=1)[0].lower()
        if next_word in possessive_followers:
            # Correct possessive use. The original still recorded an issue
            # here whose suggestion was identical to the original text.
            continue
        grammar_errors.append({
            'row': i+2, 'type': "its/it's混淆", 'original': 'its',
            'suggestion': "it's",
            'context': en_text[max(0,pos-20):pos+30]
        })

print(f"拼写错误: {len(spelling_errors)}处")
for e in spelling_errors:
    print(f"  行{e['row']}: {e['original']} → {e['suggestion']} | {e['context'][:60]}")

print(f"\n语法问题: {len(grammar_errors)}处")
for e in grammar_errors:
    print(f"  行{e['row']} [{e['type']}]: {e['original']} → {e['suggestion']} | {e['context'][:60]}")

print("\n" + "="*80)
print("六、词汇难度检查")
print("="*80)

# Load the L2 word list. JSON text is decoded explicitly as UTF-8 rather than
# with the platform's locale default.
with open("/root/.openclaw/workspace-xiaoyan/business_knowledge/L2_word_list.json", "r", encoding="utf-8") as f:
    l2_words = json.load(f)

# Build the set of known words. Accepted layouts: a list of entries, or a
# dict holding that list under 'words' (a dict without 'words' is iterated
# over its keys). Each entry is either a dict with a 'word' field or a plain
# string; plain strings were silently ignored before, which left the set empty
# for such files.
known_words = set()
word_list = l2_words.get('words', l2_words) if isinstance(l2_words, dict) else l2_words
for w in word_list:
    if isinstance(w, dict):
        word = w.get('word', '')
    elif isinstance(w, str):
        word = w
    else:
        continue
    word = word.lower().strip()
    if word:
        known_words.add(word)

# Words treated as B1+ (beyond the syllabus) unless the L2 list contains them
b1_words = {
    'unforgivable', 'arrogance', 'corrupts', 'electromagnetic', 'bargain', 'negotiate',
    'convince', 'persuade', 'reluctant', 'desperate', 'frustrated', 'annoyed',
    'embarrassed', 'disappointed', 'impressed', 'fascinated', 'terrified',
    'absolutely', 'definitely', 'probably', 'unfortunately', 'fortunately',
    'accidentally', 'apparently', 'obviously', 'gradually', 'eventually',
    'opportunity', 'experience', 'environment', 'government', 'information',
    'population', 'situation', 'tradition', 'education', 'competition',
    'explosion', 'explode', 'crash', 'collapse', 'destroy', 'damage',
    'repair', 'replace', 'install', 'remove', 'connect', 'disconnect',
    'engine', 'vehicle', 'transport', 'passenger', 'customer', 'client',
    'service', 'product', 'quality', 'quantity', 'price', 'value',
    'similar', 'different', 'important', 'necessary', 'possible', 'impossible',
    'comfortable', 'uncomfortable', 'dangerous', 'safe', 'popular', 'common',
    'special', 'normal', 'strange', 'weird', 'terrible', 'wonderful',
    'excellent', 'perfect', 'awful', 'horrible', 'brilliant', 'fantastic',
    'massive', 'huge', 'tiny', 'enormous', 'giant',
}

# Simplified CEFR check: flag every occurrence of a B1+ word that is not in
# the L2 word list.
vocab_issues = []
for i, row in enumerate(data_rows):
    en_text = row[4].strip() if len(row) > 4 else ""
    if not en_text:
        continue

    words = re.findall(r'\b[a-zA-Z]+\b', en_text.lower())
    for w in words:
        if len(w) <= 2:
            continue
        if w not in known_words and w in b1_words:
            vocab_issues.append({
                'row': i+2, 'word': w, 'context': en_text[:80]
            })

print(f"超纲词(B1+): {len(vocab_issues)}处")
for v in vocab_issues:
    print(f"  行{v['row']}: {v['word']} | {v['context'][:60]}")

print("\n" + "="*80)
print("七、标点符号与价值观检查")
print("="*80)

punct_issues = []
value_issues = []

# Insulting words and violent phrases searched for in the Chinese dialogue
negative_words = ['白痴', '人渣', '废物', '笨蛋', '蠢货', '傻瓜', '神经病', '去死', '滚开']
violence_words = ['吃人不吐骨', '死得很惨', '只能活一个', '你会后悔的', '让你好看', '杀了你', '打死你']

for i, row in enumerate(data_rows):
    cn_text = row[2].strip() if len(row) > 2 else ""
    en_text = row[4].strip() if len(row) > 4 else ""

    # (label, triggered, offending text) in reporting order:
    # wave dash in Chinese, double exclamation / markdown bold / full-width
    # punctuation in English, half-width punctuation in Chinese.
    # NOTE(review): cell cleaning already removes '**' from every cell, so the
    # markdown check cannot fire on this data.
    punct_checks = (
        ('波浪号', '～' in cn_text, cn_text),
        ('双叹号', '!!' in en_text, en_text),
        ('Markdown标记', '**' in en_text, en_text),
        ('英文全角标点', re.search(r'[！？，。]', en_text) is not None, en_text),
        ('中文半角标点', re.search(r'[!?,]', cn_text) is not None, cn_text),
    )
    for label, triggered, text in punct_checks:
        if triggered:
            punct_issues.append({'row': i+2, 'type': label, 'text': text[:60]})

    # Values check: one issue per listed word/phrase found in the dialogue.
    for label, vocabulary in (('侮辱性词汇', negative_words), ('暴力隐喻', violence_words)):
        for term in vocabulary:
            if term in cn_text:
                value_issues.append({'row': i+2, 'type': label, 'word': term, 'text': cn_text[:60]})

print(f"标点问题: {len(punct_issues)}处")
for p in punct_issues:
    print(f"  行{p['row']} [{p['type']}]: {p['text'][:60]}")

print(f"\n价值观问题: {len(value_issues)}处")
for v in value_issues:
    print(f"  行{v['row']} [{v['type']}]: {v['word']} | {v['text'][:60]}")

print("\n" + "="*80)
print("八、结构性问题检查")
print("="*80)

struct_issues = []

# Leftover strikethrough markers in plot content (1), Chinese dialogue (2)
# and translation (4)
for i, row in enumerate(data_rows):
    for col_idx in (1, 2, 4):
        if col_idx >= len(row):
            continue
        text = row[col_idx]
        if '~~' in text:
            struct_issues.append({'row': i+2, 'type': '删除线残留', 'text': text[:60]})

# Interaction rows (non-empty type without 'TL') whose config column is empty
empty_config = 0
for i, row in enumerate(data_rows):
    row_type = row[0].strip() if len(row) > 0 else ""
    if not row_type or 'TL' in row_type:
        continue
    config = row[5].strip() if len(row) > 5 else ""
    if config:
        continue
    empty_config += 1
    struct_issues.append({'row': i+2, 'type': '组件配置为空', 'text': row_type})

print(f"结构性问题: {len(struct_issues)}处")
for s in struct_issues:
    print(f"  行{s['row']} [{s['type']}]: {s['text'][:60]}")

print(f"\n组件配置为空的互动行: {empty_config}个")

# Section banner: blank line, rule, title, rule.
print("\n" + "="*80, "九、总结评分", "="*80, sep="\n")

# Grading helpers for the summary table
def grade_cn(cn):
    """Grade the total Chinese character count.

    Returns the "over limit" label above 2300, the "on the high side" label
    above 2000, and the "compliant" label otherwise.
    """
    # Thresholds are checked from the highest down; the first one exceeded wins.
    for threshold, label in ((2300, '🔴 超标'), (2000, '🟡 偏多')):
        if cn > threshold:
            return label
    return '✅ 合规'

def grade_en(en):
    """Grade the total English word count.

    Returns the "over limit" label above 1500, the "on the high side" label
    above 1300 (the upper bound of the 1000-1300 word guideline), and the
    "compliant" label otherwise.

    The original compared against 1300 in both branches, so the yellow tier
    could never be returned. The hard limit now mirrors the ~15% tolerance of
    the Chinese thresholds (2000 -> 2300).
    NOTE(review): confirm 1500 is the intended English hard limit.
    """
    if en > 1500:
        return '🔴 超标'
    elif en > 1300:
        return '🟡 偏多'
    else:
        return '✅ 合规'

def grade_interaction(n):
    """Grade the interaction count: below 22 is too few, above 26 is too many."""
    return '🔴 不足' if n < 22 else ('🔴 超标' if n > 26 else '✅ 合规')

def grade_kp(kp_counter):
    """Grade knowledge-point coverage from a mapping of point -> occurrence count.

    A point is out of range when it occurs more than 3 times or fewer than 2.
    Three or more such points grade red, at least one grades yellow, none
    grades compliant.
    """
    out_of_range = [count for count in kp_counter.values() if not 2 <= count <= 3]
    issues = len(out_of_range)
    if issues >= 3:
        return '🔴 问题较多'
    if issues > 0:
        return '🟡 部分超标'
    return '✅ 合规'

def grade_spell(n):
    """Grade the number of spelling/grammar findings: >5 red, >0 yellow."""
    if n > 5:
        verdict = '🔴 硬伤较多'
    elif n > 0:
        verdict = '🟡 少量问题'
    else:
        verdict = '✅ 合规'
    return verdict

def grade_vocab(n):
    """Grade the number of beyond-syllabus word hits: >10 red, >3 yellow."""
    for threshold, label in ((10, '🔴 超纲较多'), (3, '🟡 少量超纲')):
        if n > threshold:
            return label
    return '✅ 合规'

def grade_punct(n):
    """Grade the number of punctuation/values findings: >10 red, >3 yellow."""
    return '🔴 问题较多' if n > 10 else ('🟡 少量问题' if n > 3 else '✅ 合规')

def grade_struct(n):
    """Grade the number of structural findings: >5 red, >0 yellow."""
    if n > 5:
        return '🔴 问题较多'
    if n > 0:
        return '🟡 少量问题'
    return '✅ 合规'

# Core-interaction grade: exactly two is compliant, more is red, fewer is yellow.
core_total = len(core_interactions)
if core_total == 2:
    core_grade = '✅ 合规'
elif core_total > 2:
    core_grade = '🔴 超标'
else:
    core_grade = '🟡 不足'

# One grade per review dimension, in display order.
grades = {
    '台词字数(中)': grade_cn(total_cn_chars),
    '台词字数(英)': grade_en(total_en_words),
    '互动量': grade_interaction(interaction_count),
    '知识点覆盖': grade_kp(kp_counter),
    '核心互动': core_grade,
    '拼写/语法': grade_spell(len(spelling_errors) + len(grammar_errors)),
    '词汇难度': grade_vocab(len(vocab_issues)),
    '标点/价值观': grade_punct(len(punct_issues) + len(value_issues)),
    '结构性问题': grade_struct(len(struct_issues)),
}

# Count dimensions by the colour marker embedded in their grade label.
red_count = sum('🔴' in label for label in grades.values())
yellow_count = sum('🟡' in label for label in grades.values())

print(f"{'维度':<16} {'状态':<14} {'等级'}")
print("-"*44)
for k, v in grades.items():
    print(f"{k:<16} {v:<14}")

print(f"\n🔴维度: {red_count}, 🟡维度: {yellow_count}")

# Only the number of red dimensions decides the overall verdict.
if red_count >= 3:
    verdict = "整体判定: 🔴 不合格（需大修）"
elif red_count >= 1:
    verdict = "整体判定: 🟡 需修改"
else:
    verdict = "整体判定: ✅ 合格"
print(verdict)