#!/usr/bin/env python3
"""L2剧本审校 - S3-U30-L3 霹雳飞船"""
import re, json, sys
# Read the exported document markdown.
# The document contains Chinese text, so decode it explicitly as UTF-8 rather
# than relying on the platform's locale encoding.
with open("/tmp/l2_script_review.md", "r", encoding="utf-8") as f:
    content = f.read()
# ========== Parse the script table ==========
# Only look at the text after the "### 剧本" heading; fall back to the whole
# document when the heading is absent.
script_section = content.split("### 剧本")[1] if "### 剧本" in content else content

# Row / cell patterns for the lark-table markup.
# NOTE(review): the patterns previously had no tag text at all (the row pattern
# was just `(.*?)`, which only ever matches the empty string, so no row could
# reach six cells). The lark-tr / lark-td tag names are reconstructed from the
# "lark-table" wording — confirm against a real export.
_ROW_RE = re.compile(r'<lark-tr[^>]*>(.*?)</lark-tr>', re.DOTALL)
_CELL_RE = re.compile(r'<lark-td[^>]*>(.*?)</lark-td>', re.DOTALL)

rows = []
for tr in _ROW_RE.findall(script_section):
    clean_cells = []
    for c in _CELL_RE.findall(tr):
        # Strip markup in a fixed order: tags first, then markdown/attribute noise.
        c = re.sub(r'<[^>]+>', '', c)            # HTML tags
        c = re.sub(r'\*\*', '', c)               # markdown bold markers
        c = re.sub(r'\{align="[^"]*"\}', '', c)  # {align="..."} attributes
        c = re.sub(r'\{color="[^"]*"\}', '', c)  # {color="..."} attributes
        clean_cells.append(c.strip())
    # Later sections index columns 0-5, so keep only rows with at least six cells.
    if len(clean_cells) >= 6:
        rows.append(clean_cells)

# The first parsed row is treated as the table header.
header = rows[0] if rows else []
print(f"表头: {header}")
print(f"总行数(含表头): {len(rows)}")
data_rows = rows[1:]  # everything after the header
# Column mapping: 0=type, 1=plot content, 2=Chinese dialogue,
# 3=knowledge points, 4=translation, 5=config info
print("\n" + "="*80)
print("一、台词字数统计")
print("="*80)

total_cn_chars = 0
total_en_words = 0
sentence_stats = []
for i, row in enumerate(data_rows):
    row_type = row[0].strip() if len(row) > 0 else ""
    cn_text = row[2].strip() if len(row) > 2 else ""
    en_text = row[4].strip() if len(row) > 4 else ""
    # Chinese length = every non-whitespace character (punctuation included).
    cn_count = len(re.sub(r'\s', '', cn_text))
    # English length = whitespace-separated tokens; empty text yields 0.
    en_words = len(en_text.split())
    total_cn_chars += cn_count
    total_en_words += en_words
    if en_words > 0:
        sentence_stats.append({
            # i + 2: data_rows skips the header row, and rows are reported 1-based.
            'row': i+2, 'type': row_type, 'cn_chars': cn_count, 'en_words': en_words,
            'en_text_preview': en_text[:80]
        })

print(f"中文对白总字数: {total_cn_chars}")
print(f"英文翻译总词数: {total_en_words}")
print(f"规范: 中文1500-2000字, 英文1000-1300词")

if total_cn_chars > 2300:
    cn_verdict = '超标'
elif total_cn_chars > 2000:
    cn_verdict = '偏多'
else:
    cn_verdict = '合规'
print(f"中文判定: {cn_verdict}")

# Both English thresholds used to be 1300, which made '偏多' unreachable.
# NOTE(review): 1500 is chosen by analogy with the Chinese limits
# (2300 is 115% of the 2000 spec maximum; 115% of 1300 is about 1500) —
# confirm against the content spec.
if total_en_words > 1500:
    en_verdict = '超标'
elif total_en_words > 1300:
    en_verdict = '偏多'
else:
    en_verdict = '合规'
print(f"英文判定: {en_verdict}")

# Per-sentence word-limit check. Rules are tried in order and the first rule
# with a keyword contained in the row type wins.
word_limit_rules = [
    (('TL',), 18),
    (('朗读', '挖空', '组句', '选读', '表达', '选择'), 15),
    (('阅读',), 22),
    (('听力', '听'), 15),
]
default_word_limit = 18

print("\n单句词数检查:")
for s in sentence_stats:
    w = s['en_words']
    t = s['type']
    limit = next(
        (cap for keywords, cap in word_limit_rules if any(k in t for k in keywords)),
        default_word_limit,
    )
    if w > limit:
        print(f" ⚠️ 行{s['row']} [{t}] {w}词 > {limit}词上限: {s['en_text_preview']}...")
from collections import Counter

print("\n" + "="*80)
print("二、互动量统计")
print("="*80)

# Row types containing any of these keywords count as core interactions.
core_keywords = ('阅读', '任务对话', '口语妙问', '口语独白', '合作听力', '邮件', '写作')

interaction_types = []
core_interactions = []
for line_no, cells in enumerate(data_rows, start=2):
    kind = cells[0].strip() if cells else ""
    # An interaction is any row with a non-empty type that is not a TL row.
    if not kind or 'TL' in kind:
        continue
    interaction_types.append(kind)
    if any(keyword in kind for keyword in core_keywords):
        core_interactions.append({'row': line_no, 'type': kind})
interaction_count = len(interaction_types)

print(f"总互动量: {interaction_count} (规范: 22-26)")
print(f"核心互动数: {len(core_interactions)} (规范: 2个/lesson)")

# Distribution of interaction types, most frequent first.
type_dist = Counter(interaction_types)
print("\n互动类型分布:")
for kind, times in type_dist.most_common():
    print(f" {kind}: {times}次")

print("\n核心互动明细:")
for entry in core_interactions:
    print(f" 行{entry['row']}: {entry['type']}")
print("\n" + "="*80)
print("三、知识点覆盖统计")
print("="*80)

# kp_counter: occurrences per knowledge point.
# kp_rows: the (1-based, header-inclusive) row numbers where each one appears.
kp_counter = Counter()
kp_rows = {}
for row_no, row in enumerate(data_rows, start=2):
    kp_text = row[3].strip() if len(row) > 3 else ""
    # A cell may list several knowledge points separated by newlines, ASCII
    # commas, full-width commas or the enumeration comma "、". The separator
    # class used to list the ASCII comma twice and never matched ",".
    for kp in re.split(r'[\n,,、]+', kp_text):
        kp = kp.strip()
        if kp:
            kp_counter[kp] += 1
            kp_rows.setdefault(kp, []).append(row_no)

print("知识点出现次数:")
for kp, count in kp_counter.most_common():
    # 2-3 occurrences is the accepted range.
    status = "✅" if 2 <= count <= 3 else ("⚠️超标" if count > 3 else "⚠️不足")
    print(f" {kp}: {count}次 {status} (行: {kp_rows[kp]})")
print("\n" + "="*80)
print("四、核心互动类型分布")
print("="*80)

# Keyword -> skill bucket (读 / 说 / 听 / 写). Keys are tested in insertion
# order and the first key contained in the interaction type decides the bucket.
type_map = {
    '阅读理解': '读', '阅读': '读',
    '任务对话': '说', '口语妙问': '说', '口语独白': '说', '口语': '说',
    '合作听力': '听', '听力': '听',
    '邮件撰写': '写', '写作回复': '写', '写作': '写',
}

core_type_count = Counter()
for entry in core_interactions:
    bucket = next(
        (skill for keyword, skill in type_map.items() if keyword in entry['type']),
        None,
    )
    # Anything that matches no keyword is tallied under '其他'.
    core_type_count[bucket if bucket else '其他'] += 1

ratio_text = " : ".join(f"{skill}{core_type_count[skill]}" for skill in ('读', '说', '听', '写'))
print("核心互动类型分布: " + ratio_text)
print(f"规范比例: 读3 : 说3 : 听2 : 写2")
print("\n" + "="*80)
print("五、拼写/语法检查")
print("="*80)

spelling_errors = []
grammar_errors = []

# Common misspellings -> correct spelling.
common_misspellings = {
    'recieve': 'receive', 'seperate': 'separate', 'occured': 'occurred',
    'accomodate': 'accommodate', 'acheive': 'achieve', 'beleive': 'believe',
    'calender': 'calendar', 'definately': 'definitely', 'embarass': 'embarrass',
    'goverment': 'government', 'neccessary': 'necessary', 'occassion': 'occasion',
    'paralel': 'parallel', 'priviledge': 'privilege', 'recomend': 'recommend',
    'tommorow': 'tomorrow', 'untill': 'until', 'wether': 'whether',
}

# Words after which "its" is accepted as a correct possessive.
possessive_followers = {'own', 'name', 'color', 'size', 'shape', 'way'}

for row_no, row in enumerate(data_rows, start=2):
    en_text = row[4].strip() if len(row) > 4 else ""
    if not en_text:
        continue

    # Spelling: substring match against the lower-cased translation.
    words_lower = en_text.lower()
    for wrong, correct in common_misspellings.items():
        if wrong in words_lower:
            spelling_errors.append({
                'row': row_no, 'type': '拼写错误', 'original': wrong, 'suggestion': correct,
                'context': en_text[:100]
            })

    # Repeated word: two adjacent whitespace-separated tokens equal ignoring case.
    words = en_text.split()
    for first, second in zip(words, words[1:]):
        if first.lower() == second.lower():
            grammar_errors.append({
                'row': row_no, 'type': '重复词', 'original': f"{first} {second}",
                'suggestion': first, 'context': en_text[:100]
            })

    # its / it's: flag a standalone "its" followed by a word that is not in
    # the possessive whitelist. This is a heuristic; every hit needs a human
    # look. Whitelisted followers used to be recorded too, as a no-op
    # "its -> its" finding that inflated the error count, and a follower with
    # trailing punctuation ("own.") missed the whitelist.
    for m in re.finditer(r'\bits\b', en_text):
        follower = re.match(r'\s+([^\W\d_]+)', en_text[m.end():])
        if follower is None:
            continue
        if follower.group(1).lower() in possessive_followers:
            continue
        pos = m.start()
        grammar_errors.append({
            'row': row_no, 'type': "its/it's混淆", 'original': 'its',
            'suggestion': "it's",
            'context': en_text[max(0, pos-20):pos+30]
        })

print(f"拼写错误: {len(spelling_errors)}处")
for e in spelling_errors:
    print(f" 行{e['row']}: {e['original']} → {e['suggestion']} | {e['context'][:60]}")
print(f"\n语法问题: {len(grammar_errors)}处")
for e in grammar_errors:
    print(f" 行{e['row']} [{e['type']}]: {e['original']} → {e['suggestion']} | {e['context'][:60]}")
print("\n" + "="*80)
print("六、词汇难度检查")
print("="*80)

# Load the L2 word list (decoded explicitly as UTF-8).
with open("/root/.openclaw/workspace-xiaoyan/business_knowledge/L2_word_list.json", "r", encoding="utf-8") as f:
    l2_words = json.load(f)

# Build the set of known (in-syllabus) words, lower-cased and stripped.
# Entries may be dicts with a 'word' key or plain strings; plain strings used
# to be ignored, which left the set empty for a list-of-strings file.
# NOTE(review): when the JSON is a dict without a 'words' key, its keys are
# iterated and treated as words — confirm that matches the file's schema.
known_words = set()
word_list = l2_words.get('words', l2_words) if isinstance(l2_words, dict) else l2_words
for w in word_list:
    if isinstance(w, dict):
        entry = w.get('word') or ''
    elif isinstance(w, str):
        entry = w
    else:
        continue
    entry = str(entry).lower().strip()
    if entry:
        known_words.add(entry)

# Words treated as B1+ (beyond the syllabus) unless the L2 list contains them.
b1_words = {
    'unforgivable', 'arrogance', 'corrupts', 'electromagnetic', 'bargain', 'negotiate',
    'convince', 'persuade', 'reluctant', 'desperate', 'frustrated', 'annoyed',
    'embarrassed', 'disappointed', 'impressed', 'fascinated', 'terrified',
    'absolutely', 'definitely', 'probably', 'unfortunately', 'fortunately',
    'accidentally', 'apparently', 'obviously', 'gradually', 'eventually',
    'opportunity', 'experience', 'environment', 'government', 'information',
    'population', 'situation', 'tradition', 'education', 'competition',
    'explosion', 'explode', 'crash', 'collapse', 'destroy', 'damage',
    'repair', 'replace', 'install', 'remove', 'connect', 'disconnect',
    'engine', 'vehicle', 'transport', 'passenger', 'customer', 'client',
    'service', 'product', 'quality', 'quantity', 'price', 'value',
    'similar', 'different', 'important', 'necessary', 'possible', 'impossible',
    'comfortable', 'uncomfortable', 'dangerous', 'safe', 'popular', 'common',
    'special', 'normal', 'strange', 'weird', 'terrible', 'wonderful',
    'excellent', 'perfect', 'awful', 'horrible', 'brilliant', 'fantastic',
    'massive', 'huge', 'tiny', 'enormous', 'giant',
}

# Simplified CEFR check: flag each occurrence of a B1+ word that is not in
# the L2 list. Tokens of one or two letters are skipped.
vocab_issues = []
for row_no, row in enumerate(data_rows, start=2):
    en_text = row[4].strip() if len(row) > 4 else ""
    if not en_text:
        continue
    for w in re.findall(r'\b[a-zA-Z]+\b', en_text.lower()):
        if len(w) <= 2:
            continue
        if w not in known_words and w in b1_words:
            vocab_issues.append({
                'row': row_no, 'word': w, 'context': en_text[:80]
            })

print(f"超纲词(B1+): {len(vocab_issues)}处")
for v in vocab_issues:
    print(f" 行{v['row']}: {v['word']} | {v['context'][:60]}")
print("\n" + "="*80)
print("七、标点符号与价值观检查")
print("="*80)

punct_issues = []
value_issues = []

# Insulting words and violent phrases looked for in the Chinese dialogue.
negative_words = ['白痴', '人渣', '废物', '笨蛋', '蠢货', '傻瓜', '神经病', '去死', '滚开']
violence_words = ['吃人不吐骨', '死得很惨', '只能活一个', '你会后悔的', '让你好看', '杀了你', '打死你']

for row_no, row in enumerate(data_rows, start=2):
    cn_text = row[2].strip() if len(row) > 2 else ""
    en_text = row[4].strip() if len(row) > 4 else ""

    # Tilde in the Chinese dialogue (ASCII or full-width form).
    if '~' in cn_text or '~' in cn_text:
        punct_issues.append({'row': row_no, 'type': '波浪号', 'text': cn_text[:60]})
    if '!!' in en_text:
        punct_issues.append({'row': row_no, 'type': '双叹号', 'text': en_text[:60]})
    # NOTE(review): the table-parsing step of this script strips "**" from
    # every cell, so this check presumably never fires — confirm.
    if '**' in en_text:
        punct_issues.append({'row': row_no, 'type': 'Markdown标记', 'text': en_text[:60]})
    # Full-width punctuation inside the English translation. The character
    # class used to hold the half-width "!?,", which flagged nearly every
    # ordinary English sentence under the "full-width" label.
    if re.search(r'[!?,。]', en_text):
        punct_issues.append({'row': row_no, 'type': '英文全角标点', 'text': en_text[:60]})
    # Half-width punctuation inside the Chinese dialogue.
    if re.search(r'[!?,]', cn_text):
        punct_issues.append({'row': row_no, 'type': '中文半角标点', 'text': cn_text[:60]})

    # Values check: one finding per listed word/phrase present in the row.
    for nw in negative_words:
        if nw in cn_text:
            value_issues.append({'row': row_no, 'type': '侮辱性词汇', 'word': nw, 'text': cn_text[:60]})
    for vw in violence_words:
        if vw in cn_text:
            value_issues.append({'row': row_no, 'type': '暴力隐喻', 'word': vw, 'text': cn_text[:60]})

print(f"标点问题: {len(punct_issues)}处")
for p in punct_issues:
    print(f" 行{p['row']} [{p['type']}]: {p['text'][:60]}")
print(f"\n价值观问题: {len(value_issues)}处")
for v in value_issues:
    print(f" 行{v['row']} [{v['type']}]: {v['word']} | {v['text'][:60]}")
print("\n" + "="*80)
print("八、结构性问题检查")
print("="*80)

# Columns scanned for leftover strikethrough markup:
# 1 = plot content, 2 = Chinese dialogue, 4 = translation.
strike_columns = (1, 2, 4)

# Pass 1: cells that still contain a "~~" strikethrough marker.
struct_issues = [
    {'row': line_no, 'type': '删除线残留', 'text': cells[col][:60]}
    for line_no, cells in enumerate(data_rows, start=2)
    for col in strike_columns
    if col < len(cells) and '~~' in cells[col]
]

# Pass 2: interaction rows (non-empty type, not TL) whose config column is blank.
empty_config = 0
for line_no, cells in enumerate(data_rows, start=2):
    kind = cells[0].strip() if cells else ""
    if not kind or 'TL' in kind:
        continue
    if len(cells) > 5 and cells[5].strip():
        continue
    empty_config += 1
    struct_issues.append({'row': line_no, 'type': '组件配置为空', 'text': kind})

print(f"结构性问题: {len(struct_issues)}处")
for issue in struct_issues:
    print(f" 行{issue['row']} [{issue['type']}]: {issue['text'][:60]}")
print(f"\n组件配置为空的互动行: {empty_config}个")
# Section 9 banner: blank line, rule, title, rule.
banner_rule = "=" * 80
print("\n" + banner_rule, "九、总结评分", banner_rule, sep="\n")
# Summary
def grade_cn(cn):
    """Grade the total Chinese character count.

    Returns '🔴 超标' above 2300, '🟡 偏多' above 2000, and '✅ 合规' otherwise.
    """
    if cn <= 2000:
        return '✅ 合规'
    return '🔴 超标' if cn > 2300 else '🟡 偏多'
def grade_en(en):
    """Grade the total English word count.

    Returns '🔴 超标' above 1500, '🟡 偏多' above 1300, and '✅ 合规' otherwise.

    Both thresholds used to be 1300, so the '🟡 偏多' branch could never run.
    NOTE(review): 1500 is chosen by analogy with the Chinese limits (2300 is
    115% of the 2000 spec maximum; 115% of 1300 is about 1500) — confirm
    against the content spec.
    """
    if en > 1500:
        return '🔴 超标'
    elif en > 1300:
        return '🟡 偏多'
    else:
        return '✅ 合规'
def grade_interaction(n):
    """Grade the interaction count against the 22-26 range.

    Returns '🔴 不足' below 22, '🔴 超标' above 26, and '✅ 合规' inside the range.
    """
    if 22 <= n <= 26:
        return '✅ 合规'
    return '🔴 不足' if n < 22 else '🔴 超标'
def grade_kp(kp_counter):
    """Grade knowledge-point coverage from a mapping of point -> occurrences.

    A point is out of range when it occurs fewer than 2 or more than 3 times.
    Returns '🔴 问题较多' for three or more such points, '🟡 部分超标' for one or
    two, and '✅ 合规' when there are none.
    """
    out_of_range = len([hits for hits in kp_counter.values() if hits < 2 or hits > 3])
    if out_of_range == 0:
        return '✅ 合规'
    if out_of_range < 3:
        return '🟡 部分超标'
    return '🔴 问题较多'
def grade_spell(n):
    """Grade the number of spelling/grammar findings.

    Returns '🔴 硬伤较多' above 5, '🟡 少量问题' for 1-5, and '✅ 合规' for none.
    """
    if n <= 0:
        return '✅ 合规'
    if n <= 5:
        return '🟡 少量问题'
    return '🔴 硬伤较多'
def grade_vocab(n):
    """Grade the number of beyond-syllabus word findings.

    Returns '🔴 超纲较多' above 10, '🟡 少量超纲' above 3, and '✅ 合规' otherwise.
    """
    # Thresholds are checked from the strictest down; the first one exceeded wins.
    for floor, label in ((10, '🔴 超纲较多'), (3, '🟡 少量超纲')):
        if n > floor:
            return label
    return '✅ 合规'
def grade_punct(n):
    """Grade the number of punctuation/values findings.

    Returns '🔴 问题较多' above 10, '🟡 少量问题' above 3, and '✅ 合规' otherwise.
    """
    return '🔴 问题较多' if n > 10 else ('🟡 少量问题' if n > 3 else '✅ 合规')
def grade_struct(n):
    """Grade the number of structural findings.

    Returns '🔴 问题较多' above 5, '🟡 少量问题' for 1-5, and '✅ 合规' for none.
    """
    if n > 0:
        if n > 5:
            return '🔴 问题较多'
        return '🟡 少量问题'
    return '✅ 合规'
# Core-interaction grade: exactly two is compliant, more is over, fewer is short.
core_total = len(core_interactions)
if core_total == 2:
    core_grade = '✅ 合规'
elif core_total > 2:
    core_grade = '🔴 超标'
else:
    core_grade = '🟡 不足'

# One grade string per review dimension, in display order.
grades = {
    '台词字数(中)': grade_cn(total_cn_chars),
    '台词字数(英)': grade_en(total_en_words),
    '互动量': grade_interaction(interaction_count),
    '知识点覆盖': grade_kp(kp_counter),
    '核心互动': core_grade,
    '拼写/语法': grade_spell(len(spelling_errors) + len(grammar_errors)),
    '词汇难度': grade_vocab(len(vocab_issues)),
    '标点/价值观': grade_punct(len(punct_issues) + len(value_issues)),
    '结构性问题': grade_struct(len(struct_issues)),
}

# Count dimensions by the colour marker embedded in the grade string.
red_count = sum('🔴' in grade for grade in grades.values())
yellow_count = sum('🟡' in grade for grade in grades.values())

# Summary table: the header names three columns, each row prints two.
print('维度'.ljust(16), '状态'.ljust(14), '等级')
print("-"*44)
for dimension, grade in grades.items():
    print(dimension.ljust(16), grade.ljust(14))
print(f"\n🔴维度: {red_count}, 🟡维度: {yellow_count}")

# Overall verdict depends on the red count only.
# NOTE(review): yellow_count is reported but does not affect the verdict —
# confirm that is intended.
if red_count >= 3:
    verdict = "整体判定: 🔴 不合格(需大修)"
elif red_count >= 1:
    verdict = "整体判定: 🟡 需修改"
else:
    verdict = "整体判定: ✅ 合格"
print(verdict)