436 lines
15 KiB
Python
436 lines
15 KiB
Python
#!/usr/bin/env python3
|
||
"""L2剧本审校 - S3-U30-L3 霹雳飞船"""
|
||
import re, json, sys
|
||
|
||
# Read the exported document markdown. The script is Chinese text, so decode
# it explicitly as UTF-8 instead of relying on the platform default encoding.
with open("/tmp/l2_script_review.md", "r", encoding="utf-8") as f:
    content = f.read()

# ========== Parse the script table ==========
# Use the text that follows the "### 剧本" heading; when the heading is
# absent, fall back to scanning the whole document.
script_section = content.split("### 剧本")[1] if "### 剧本" in content else content

# Markup stripped from every table cell, applied in this order.
_CELL_NOISE_PATTERNS = (
    r'<[^>]+>',            # HTML tags
    r'\*\*',               # markdown bold markers
    r'\{align="[^"]*"\}',  # alignment attributes
    r'\{color="[^"]*"\}',  # colour attributes
)

# Parse the lark-table rows.
rows = []
table_match = re.findall(r'<lark-tr>(.*?)</lark-tr>', script_section, re.DOTALL)
for tr in table_match:
    cells = re.findall(r'<lark-td[^>]*>(.*?)</lark-td>', tr, re.DOTALL)
    clean_cells = []
    for c in cells:
        for noise in _CELL_NOISE_PATTERNS:
            c = re.sub(noise, '', c)
        clean_cells.append(c.strip())
    # Rows with fewer than six cells are dropped; the checks below index
    # columns 0 through 5.
    if len(clean_cells) >= 6:
        rows.append(clean_cells)

# The first kept row is treated as the table header.
header = rows[0] if rows else []
print(f"表头: {header}")
print(f"总行数(含表头): {len(rows)}")

data_rows = rows[1:]  # skip the header row
# Column mapping for each data row:
#   0=type, 1=plot content, 2=Chinese dialogue, 3=knowledge point,
#   4=translation, 5=config info
print("\n" + "="*80)
print("一、台词字数统计")
print("="*80)

# Totals above the soft limit are reported as "偏多", above the hard limit
# as "超标".
_CN_SOFT_LIMIT = 2000
_CN_HARD_LIMIT = 2300
_EN_SOFT_LIMIT = 1300
# The original compared against 1300 for both tiers, which made "偏多"
# unreachable for English.
# NOTE(review): 1495 mirrors the 15% tolerance used for Chinese
# (2000 -> 2300, so 1300 * 1.15); confirm against the written spec.
_EN_HARD_LIMIT = 1495


def _length_verdict(total, soft_limit, hard_limit):
    """Return the three-tier verdict label for a total count."""
    if total > hard_limit:
        return '超标'
    if total > soft_limit:
        return '偏多'
    return '合规'


def _sentence_word_limit(row_type):
    """Return the per-sentence English word cap for a row type string."""
    if 'TL' in row_type:
        return 18
    if any(k in row_type for k in ('朗读', '挖空', '组句', '选读', '表达', '选择')):
        return 15
    if '阅读' in row_type:
        return 22
    # '听力' contains '听', so a single substring test covers both keywords.
    if '听' in row_type:
        return 15
    return 18


total_cn_chars = 0
total_en_words = 0
sentence_stats = []

for i, row in enumerate(data_rows):
    row_type = row[0].strip() if len(row) > 0 else ""
    cn_text = row[2].strip() if len(row) > 2 else ""
    en_text = row[4].strip() if len(row) > 4 else ""

    # Chinese length = every non-whitespace character (punctuation included);
    # English length = whitespace-separated tokens.
    cn_count = len(re.sub(r'\s', '', cn_text))
    en_words = len(en_text.split()) if en_text else 0

    total_cn_chars += cn_count
    total_en_words += en_words

    if en_words > 0:
        sentence_stats.append({
            # +2: data rows are 0-based and the header occupies row 1.
            'row': i+2, 'type': row_type, 'cn_chars': cn_count, 'en_words': en_words,
            'en_text_preview': en_text[:80]
        })

print(f"中文对白总字数: {total_cn_chars}")
print(f"英文翻译总词数: {total_en_words}")
print(f"规范: 中文1500-2000字, 英文1000-1300词")
print(f"中文判定: {_length_verdict(total_cn_chars, _CN_SOFT_LIMIT, _CN_HARD_LIMIT)}")
print(f"英文判定: {_length_verdict(total_en_words, _EN_SOFT_LIMIT, _EN_HARD_LIMIT)}")

# Per-sentence word count check.
print("\n单句词数检查:")
for s in sentence_stats:
    w = s['en_words']
    t = s['type']
    limit = _sentence_word_limit(t)
    if w > limit:
        print(f" ⚠️ 行{s['row']} [{t}] {w}词 > {limit}词上限: {s['en_text_preview']}...")
print("\n" + "="*80)
print("二、互动量统计")
print("="*80)

from collections import Counter

# Row types containing any of these keywords count as core interactions.
_CORE_KEYWORDS = ('阅读', '任务对话', '口语妙问', '口语独白', '合作听力', '邮件', '写作')

interaction_types = []
core_interactions = []

for idx, record in enumerate(data_rows):
    kind = record[0].strip() if record else ""
    # Rows without a type, and TL rows, are not interactions.
    if not kind or 'TL' in kind:
        continue
    interaction_types.append(kind)
    # Core-interaction detection applies to interaction rows only.
    if any(keyword in kind for keyword in _CORE_KEYWORDS):
        # +2: data rows are 0-based and the header occupies row 1.
        core_interactions.append({'row': idx + 2, 'type': kind})

interaction_count = len(interaction_types)

print(f"总互动量: {interaction_count} (规范: 22-26)")
print(f"核心互动数: {len(core_interactions)} (规范: 2个/lesson)")

# Distribution of interaction types, most frequent first.
type_dist = Counter(interaction_types)
print("\n互动类型分布:")
for kind, occurrences in type_dist.most_common():
    print(f" {kind}: {occurrences}次")

print("\n核心互动明细:")
for entry in core_interactions:
    print(f" 行{entry['row']}: {entry['type']}")
print("\n" + "="*80)
print("三、知识点覆盖统计")
print("="*80)

# A cell may list several knowledge points separated by newlines, ASCII
# commas, full-width commas (U+FF0C) or ideographic commas. The original
# class listed the ASCII comma twice and never split on U+FF0C; the escape
# keeps the full-width character from being normalised away in the source.
_KP_SEPARATORS = re.compile(r'[\n,\uFF0C、]+')

# Occurrence count per knowledge point, plus the rows where each one appears.
kp_counter = Counter()
kp_rows = {}

for i, row in enumerate(data_rows):
    kp_text = row[3].strip() if len(row) > 3 else ""
    # Splitting an empty cell yields [''], which the emptiness test skips.
    for kp in _KP_SEPARATORS.split(kp_text):
        kp = kp.strip()
        if not kp:
            continue
        kp_counter[kp] += 1
        # +2: data rows are 0-based and the header occupies row 1.
        kp_rows.setdefault(kp, []).append(i+2)

print("知识点出现次数:")
for kp, count in kp_counter.most_common():
    # 2-3 occurrences is in range; more is over, fewer is under.
    status = "✅" if 2 <= count <= 3 else ("⚠️超标" if count > 3 else "⚠️不足")
    print(f" {kp}: {count}次 {status} (行: {kp_rows[kp]})")
print("\n" + "="*80)
print("四、核心互动类型分布")
print("="*80)

# Keyword -> skill bucket for core interactions. The first keyword (in
# insertion order) found inside the row type decides the bucket.
type_map = {
    '阅读理解': '读', '阅读': '读',
    '任务对话': '说', '口语妙问': '说', '口语独白': '说', '口语': '说',
    '合作听力': '听', '听力': '听',
    '邮件撰写': '写', '写作回复': '写', '写作': '写',
}

core_type_count = Counter()
for ci in core_interactions:
    # Types matching no keyword are tallied under '其他'.
    bucket = next(
        (skill for keyword, skill in type_map.items() if keyword in ci['type']),
        '其他',
    )
    core_type_count[bucket] += 1

# Reading a missing Counter key yields 0 without inserting it.
_ratio = " : ".join(f"{skill}{core_type_count[skill]}" for skill in ('读', '说', '听', '写'))
print(f"核心互动类型分布: {_ratio}")
print("规范比例: 读3 : 说3 : 听2 : 写2")
print("\n" + "="*80)
print("五、拼写/语法检查")
print("="*80)

spelling_errors = []
grammar_errors = []

# Common misspellings -> correct spelling.
common_misspellings = {
    'recieve': 'receive', 'seperate': 'separate', 'occured': 'occurred',
    'accomodate': 'accommodate', 'acheive': 'achieve', 'beleive': 'believe',
    'calender': 'calendar', 'definately': 'definitely', 'embarass': 'embarrass',
    'goverment': 'government', 'neccessary': 'necessary', 'occassion': 'occasion',
    'paralel': 'parallel', 'priviledge': 'privilege', 'recomend': 'recommend',
    'tommorow': 'tomorrow', 'untill': 'until', 'wether': 'whether',
}

# Words after which a possessive "its" is accepted as correct.
_ITS_POSSESSIVE_FOLLOWERS = {'own', 'name', 'color', 'size', 'shape', 'way'}

for i, row in enumerate(data_rows):
    en_text = row[4].strip() if len(row) > 4 else ""
    if not en_text:
        continue

    # Spelling check: case-insensitive substring match on the whole cell.
    words_lower = en_text.lower()
    for wrong, correct in common_misspellings.items():
        if wrong in words_lower:
            spelling_errors.append({
                'row': i+2, 'type': '拼写错误', 'original': wrong, 'suggestion': correct,
                'context': en_text[:100]
            })

    # Repeated-word check: two adjacent whitespace-separated tokens that are
    # equal when lower-cased.
    words = en_text.split()
    for current, following in zip(words, words[1:]):
        if current.lower() == following.lower():
            grammar_errors.append({
                'row': i+2, 'type': '重复词', 'original': f"{current} {following}",
                'suggestion': current, 'context': en_text[:100]
            })

    # its/it's check: flag "its" followed by a word that is not a known
    # possessive follower.
    for m in re.finditer(r'\bits\b', en_text):
        pos = m.start()
        after = en_text[pos+3:].strip()
        if not after or not after[0].isalpha():
            continue
        # Trailing punctuation and case are ignored when matching followers.
        next_word = after.split()[0].rstrip('.,!?;:\'")').lower()
        if next_word in _ITS_POSSESSIVE_FOLLOWERS:
            # The original still recorded an entry here with suggestion
            # 'its' (identical to the original text), inflating the count.
            continue
        grammar_errors.append({
            'row': i+2, 'type': "its/it's混淆", 'original': 'its',
            'suggestion': "it's",
            'context': en_text[max(0,pos-20):pos+30]
        })

print(f"拼写错误: {len(spelling_errors)}处")
for e in spelling_errors:
    print(f" 行{e['row']}: {e['original']} → {e['suggestion']} | {e['context'][:60]}")

print(f"\n语法问题: {len(grammar_errors)}处")
for e in grammar_errors:
    print(f" 行{e['row']} [{e['type']}]: {e['original']} → {e['suggestion']} | {e['context'][:60]}")
print("\n" + "="*80)
print("六、词汇难度检查")
print("="*80)

# Load the L2 word list. JSON text is UTF-8, so decode it explicitly instead
# of relying on the platform default encoding.
with open("/root/.openclaw/workspace-xiaoyan/business_knowledge/L2_word_list.json", "r", encoding="utf-8") as f:
    l2_words = json.load(f)

# Build the set of known (in-syllabus) words.
# NOTE(review): the JSON schema is not visible here. Entries are accepted
# either as {"word": ...} objects or as plain strings; the original silently
# ignored string entries, which left the set empty for a flat list of words.
known_words = set()
word_list = l2_words.get('words', l2_words) if isinstance(l2_words, dict) else l2_words
for w in word_list:
    if isinstance(w, dict):
        known_words.add(str(w.get('word') or '').lower().strip())
    elif isinstance(w, str):
        known_words.add(w.lower().strip())
known_words.discard('')

# Common B1+ words treated as above-level unless present in the word list.
b1_words = {
    'unforgivable', 'arrogance', 'corrupts', 'electromagnetic', 'bargain', 'negotiate',
    'convince', 'persuade', 'reluctant', 'desperate', 'frustrated', 'annoyed',
    'embarrassed', 'disappointed', 'impressed', 'fascinated', 'terrified',
    'absolutely', 'definitely', 'probably', 'unfortunately', 'fortunately',
    'accidentally', 'apparently', 'obviously', 'gradually', 'eventually',
    'opportunity', 'experience', 'environment', 'government', 'information',
    'population', 'situation', 'tradition', 'education', 'competition',
    'explosion', 'explode', 'crash', 'collapse', 'destroy', 'damage',
    'repair', 'replace', 'install', 'remove', 'connect', 'disconnect',
    'engine', 'vehicle', 'transport', 'passenger', 'customer', 'client',
    'service', 'product', 'quality', 'quantity', 'price', 'value',
    'similar', 'different', 'important', 'necessary', 'possible', 'impossible',
    'comfortable', 'uncomfortable', 'dangerous', 'safe', 'popular', 'common',
    'special', 'normal', 'strange', 'weird', 'terrible', 'wonderful',
    'excellent', 'perfect', 'awful', 'horrible', 'brilliant', 'fantastic',
    'massive', 'huge', 'tiny', 'enormous', 'giant',
}

# Simplified stand-in for a CEFR lookup: flag every occurrence of a B1+ word
# that is not in the known-word set.
_above_level = b1_words - known_words

vocab_issues = []
for i, row in enumerate(data_rows):
    en_text = row[4].strip() if len(row) > 4 else ""
    if not en_text:
        continue

    for w in re.findall(r'\b[a-zA-Z]+\b', en_text.lower()):
        if len(w) <= 2:
            continue
        if w in _above_level:
            vocab_issues.append({
                'row': i+2, 'word': w, 'context': en_text[:80]
            })

print(f"超纲词(B1+): {len(vocab_issues)}处")
for v in vocab_issues:
    print(f" 行{v['row']}: {v['word']} | {v['context'][:60]}")
print("\n" + "="*80)
print("七、标点符号与价值观检查")
print("="*80)

punct_issues = []
value_issues = []

# Insulting words and violent phrases searched for in the Chinese dialogue.
negative_words = ['白痴', '人渣', '废物', '笨蛋', '蠢货', '傻瓜', '神经病', '去死', '滚开']
violence_words = ['吃人不吐骨', '死得很惨', '只能活一个', '你会后悔的', '让你好看', '杀了你', '打死你']

# Full-width ! ? , and the ideographic full stop. The original class held the
# ASCII characters "!?,", so ordinary English punctuation was reported as
# full-width. Escapes keep the full-width code points from being normalised
# away in the source.
_FULLWIDTH_PUNCT = re.compile(r'[\uFF01\uFF1F\uFF0C\u3002]')
# ASCII ! ? , appearing inside Chinese text.
_HALFWIDTH_PUNCT = re.compile(r'[!?,]')

for i, row in enumerate(data_rows):
    cn_text = row[2].strip() if len(row) > 2 else ""
    en_text = row[4].strip() if len(row) > 4 else ""

    # Punctuation checks.
    if '~' in cn_text:
        punct_issues.append({'row': i+2, 'type': '波浪号', 'text': cn_text[:60]})
    if '!!' in en_text:
        punct_issues.append({'row': i+2, 'type': '双叹号', 'text': en_text[:60]})
    # NOTE(review): table parsing already strips "**" from every cell, so this
    # check cannot fire unless that stripping is removed.
    if '**' in en_text:
        punct_issues.append({'row': i+2, 'type': 'Markdown标记', 'text': en_text[:60]})

    # Full-width punctuation in the English text.
    if _FULLWIDTH_PUNCT.search(en_text):
        punct_issues.append({'row': i+2, 'type': '英文全角标点', 'text': en_text[:60]})

    # Half-width punctuation in the Chinese text.
    if _HALFWIDTH_PUNCT.search(cn_text):
        punct_issues.append({'row': i+2, 'type': '中文半角标点', 'text': cn_text[:60]})

    # Values check: one entry per matching word or phrase.
    for nw in negative_words:
        if nw in cn_text:
            value_issues.append({'row': i+2, 'type': '侮辱性词汇', 'word': nw, 'text': cn_text[:60]})
    for vw in violence_words:
        if vw in cn_text:
            value_issues.append({'row': i+2, 'type': '暴力隐喻', 'word': vw, 'text': cn_text[:60]})

print(f"标点问题: {len(punct_issues)}处")
for p in punct_issues:
    print(f" 行{p['row']} [{p['type']}]: {p['text'][:60]}")

print(f"\n价值观问题: {len(value_issues)}处")
for v in value_issues:
    print(f" 行{v['row']} [{v['type']}]: {v['word']} | {v['text'][:60]}")
print("\n" + "="*80)
print("八、结构性问题检查")
print("="*80)

struct_issues = []

# Leftover strikethrough markers in plot content (1), Chinese dialogue (2)
# and translation (4). Slicing simply yields nothing for a missing column.
for line_no, record in enumerate(data_rows, start=2):
    for cell in record[1:3] + record[4:5]:
        if '~~' in cell:
            struct_issues.append({'row': line_no, 'type': '删除线残留', 'text': cell[:60]})

# Interaction rows (typed, non-TL) whose config column is empty.
_missing_config = []
for line_no, record in enumerate(data_rows, start=2):
    kind = record[0].strip() if record else ""
    if not kind or 'TL' in kind:
        continue
    config_cell = record[5].strip() if len(record) > 5 else ""
    if config_cell:
        continue
    _missing_config.append({'row': line_no, 'type': '组件配置为空', 'text': kind})

empty_config = len(_missing_config)
struct_issues.extend(_missing_config)

print(f"结构性问题: {len(struct_issues)}处")
for issue in struct_issues:
    print(f" 行{issue['row']} [{issue['type']}]: {issue['text'][:60]}")

print(f"\n组件配置为空的互动行: {empty_config}个")
# Section 9 banner (summary scoring): blank line, rule, title, rule.
print("\n" + "=" * 80, "九、总结评分", "=" * 80, sep="\n")
def grade_cn(cn):
    """Grade the total Chinese character count.

    Returns '🔴 超标' above 2300, '🟡 偏多' above 2000, otherwise '✅ 合规'.
    """
    # Checked from the strictest threshold down; first exceeded one wins.
    for threshold, verdict in ((2300, '🔴 超标'), (2000, '🟡 偏多')):
        if cn > threshold:
            return verdict
    return '✅ 合规'
def grade_en(en, soft_limit=1300, hard_limit=1495):
    """Grade the total English word count.

    The original compared against 1300 in both branches, so the '🟡 偏多'
    tier could never be returned. The tiers are now distinct.

    Args:
        en: Total number of English words.
        soft_limit: Counts above this are '🟡 偏多'.
        hard_limit: Counts above this are '🔴 超标'.
            NOTE(review): the default mirrors the 15% tolerance used for the
            Chinese count (2000 -> 2300, so 1300 * 1.15 = 1495); confirm
            against the written spec.

    Returns:
        One of '🔴 超标', '🟡 偏多' or '✅ 合规'.
    """
    if en > hard_limit:
        return '🔴 超标'
    if en > soft_limit:
        return '🟡 偏多'
    return '✅ 合规'
def grade_interaction(n):
    """Grade the interaction count against the 22-26 window.

    Returns '🔴 不足' below 22, '🔴 超标' above 26, otherwise '✅ 合规'.
    """
    too_few = n < 22
    too_many = n > 26
    if too_few:
        return '🔴 不足'
    return '🔴 超标' if too_many else '✅ 合规'
def grade_kp(kp_counter):
    """Grade knowledge-point coverage from a mapping of point -> count.

    A point is out of range when it occurs fewer than 2 or more than 3 times.
    Returns '🔴 问题较多' for three or more such points, '🟡 部分超标' for
    one or two, and '✅ 合规' when there are none.
    """
    out_of_range = len([c for c in kp_counter.values() if c < 2 or c > 3])
    if out_of_range >= 3:
        return '🔴 问题较多'
    if out_of_range > 0:
        return '🟡 部分超标'
    return '✅ 合规'
def grade_spell(n):
    """Grade the combined spelling/grammar issue count.

    Returns '🔴 硬伤较多' above 5, '🟡 少量问题' above 0, otherwise '✅ 合规'.
    """
    return '🔴 硬伤较多' if n > 5 else ('🟡 少量问题' if n > 0 else '✅ 合规')
def grade_vocab(n):
    """Grade the number of above-level vocabulary hits.

    Returns '🔴 超纲较多' above 10, '🟡 少量超纲' above 3, otherwise '✅ 合规'.
    """
    tiers = ((10, '🔴 超纲较多'), (3, '🟡 少量超纲'))
    for threshold, verdict in tiers:
        if n > threshold:
            return verdict
    return '✅ 合规'
def grade_punct(n):
    """Grade the combined punctuation/values issue count.

    Returns '🔴 问题较多' above 10, '🟡 少量问题' above 3, otherwise '✅ 合规'.
    """
    if n > 10:
        return '🔴 问题较多'
    if n > 3:
        return '🟡 少量问题'
    return '✅ 合规'
def grade_struct(n):
    """Grade the structural issue count.

    Returns '🔴 问题较多' above 5, '🟡 少量问题' above 0, otherwise '✅ 合规'.
    """
    verdict = '✅ 合规'
    if n > 5:
        verdict = '🔴 问题较多'
    elif n > 0:
        verdict = '🟡 少量问题'
    return verdict
# Core interactions: exactly two is compliant, more is over, fewer is under.
_core_total = len(core_interactions)
if _core_total == 2:
    _core_grade = '✅ 合规'
elif _core_total > 2:
    _core_grade = '🔴 超标'
else:
    _core_grade = '🟡 不足'

# One verdict string per review dimension, in display order.
grades = {
    '台词字数(中)': grade_cn(total_cn_chars),
    '台词字数(英)': grade_en(total_en_words),
    '互动量': grade_interaction(interaction_count),
    '知识点覆盖': grade_kp(kp_counter),
    '核心互动': _core_grade,
    '拼写/语法': grade_spell(len(spelling_errors) + len(grammar_errors)),
    '词汇难度': grade_vocab(len(vocab_issues)),
    '标点/价值观': grade_punct(len(punct_issues) + len(value_issues)),
    '结构性问题': grade_struct(len(struct_issues)),
}

# Count dimensions by the colour marker embedded in each verdict.
_verdicts = list(grades.values())
red_count = sum('🔴' in verdict for verdict in _verdicts)
yellow_count = sum('🟡' in verdict for verdict in _verdicts)

print(f"{'维度':<16} {'状态':<14} {'等级'}")
print("-"*44)
for dimension, verdict in grades.items():
    print(f"{dimension:<16} {verdict:<14}")

print(f"\n🔴维度: {red_count}, 🟡维度: {yellow_count}")

# Only the red count decides the overall verdict: three or more fails,
# one or two needs changes, none passes.
if red_count >= 3:
    _overall = "整体判定: 🔴 不合格(需大修)"
elif red_count >= 1:
    _overall = "整体判定: 🟡 需修改"
else:
    _overall = "整体判定: ✅ 合格"
print(_overall)