ai_member_xiaoyan/output/S3_U30_L3_审校分析.py

436 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""L2剧本审校 - S3-U30-L3 霹雳飞船"""
import re, json, sys
# Load the exported review document (markdown with embedded lark-table markup).
# The encoding is pinned because the document contains Chinese text and the
# platform default codec is not guaranteed to be UTF-8.
# NOTE(review): assumes the export is UTF-8 - confirm against the exporter.
with open("/tmp/l2_script_review.md", "r", encoding="utf-8") as f:
    content = f.read()

# ========== Parse the script table ==========
# Work on the text after the first "### 剧本" heading; if the heading is
# missing, fall back to scanning the whole document.
script_section = content.split("### 剧本")[1] if "### 剧本" in content else content

# Each <lark-tr> element becomes one list of cleaned cell strings.
rows = []
table_match = re.findall(r'<lark-tr>(.*?)</lark-tr>', script_section, re.DOTALL)
for tr in table_match:
    cells = re.findall(r'<lark-td[^>]*>(.*?)</lark-td>', tr, re.DOTALL)
    clean_cells = []
    for c in cells:
        c = re.sub(r'<[^>]+>', '', c)  # drop remaining HTML tags
        c = re.sub(r'\*\*', '', c)  # drop markdown bold markers
        c = re.sub(r'\{align="[^"]*"\}', '', c)  # drop {align="..."} attributes
        c = re.sub(r'\{color="[^"]*"\}', '', c)  # drop {color="..."} attributes
        clean_cells.append(c.strip())
    # Rows with fewer than six cells cannot be script rows (six columns are
    # read downstream), so they are discarded.
    if len(clean_cells) >= 6:
        rows.append(clean_cells)

# The first surviving row is treated as the table header.
header = rows[0] if rows else []
print(f"表头: {header}")
print(f"总行数(含表头): {len(rows)}")
data_rows = rows[1:]  # everything after the header
# Column map: 0=type, 1=plot, 2=Chinese dialogue, 3=knowledge points,
# 4=translation, 5=component config
print("\n" + "="*80)
print("一、台词字数统计")
print("="*80)

# Accumulate totals over all data rows and remember per-row word counts for
# the per-sentence length check below.
total_cn_chars = 0
total_en_words = 0
sentence_stats = []
for i, row in enumerate(data_rows):
    row_type = row[0].strip() if len(row) > 0 else ""
    cn_text = row[2].strip() if len(row) > 2 else ""
    en_text = row[4].strip() if len(row) > 4 else ""
    cn_count = len(re.sub(r'\s', '', cn_text))  # characters, whitespace excluded
    en_words = len(en_text.split()) if en_text else 0  # whitespace-separated tokens
    total_cn_chars += cn_count
    total_en_words += en_words
    if en_words > 0:
        sentence_stats.append({
            # i + 2: 1-based row number with the header counted as row 1
            'row': i+2, 'type': row_type, 'cn_chars': cn_count, 'en_words': en_words,
            'en_text_preview': en_text[:80]
        })

print(f"中文对白总字数: {total_cn_chars}")
print(f"英文翻译总词数: {total_en_words}")
print(f"规范: 中文1500-2000字, 英文1000-1300词")
print(f"中文判定: {'超标' if total_cn_chars > 2300 else ('偏多' if total_cn_chars > 2000 else '合规')}")
# The original compared against 1300 twice, so "偏多" could never be reported.
# NOTE(review): 1500 as the hard limit mirrors the ~15% tolerance used for the
# Chinese count (2000 -> 2300) - confirm against the review spec.
print(f"英文判定: {'超标' if total_en_words > 1500 else ('偏多' if total_en_words > 1300 else '合规')}")

# Per-sentence word-count check: the limit depends on the row type.
print("\n单句词数检查:")
for s in sentence_stats:
    w = s['en_words']
    t = s['type']
    if 'TL' in t:
        limit = 18
    elif any(k in t for k in ['朗读', '挖空', '组句', '选读', '表达', '选择']):
        limit = 15
    elif '阅读' in t:
        limit = 22
    elif '听力' in t:
        # The original keyword list also held an empty string, which is a
        # substring of every type and made the default branch unreachable.
        limit = 15
    else:
        limit = 18
    if w > limit:
        print(f" ⚠️ 行{s['row']} [{t}] {w}词 > {limit}词上限: {s['en_text_preview']}...")
print("\n" + "="*80)
print("二、互动量统计")
print("="*80)

# Every row with a non-empty type that does not contain 'TL' counts as one
# interaction; those whose type contains a core keyword are also recorded
# as core interactions.
interaction_count = 0
core_interactions = []
interaction_types = []
for i, row in enumerate(data_rows):
    row_type = row[0].strip() if len(row) > 0 else ""
    if row_type and 'TL' not in row_type:
        interaction_count += 1
        interaction_types.append(row_type)
        # Core-interaction detection by keyword
        if any(k in row_type for k in ['阅读', '任务对话', '口语妙问', '口语独白', '合作听力', '邮件', '写作']):
            # i + 2: 1-based row number with the header counted as row 1
            core_interactions.append({'row': i+2, 'type': row_type})

print(f"总互动量: {interaction_count} (规范: 22-26)")
print(f"核心互动数: {len(core_interactions)} (规范: 2个/lesson)")

# Distribution of interaction types, most frequent first.
# Counter is imported here and reused by later sections of the script.
from collections import Counter
type_dist = Counter(interaction_types)
print("\n互动类型分布:")
for t, c in type_dist.most_common():
    print(f" {t}: {c}")

print("\n核心互动明细:")
for ci in core_interactions:
    print(f"{ci['row']}: {ci['type']}")
print("\n" + "="*80)
print("三、知识点覆盖统计")
print("="*80)

# Count how often each knowledge point appears and on which rows.
kp_counter = Counter()
kp_rows = {}
for i, row in enumerate(data_rows):
    kp_text = row[3].strip() if len(row) > 3 else ""
    if not kp_text:
        continue
    # A cell may list several knowledge points separated by newlines, ASCII
    # commas, full-width commas or the enumeration comma. The full-width comma
    # is written as an escape (\uFF0C) so it cannot be silently normalised to
    # an ASCII comma again, which is what left a duplicated ',' in the
    # original character class.
    for kp in re.split(r'[\n,\uFF0C、]+', kp_text):
        kp = kp.strip()
        if kp:
            kp_counter[kp] += 1
            # i + 2: 1-based row number with the header counted as row 1
            kp_rows.setdefault(kp, []).append(i+2)

print("知识点出现次数:")
for kp, count in kp_counter.most_common():
    # 2-3 occurrences is the accepted range; anything else gets a warning tag.
    status = "" if 2 <= count <= 3 else ("⚠️超标" if count > 3 else "⚠️不足")
    print(f" {kp}: {count}{status} (行: {kp_rows[kp]})")
print("\n" + "="*80)
print("四、核心互动类型分布")
print("="*80)

# Map interaction-type keywords to one of the four skill buckets printed
# below (读 = reading, 说 = speaking, 听 = listening, 写 = writing).
# In the original every value was an empty string, so no type ever mapped
# and all four printed counters read the same (always zero) key.
# NOTE(review): bucket labels are taken from the summary line's own labels.
type_map = {
    '阅读理解': '读', '阅读': '读',
    '任务对话': '说', '口语妙问': '说', '口语独白': '说', '口语': '说',
    '合作听力': '听', '听力': '听',
    '邮件撰写': '写', '写作回复': '写', '写作': '写',
    # '邮件' alone is a core-interaction keyword, so map it as well
    '邮件': '写',
}

core_type_count = Counter()
for ci in core_interactions:
    # First keyword contained in the type wins (dict order = declaration order).
    mapped = next((v for k, v in type_map.items() if k in ci['type']), None)
    if mapped:
        core_type_count[mapped] += 1
    else:
        core_type_count['其他'] += 1

print(f"核心互动类型分布: 读{core_type_count.get('读',0)} : 说{core_type_count.get('说',0)} : 听{core_type_count.get('听',0)} : 写{core_type_count.get('写',0)}")
print(f"规范比例: 读3 : 说3 : 听2 : 写2")
print("\n" + "="*80)
print("五、拼写/语法检查")
print("="*80)

spelling_errors = []
grammar_errors = []

# Common misspellings -> correct spelling
common_misspellings = {
    'recieve': 'receive', 'seperate': 'separate', 'occured': 'occurred',
    'accomodate': 'accommodate', 'acheive': 'achieve', 'beleive': 'believe',
    'calender': 'calendar', 'definately': 'definitely', 'embarass': 'embarrass',
    'goverment': 'government', 'neccessary': 'necessary', 'occassion': 'occasion',
    'paralel': 'parallel', 'priviledge': 'privilege', 'recomend': 'recommend',
    'tommorow': 'tomorrow', 'untill': 'until', 'wether': 'whether',
}
# Words after which "its" is accepted as the possessive form.
possessive_followers = ['own', 'name', 'color', 'size', 'shape', 'way']

for i, row in enumerate(data_rows):
    en_text = row[4].strip() if len(row) > 4 else ""
    if not en_text:
        continue
    row_no = i+2  # 1-based row number with the header counted as row 1

    # Spelling: substring match on the lowercased text, so inflected forms
    # of a misspelling are caught too.
    words_lower = en_text.lower()
    for wrong, correct in common_misspellings.items():
        if wrong in words_lower:
            spelling_errors.append({
                'row': row_no, 'type': '拼写错误', 'original': wrong, 'suggestion': correct,
                'context': en_text[:100]
            })

    # Repeated adjacent words (case-insensitive).
    words = en_text.split()
    for first, second in zip(words, words[1:]):
        if first.lower() == second.lower():
            grammar_errors.append({
                'row': row_no, 'type': '重复词', 'original': f"{first} {second}",
                'suggestion': first, 'context': en_text[:100]
            })

    # its / it's heuristic: a lowercase "its" followed by a word is suggested
    # as "it's" unless that word is in the possessive whitelist.
    for m in re.finditer(r'\bits\b', en_text):
        pos = m.start()
        after = en_text[pos+3:].strip()
        if not (after and after[0].isalpha()):
            continue
        if after.split()[0] in possessive_followers:
            # The original recorded these with suggestion 'its', i.e. an
            # "error" whose fix equals the text; that only inflated the count.
            continue
        grammar_errors.append({
            'row': row_no, 'type': "its/it's混淆", 'original': 'its',
            'suggestion': "it's",
            'context': en_text[max(0,pos-20):pos+30]
        })

# The report lines put an arrow between original and suggestion; the original
# format string concatenated them with nothing in between.
print(f"拼写错误: {len(spelling_errors)}")
for e in spelling_errors:
    print(f"{e['row']}: {e['original']} → {e['suggestion']} | {e['context'][:60]}")
print(f"\n语法问题: {len(grammar_errors)}")
for e in grammar_errors:
    print(f"{e['row']} [{e['type']}]: {e['original']} → {e['suggestion']} | {e['context'][:60]}")
print("\n" + "="*80)
print("六、词汇难度检查")
print("="*80)

# Load the L2 word list. The encoding is pinned so the result does not depend
# on the platform default codec.
with open("/root/.openclaw/workspace-xiaoyan/business_knowledge/L2_word_list.json", "r", encoding="utf-8") as f:
    l2_words = json.load(f)

# Build the set of known (in-syllabus) words. The JSON may be a list, or a
# dict holding the list under 'words'.
known_words = set()
word_list = l2_words.get('words', l2_words) if isinstance(l2_words, dict) else l2_words
for w in word_list:
    if isinstance(w, dict):
        known_words.add(w.get('word', '').lower().strip())
    elif isinstance(w, str):
        # Plain-string entries were ignored before, which left the set empty
        # for a flat word list and flagged every candidate word below.
        known_words.add(w.lower().strip())

# Candidate B1+ words treated as potentially above level
b1_words = {
    'unforgivable', 'arrogance', 'corrupts', 'electromagnetic', 'bargain', 'negotiate',
    'convince', 'persuade', 'reluctant', 'desperate', 'frustrated', 'annoyed',
    'embarrassed', 'disappointed', 'impressed', 'fascinated', 'terrified',
    'absolutely', 'definitely', 'probably', 'unfortunately', 'fortunately',
    'accidentally', 'apparently', 'obviously', 'gradually', 'eventually',
    'opportunity', 'experience', 'environment', 'government', 'information',
    'population', 'situation', 'tradition', 'education', 'competition',
    'explosion', 'explode', 'crash', 'collapse', 'destroy', 'damage',
    'repair', 'replace', 'install', 'remove', 'connect', 'disconnect',
    'engine', 'vehicle', 'transport', 'passenger', 'customer', 'client',
    'service', 'product', 'quality', 'quantity', 'price', 'value',
    'similar', 'different', 'important', 'necessary', 'possible', 'impossible',
    'comfortable', 'uncomfortable', 'dangerous', 'safe', 'popular', 'common',
    'special', 'normal', 'strange', 'weird', 'terrible', 'wonderful',
    'excellent', 'perfect', 'awful', 'horrible', 'brilliant', 'fantastic',
    'massive', 'huge', 'tiny', 'enormous', 'giant',
}

# Simplified level check: a word is reported when it is in the B1+ candidate
# set and absent from the known-word set.
vocab_issues = []
for i, row in enumerate(data_rows):
    en_text = row[4].strip() if len(row) > 4 else ""
    if not en_text:
        continue
    for w in re.findall(r'\b[a-zA-Z]+\b', en_text.lower()):
        if len(w) <= 2:
            continue  # skip one- and two-letter words
        if w not in known_words and w in b1_words:
            vocab_issues.append({
                # i + 2: 1-based row number with the header counted as row 1
                'row': i+2, 'word': w, 'context': en_text[:80]
            })

print(f"超纲词(B1+): {len(vocab_issues)}")
for v in vocab_issues:
    print(f"{v['row']}: {v['word']} | {v['context'][:60]}")
print("\n" + "="*80)
print("七、标点符号与价值观检查")
print("="*80)

punct_issues = []
value_issues = []

# Insulting words
negative_words = ['白痴', '人渣', '废物', '笨蛋', '蠢货', '傻瓜', '神经病', '去死', '滚开']
# Violent or threatening phrases
violence_words = ['吃人不吐骨', '死得很惨', '只能活一个', '你会后悔的', '让你好看', '杀了你', '打死你']

# Non-ASCII punctuation is spelled with escapes so it survives tools that
# strip or normalise confusable Unicode characters.
# ASCII tilde, full-width tilde (U+FF5E) and wave dash (U+301C)
tilde_chars = ('~', '\uFF5E', '\u301C')
# Full-width ! ? , and the ideographic full stop
fullwidth_punct = re.compile(r'[\uFF01\uFF1F\uFF0C\u3002]')

for i, row in enumerate(data_rows):
    cn_text = row[2].strip() if len(row) > 2 else ""
    en_text = row[4].strip() if len(row) > 4 else ""
    row_no = i+2  # 1-based row number with the header counted as row 1

    # Tilde in the Chinese dialogue. The original tested for an empty string,
    # which is contained in every text and flagged every row.
    if any(ch in cn_text for ch in tilde_chars):
        punct_issues.append({'row': row_no, 'type': '波浪号', 'text': cn_text[:60]})
    if '!!' in en_text:
        punct_issues.append({'row': row_no, 'type': '双叹号', 'text': en_text[:60]})
    if '**' in en_text:
        punct_issues.append({'row': row_no, 'type': 'Markdown标记', 'text': en_text[:60]})
    # Full-width punctuation in the English text. The original pattern held
    # ASCII '!?,' and therefore matched ordinary English punctuation.
    if fullwidth_punct.search(en_text):
        punct_issues.append({'row': row_no, 'type': '英文全角标点', 'text': en_text[:60]})
    # Half-width (ASCII) punctuation in the Chinese text
    if re.search(r'[!?,]', cn_text):
        punct_issues.append({'row': row_no, 'type': '中文半角标点', 'text': cn_text[:60]})

    # Values check on the Chinese dialogue
    for nw in negative_words:
        if nw in cn_text:
            value_issues.append({'row': row_no, 'type': '侮辱性词汇', 'word': nw, 'text': cn_text[:60]})
    for vw in violence_words:
        if vw in cn_text:
            value_issues.append({'row': row_no, 'type': '暴力隐喻', 'word': vw, 'text': cn_text[:60]})

print(f"标点问题: {len(punct_issues)}")
for p in punct_issues:
    print(f"{p['row']} [{p['type']}]: {p['text'][:60]}")
print(f"\n价值观问题: {len(value_issues)}")
for v in value_issues:
    print(f"{v['row']} [{v['type']}]: {v['word']} | {v['text'][:60]}")
print("\n" + "="*80)
print("八、结构性问题检查")
print("="*80)

struct_issues = []

# Leftover strikethrough markers ("~~") in the plot, Chinese dialogue and
# translation columns.
for i, row in enumerate(data_rows):
    for col_idx in [1, 2, 4]:
        text = row[col_idx] if len(row) > col_idx else ""
        if '~~' in text:
            # i + 2: 1-based row number with the header counted as row 1
            struct_issues.append({'row': i+2, 'type': '删除线残留', 'text': text[:60]})

# Interaction rows (non-empty type without 'TL') whose component-config
# column is empty.
empty_config = 0
for i, row in enumerate(data_rows):
    row_type = row[0].strip() if len(row) > 0 else ""
    if row_type and 'TL' not in row_type:
        config = row[5].strip() if len(row) > 5 else ""
        if not config:
            empty_config += 1
            struct_issues.append({'row': i+2, 'type': '组件配置为空', 'text': row_type})

print(f"结构性问题: {len(struct_issues)}")
for s in struct_issues:
    print(f"{s['row']} [{s['type']}]: {s['text'][:60]}")
print(f"\n组件配置为空的互动行: {empty_config}")
# Section banner for the final summary; the grading helpers follow.
for banner_line in ("\n" + "="*80, "九、总结评分", "="*80):
    print(banner_line)
def grade_cn(cn):
    """Grade the total Chinese character count.

    Returns '🔴 超标' above 2300, '🟡 偏多' above 2000, otherwise '✅ 合规'.
    """
    if cn > 2300:
        return '🔴 超标'
    if cn > 2000:
        return '🟡 偏多'
    return '✅ 合规'
def grade_en(en):
    """Grade the total English word count.

    Returns '🔴 超标' above 1500, '🟡 偏多' above 1300, otherwise '✅ 合规'.

    The original tested ``en > 1300`` in both branches, so '🟡 偏多' could
    never be returned.
    NOTE(review): 1500 as the hard limit mirrors the ~15% tolerance used for
    the Chinese count (2000 -> 2300) - confirm against the review spec.
    """
    if en > 1500:
        return '🔴 超标'
    if en > 1300:
        return '🟡 偏多'
    return '✅ 合规'
def grade_interaction(n):
    """Grade the interaction count against the accepted 22-26 range.

    Returns '🔴 不足' below 22, '🔴 超标' above 26, otherwise '✅ 合规'.
    """
    if n < 22:
        return '🔴 不足'
    if n > 26:
        return '🔴 超标'
    return '✅ 合规'
def grade_kp(kp_counter):
    """Grade knowledge-point coverage.

    ``kp_counter`` maps each knowledge point to its occurrence count. A point
    is out of range when it occurs fewer than 2 or more than 3 times.

    Returns '🔴 问题较多' for three or more out-of-range points,
    '🟡 部分超标' for one or two, otherwise '✅ 合规'.
    """
    issues = sum(1 for c in kp_counter.values() if c > 3 or c < 2)
    if issues >= 3:
        return '🔴 问题较多'
    if issues > 0:
        return '🟡 部分超标'
    return '✅ 合规'
def grade_spell(n):
    """Grade the number of spelling/grammar findings.

    Returns '🔴 硬伤较多' above 5, '🟡 少量问题' above 0, otherwise '✅ 合规'.
    """
    if n > 5:
        return '🔴 硬伤较多'
    if n > 0:
        return '🟡 少量问题'
    return '✅ 合规'
def grade_vocab(n):
    """Grade the number of above-level vocabulary findings.

    Returns '🔴 超纲较多' above 10, '🟡 少量超纲' above 3, otherwise '✅ 合规'.
    """
    if n > 10:
        return '🔴 超纲较多'
    if n > 3:
        return '🟡 少量超纲'
    return '✅ 合规'
def grade_punct(n):
    """Grade the number of punctuation/values findings.

    Returns '🔴 问题较多' above 10, '🟡 少量问题' above 3, otherwise '✅ 合规'.
    """
    if n > 10:
        return '🔴 问题较多'
    if n > 3:
        return '🟡 少量问题'
    return '✅ 合规'
def grade_struct(n):
    """Grade the number of structural findings.

    Returns '🔴 问题较多' above 5, '🟡 少量问题' above 0, otherwise '✅ 合规'.
    """
    if n > 5:
        return '🔴 问题较多'
    if n > 0:
        return '🟡 少量问题'
    return '✅ 合规'
# One grade string per review dimension; each value starts with a status
# emoji (🔴 / 🟡 / ✅) that the counters below look for.
grades = {
    '台词字数(中)': grade_cn(total_cn_chars),
    '台词字数(英)': grade_en(total_en_words),
    '互动量': grade_interaction(interaction_count),
    '知识点覆盖': grade_kp(kp_counter),
    # Exactly two core interactions is compliant; more is red, fewer is yellow.
    '核心互动': '✅ 合规' if len(core_interactions) == 2 else ('🔴 超标' if len(core_interactions) > 2 else '🟡 不足'),
    '拼写/语法': grade_spell(len(spelling_errors) + len(grammar_errors)),
    '词汇难度': grade_vocab(len(vocab_issues)),
    '标点/价值观': grade_punct(len(punct_issues) + len(value_issues)),
    '结构性问题': grade_struct(len(struct_issues)),
}
red_count = sum(1 for v in grades.values() if '🔴' in v)
yellow_count = sum(1 for v in grades.values() if '🟡' in v)

# Summary table
print(f"{'维度':<16} {'状态':<14} {'等级'}")
print("-"*44)
for k, v in grades.items():
    print(f"{k:<16} {v:<14}")
print(f"\n🔴维度: {red_count}, 🟡维度: {yellow_count}")

# Overall verdict depends on the number of red dimensions only.
if red_count >= 3:
    print("整体判定: 🔴 不合格(需大修)")
elif red_count >= 1:
    print("整体判定: 🟡 需修改")
else:
    print("整体判定: ✅ 合格")