#!/usr/bin/env python3
"""L2 script review for lesson S3-U30-L3 (霹雳飞船).

Reads a Markdown export of the lesson document, extracts the script table
that follows the "### 剧本" heading, and prints a nine-part review report:
dialogue length, interaction counts, knowledge-point coverage, core
interaction mix, spelling/grammar heuristics, vocabulary level, punctuation
and content checks, structural checks, and an overall grade.

Usage:
    script.py [markdown_path]    # markdown_path defaults to DOC_PATH
"""
import json
import re
import sys
from collections import Counter

DOC_PATH = "/tmp/l2_script_review.md"
WORD_LIST_PATH = "/root/.openclaw/workspace-xiaoyan/business_knowledge/L2_word_list.json"
SCRIPT_HEADING = "### 剧本"

# Column layout of a script-table row:
# 0=type, 1=plot, 2=Chinese dialogue, 3=knowledge points, 4=translation, 5=config.
COL_TYPE, COL_PLOT, COL_CN, COL_KP, COL_EN, COL_CONFIG = range(6)
MIN_COLUMNS = 6
# Reported row numbers are 1-based and count the header row as row 1.
FIRST_DATA_ROW = 2

# NOTE(review): the tag names are reconstructed. The patterns in the previous
# revision had lost their delimiters and could only match empty strings; the
# accompanying comment spoke of a "lark-table", so <lark-tr>/<lark-td> are
# assumed, with plain <tr>/<td> accepted as well. Confirm against a real export.
_ROW_RE = re.compile(r'<(?:lark-)?tr(?:\s[^>]*)?>(.*?)</(?:lark-)?tr>', re.DOTALL)
_CELL_RE = re.compile(r'<(?:lark-)?td(?:\s[^>]*)?>(.*?)</(?:lark-)?td>', re.DOTALL)

# Knowledge points are separated by newline, ASCII comma, full-width comma
# (U+FF0C) or ideographic comma (U+3001). Escapes keep the full-width
# characters from being silently normalised to ASCII.
_KP_SPLIT_RE = re.compile(r'[\n,\uff0c\u3001]+')
# Full-width ! ? , and the ideographic full stop.
_FULLWIDTH_PUNCT_RE = re.compile(r'[\uff01\uff1f\uff0c\u3002]')
_HALFWIDTH_PUNCT_RE = re.compile(r'[!?,]')

CN_SOFT_LIMIT = 2000
CN_HARD_LIMIT = 2300
EN_SOFT_LIMIT = 1300
# NOTE(review): the previous revision compared against 1300 twice, which made
# the "偏多" verdict unreachable. 1500 mirrors the ~15% tolerance used for the
# Chinese limits (2000 -> 2300); confirm the intended value with the spec owner.
EN_HARD_LIMIT = 1500

CORE_KEYWORDS = ('阅读', '任务对话', '口语妙问', '口语独白', '合作听力', '邮件', '写作')

# Interaction-type keyword -> skill bucket. Order matters: the first keyword
# contained in the type name wins.
TYPE_MAP = {
    '阅读理解': '读', '阅读': '读',
    '任务对话': '说', '口语妙问': '说', '口语独白': '说', '口语': '说',
    '合作听力': '听', '听力': '听',
    '邮件撰写': '写', '写作回复': '写', '写作': '写',
}

COMMON_MISSPELLINGS = {
    'recieve': 'receive', 'seperate': 'separate', 'occured': 'occurred',
    'accomodate': 'accommodate', 'acheive': 'achieve', 'beleive': 'believe',
    'calender': 'calendar', 'definately': 'definitely', 'embarass': 'embarrass',
    'goverment': 'government', 'neccessary': 'necessary', 'occassion': 'occasion',
    'paralel': 'parallel', 'priviledge': 'privilege', 'recomend': 'recommend',
    'tommorow': 'tomorrow', 'untill': 'until', 'wether': 'whether',
}

# Words after which "its" is accepted as a correct possessive.
POSSESSIVE_FOLLOWERS = {'own', 'name', 'color', 'size', 'shape', 'way'}

# Hand-picked words treated as B1+; reported when absent from the L2 word list.
B1_WORDS = {
    'unforgivable', 'arrogance', 'corrupts', 'electromagnetic', 'bargain',
    'negotiate', 'convince', 'persuade', 'reluctant', 'desperate',
    'frustrated', 'annoyed', 'embarrassed', 'disappointed', 'impressed',
    'fascinated', 'terrified', 'absolutely', 'definitely', 'probably',
    'unfortunately', 'fortunately', 'accidentally', 'apparently', 'obviously',
    'gradually', 'eventually', 'opportunity', 'experience', 'environment',
    'government', 'information', 'population', 'situation', 'tradition',
    'education', 'competition', 'explosion', 'explode', 'crash', 'collapse',
    'destroy', 'damage', 'repair', 'replace', 'install', 'remove', 'connect',
    'disconnect', 'engine', 'vehicle', 'transport', 'passenger', 'customer',
    'client', 'service', 'product', 'quality', 'quantity', 'price', 'value',
    'similar', 'different', 'important', 'necessary', 'possible', 'impossible',
    'comfortable', 'uncomfortable', 'dangerous', 'safe', 'popular', 'common',
    'special', 'normal', 'strange', 'weird', 'terrible', 'wonderful',
    'excellent', 'perfect', 'awful', 'horrible', 'brilliant', 'fantastic',
    'massive', 'huge', 'tiny', 'enormous', 'giant',
}

NEGATIVE_WORDS = ['白痴', '人渣', '废物', '笨蛋', '蠢货', '傻瓜', '神经病', '去死', '滚开']
VIOLENCE_WORDS = ['吃人不吐骨', '死得很惨', '只能活一个', '你会后悔的', '让你好看', '杀了你', '打死你']


# ---------------------------------------------------------------- parsing

def _clean_cell(cell):
    """Strip HTML tags, bold markers and {align=..}/{color=..} attributes."""
    cell = re.sub(r'<[^>]+>', '', cell)
    # NOTE: bold markers are removed here, so the later "Markdown标记" check
    # on the translation column can only fire for unbalanced leftovers.
    cell = re.sub(r'\*\*', '', cell)
    cell = re.sub(r'\{align="[^"]*"\}', '', cell)
    cell = re.sub(r'\{color="[^"]*"\}', '', cell)
    return cell.strip()


def parse_rows(content):
    """Return the script table as a list of rows of cleaned cell strings.

    Only the text after the first "### 剧本" heading is scanned (the whole
    document when the heading is absent). Rows with fewer than six cells are
    dropped, so every returned row can be indexed with the COL_* constants.
    The first returned row is the table header.
    """
    section = content.split(SCRIPT_HEADING)[1] if SCRIPT_HEADING in content else content
    rows = []
    for tr in _ROW_RE.findall(section):
        cells = [_clean_cell(c) for c in _CELL_RE.findall(tr)]
        if len(cells) >= MIN_COLUMNS:
            rows.append(cells)
    return rows


# ------------------------------------------------------- 1. dialogue size

def count_dialogue(data_rows):
    """Return (total Chinese chars, total English words, per-row stats).

    Chinese length counts every non-whitespace character; English length is
    the number of whitespace-separated tokens. Stats are kept only for rows
    that have a translation.
    """
    total_cn = 0
    total_en = 0
    stats = []
    for row_no, row in enumerate(data_rows, start=FIRST_DATA_ROW):
        en_text = row[COL_EN].strip()
        cn_count = len(re.sub(r'\s', '', row[COL_CN].strip()))
        en_words = len(en_text.split())
        total_cn += cn_count
        total_en += en_words
        if en_words > 0:
            stats.append({
                'row': row_no,
                'type': row[COL_TYPE].strip(),
                'cn_chars': cn_count,
                'en_words': en_words,
                'en_text_preview': en_text[:80],
            })
    return total_cn, total_en, stats


def sentence_word_limit(row_type):
    """Return the per-sentence English word limit for an interaction type."""
    if 'TL' in row_type:
        return 18
    if any(k in row_type for k in ('朗读', '挖空', '组句', '选读', '表达', '选择')):
        return 15
    if '阅读' in row_type:
        return 22
    if '听' in row_type:  # also covers '听力'
        return 15
    return 18


def _length_verdict(count, soft_limit, hard_limit):
    """Classify a total as 合规 / 偏多 (> soft limit) / 超标 (> hard limit)."""
    if count > hard_limit:
        return '超标'
    if count > soft_limit:
        return '偏多'
    return '合规'


# --------------------------------------------------------- 2. interactions

def collect_interactions(data_rows):
    """Return (types of all non-TL rows, core-interaction records)."""
    interaction_types = []
    core = []
    for row_no, row in enumerate(data_rows, start=FIRST_DATA_ROW):
        row_type = row[COL_TYPE].strip()
        if not row_type or 'TL' in row_type:
            continue
        interaction_types.append(row_type)
        if any(k in row_type for k in CORE_KEYWORDS):
            core.append({'row': row_no, 'type': row_type})
    return interaction_types, core


# ----------------------------------------------------- 3. knowledge points

def count_knowledge_points(data_rows):
    """Return (Counter of knowledge points, {point: [row numbers]}).

    A row number is recorded once per occurrence, so a point listed twice in
    the same cell contributes that row twice.
    """
    kp_counter = Counter()
    kp_rows = {}
    for row_no, row in enumerate(data_rows, start=FIRST_DATA_ROW):
        for kp in _KP_SPLIT_RE.split(row[COL_KP].strip()):
            kp = kp.strip()
            if kp:
                kp_counter[kp] += 1
                kp_rows.setdefault(kp, []).append(row_no)
    return kp_counter, kp_rows


# ------------------------------------------------- 4. core interaction mix

def classify_core_interactions(core_interactions):
    """Count core interactions per skill bucket (读/说/听/写/其他)."""
    counts = Counter()
    for ci in core_interactions:
        bucket = next((v for k, v in TYPE_MAP.items() if k in ci['type']), '其他')
        counts[bucket] += 1
    return counts


# ------------------------------------------------- 5. spelling and grammar

def find_spelling_errors(en_text, row_no):
    """Return a record for every known misspelling contained in *en_text*.

    Matching is by lowercase substring, so inflected forms are caught too.
    """
    lowered = en_text.lower()
    return [
        {
            'row': row_no,
            'type': '拼写错误',
            'original': wrong,
            'suggestion': correct,
            'context': en_text[:100],
        }
        for wrong, correct in COMMON_MISSPELLINGS.items()
        if wrong in lowered
    ]


def find_grammar_errors(en_text, row_no):
    """Return repeated-word and suspected its/it's findings for one row.

    Both checks are heuristics. The its/it's check flags every lowercase
    "its" followed by a word that is not in POSSESSIVE_FOLLOWERS, so it will
    produce false positives for ordinary possessives.
    """
    errors = []

    words = en_text.split()
    for first, second in zip(words, words[1:]):
        if first.lower() == second.lower():
            errors.append({
                'row': row_no,
                'type': '重复词',
                'original': f"{first} {second}",
                'suggestion': first,
                'context': en_text[:100],
            })

    for m in re.finditer(r'\bits\b', en_text):
        pos = m.start()
        following = re.match(r'[A-Za-z]+', en_text[pos + 3:].strip())
        if following is None:
            continue
        # A whitelisted follower means "its" is already correct: reporting
        # "its -> its" would only inflate the error count.
        if following.group().lower() in POSSESSIVE_FOLLOWERS:
            continue
        errors.append({
            'row': row_no,
            'type': "its/it's混淆",
            'original': 'its',
            'suggestion': "it's",
            'context': en_text[max(0, pos - 20):pos + 30],
        })
    return errors


def scan_spelling_and_grammar(data_rows):
    """Run both language checks over every translated row."""
    spelling = []
    grammar = []
    for row_no, row in enumerate(data_rows, start=FIRST_DATA_ROW):
        en_text = row[COL_EN].strip()
        if not en_text:
            continue
        spelling.extend(find_spelling_errors(en_text, row_no))
        grammar.extend(find_grammar_errors(en_text, row_no))
    return spelling, grammar


# ----------------------------------------------------------- 6. vocabulary

def load_known_words(path):
    """Load the L2 word list and return its words lower-cased as a set.

    Accepts either a list or a dict with a 'words' entry; items may be dicts
    carrying a 'word' field or plain strings.
    """
    with open(path, "r", encoding="utf-8") as f:
        l2_words = json.load(f)
    word_list = l2_words.get('words', l2_words) if isinstance(l2_words, dict) else l2_words
    known = set()
    for entry in word_list:
        if isinstance(entry, dict):
            known.add(str(entry.get('word') or '').lower().strip())
        elif isinstance(entry, str):
            known.add(entry.lower().strip())
    return known


def find_vocab_issues(data_rows, known_words):
    """Report every B1_WORDS occurrence that is not in *known_words*."""
    issues = []
    for row_no, row in enumerate(data_rows, start=FIRST_DATA_ROW):
        en_text = row[COL_EN].strip()
        for word in re.findall(r'\b[a-zA-Z]+\b', en_text.lower()):
            if word in B1_WORDS and word not in known_words:
                issues.append({'row': row_no, 'word': word, 'context': en_text[:80]})
    return issues


# ------------------------------------------- 7. punctuation and content

def find_punct_issues(cn_text, en_text, row_no):
    """Return punctuation findings for one row's dialogue and translation."""
    issues = []
    if '~' in cn_text:
        issues.append({'row': row_no, 'type': '波浪号', 'text': cn_text[:60]})
    if '!!' in en_text:
        issues.append({'row': row_no, 'type': '双叹号', 'text': en_text[:60]})
    if '**' in en_text:
        issues.append({'row': row_no, 'type': 'Markdown标记', 'text': en_text[:60]})
    # Full-width punctuation inside the English translation.
    if _FULLWIDTH_PUNCT_RE.search(en_text):
        issues.append({'row': row_no, 'type': '英文全角标点', 'text': en_text[:60]})
    # Half-width punctuation inside the Chinese dialogue.
    if _HALFWIDTH_PUNCT_RE.search(cn_text):
        issues.append({'row': row_no, 'type': '中文半角标点', 'text': cn_text[:60]})
    return issues


def find_value_issues(cn_text, row_no):
    """Return insult / violent-phrase findings for one row's dialogue."""
    issues = []
    for label, phrases in (('侮辱性词汇', NEGATIVE_WORDS), ('暴力隐喻', VIOLENCE_WORDS)):
        for phrase in phrases:
            if phrase in cn_text:
                issues.append({'row': row_no, 'type': label, 'word': phrase, 'text': cn_text[:60]})
    return issues


# ------------------------------------------------------------ 8. structure

def find_struct_issues(data_rows):
    """Return (structural findings, number of interaction rows without config).

    Strikethrough leftovers are listed first, then empty-config rows.
    """
    issues = []
    for row_no, row in enumerate(data_rows, start=FIRST_DATA_ROW):
        for col in (COL_PLOT, COL_CN, COL_EN):
            if '~~' in row[col]:
                issues.append({'row': row_no, 'type': '删除线残留', 'text': row[col][:60]})

    empty_config = 0
    for row_no, row in enumerate(data_rows, start=FIRST_DATA_ROW):
        row_type = row[COL_TYPE].strip()
        if row_type and 'TL' not in row_type and not row[COL_CONFIG].strip():
            empty_config += 1
            issues.append({'row': row_no, 'type': '组件配置为空', 'text': row_type})
    return issues, empty_config


# --------------------------------------------------------------- 9. grades

_VERDICT_BADGE = {'超标': '🔴 超标', '偏多': '🟡 偏多', '合规': '✅ 合规'}


def grade_cn(cn: int) -> str:
    """Grade the total Chinese character count."""
    return _VERDICT_BADGE[_length_verdict(cn, CN_SOFT_LIMIT, CN_HARD_LIMIT)]


def grade_en(en: int) -> str:
    """Grade the total English word count (see EN_HARD_LIMIT note)."""
    return _VERDICT_BADGE[_length_verdict(en, EN_SOFT_LIMIT, EN_HARD_LIMIT)]


def grade_interaction(n: int) -> str:
    """Grade the interaction count against the 22-26 range."""
    if n < 22:
        return '🔴 不足'
    if n > 26:
        return '🔴 超标'
    return '✅ 合规'


def grade_kp(kp_counter) -> str:
    """Grade coverage: each knowledge point should appear 2-3 times."""
    issues = sum(1 for c in kp_counter.values() if c > 3 or c < 2)
    if issues >= 3:
        return '🔴 问题较多'
    if issues > 0:
        return '🟡 部分超标'
    return '✅ 合规'


def grade_spell(n: int) -> str:
    """Grade the combined spelling + grammar finding count."""
    if n > 5:
        return '🔴 硬伤较多'
    if n > 0:
        return '🟡 少量问题'
    return '✅ 合规'


def grade_vocab(n: int) -> str:
    """Grade the number of above-level words."""
    if n > 10:
        return '🔴 超纲较多'
    if n > 3:
        return '🟡 少量超纲'
    return '✅ 合规'


def grade_punct(n: int) -> str:
    """Grade the combined punctuation + content finding count."""
    if n > 10:
        return '🔴 问题较多'
    if n > 3:
        return '🟡 少量问题'
    return '✅ 合规'


def grade_struct(n: int) -> str:
    """Grade the number of structural findings."""
    if n > 5:
        return '🔴 问题较多'
    if n > 0:
        return '🟡 少量问题'
    return '✅ 合规'


def grade_core(n: int) -> str:
    """Grade the number of core interactions (exactly two expected)."""
    if n == 2:
        return '✅ 合规'
    return '🔴 超标' if n > 2 else '🟡 不足'


# ------------------------------------------------------------------ report

def _print_section(title):
    """Print a section banner."""
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)


def main(argv=None):
    """Run the full review and print the report to stdout.

    argv: optional argument list (without the program name); its first item
    overrides DOC_PATH. Defaults to sys.argv[1:].

    Raises whatever open()/json.load() raise when an input file is missing
    or malformed.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    doc_path = args[0] if args else DOC_PATH

    with open(doc_path, "r", encoding="utf-8") as f:
        content = f.read()

    rows = parse_rows(content)
    header = rows[0] if rows else []
    print(f"表头: {header}")
    print(f"总行数(含表头): {len(rows)}")
    data_rows = rows[1:]  # skip the header row

    _print_section("一、台词字数统计")
    total_cn_chars, total_en_words, sentence_stats = count_dialogue(data_rows)
    print(f"中文对白总字数: {total_cn_chars}")
    print(f"英文翻译总词数: {total_en_words}")
    print("规范: 中文1500-2000字, 英文1000-1300词")
    print(f"中文判定: {_length_verdict(total_cn_chars, CN_SOFT_LIMIT, CN_HARD_LIMIT)}")
    print(f"英文判定: {_length_verdict(total_en_words, EN_SOFT_LIMIT, EN_HARD_LIMIT)}")
    print("\n单句词数检查:")
    for s in sentence_stats:
        limit = sentence_word_limit(s['type'])
        if s['en_words'] > limit:
            print(f" ⚠️ 行{s['row']} [{s['type']}] {s['en_words']}词 > {limit}词上限: {s['en_text_preview']}...")

    _print_section("二、互动量统计")
    interaction_types, core_interactions = collect_interactions(data_rows)
    interaction_count = len(interaction_types)
    print(f"总互动量: {interaction_count} (规范: 22-26)")
    print(f"核心互动数: {len(core_interactions)} (规范: 2个/lesson)")
    print("\n互动类型分布:")
    for t, c in Counter(interaction_types).most_common():
        print(f" {t}: {c}次")
    print("\n核心互动明细:")
    for ci in core_interactions:
        print(f" 行{ci['row']}: {ci['type']}")

    _print_section("三、知识点覆盖统计")
    kp_counter, kp_rows = count_knowledge_points(data_rows)
    print("知识点出现次数:")
    for kp, count in kp_counter.most_common():
        status = "✅" if 2 <= count <= 3 else ("⚠️超标" if count > 3 else "⚠️不足")
        print(f" {kp}: {count}次 {status} (行: {kp_rows[kp]})")

    _print_section("四、核心互动类型分布")
    mix = classify_core_interactions(core_interactions)
    print(f"核心互动类型分布: 读{mix['读']} : 说{mix['说']} : 听{mix['听']} : 写{mix['写']}")
    print("规范比例: 读3 : 说3 : 听2 : 写2")

    _print_section("五、拼写/语法检查")
    spelling_errors, grammar_errors = scan_spelling_and_grammar(data_rows)
    print(f"拼写错误: {len(spelling_errors)}处")
    for e in spelling_errors:
        print(f" 行{e['row']}: {e['original']} → {e['suggestion']} | {e['context'][:60]}")
    print(f"\n语法问题: {len(grammar_errors)}处")
    for e in grammar_errors:
        print(f" 行{e['row']} [{e['type']}]: {e['original']} → {e['suggestion']} | {e['context'][:60]}")

    _print_section("六、词汇难度检查")
    # Loaded only now so that the earlier sections are printed even when the
    # word list is unavailable.
    known_words = load_known_words(WORD_LIST_PATH)
    vocab_issues = find_vocab_issues(data_rows, known_words)
    print(f"超纲词(B1+): {len(vocab_issues)}处")
    for v in vocab_issues:
        print(f" 行{v['row']}: {v['word']} | {v['context'][:60]}")

    _print_section("七、标点符号与价值观检查")
    punct_issues = []
    value_issues = []
    for row_no, row in enumerate(data_rows, start=FIRST_DATA_ROW):
        cn_text = row[COL_CN].strip()
        punct_issues.extend(find_punct_issues(cn_text, row[COL_EN].strip(), row_no))
        value_issues.extend(find_value_issues(cn_text, row_no))
    print(f"标点问题: {len(punct_issues)}处")
    for p in punct_issues:
        print(f" 行{p['row']} [{p['type']}]: {p['text'][:60]}")
    print(f"\n价值观问题: {len(value_issues)}处")
    for v in value_issues:
        print(f" 行{v['row']} [{v['type']}]: {v['word']} | {v['text'][:60]}")

    _print_section("八、结构性问题检查")
    struct_issues, empty_config = find_struct_issues(data_rows)
    print(f"结构性问题: {len(struct_issues)}处")
    for s in struct_issues:
        print(f" 行{s['row']} [{s['type']}]: {s['text'][:60]}")
    print(f"\n组件配置为空的互动行: {empty_config}个")

    _print_section("九、总结评分")
    grades = {
        '台词字数(中)': grade_cn(total_cn_chars),
        '台词字数(英)': grade_en(total_en_words),
        '互动量': grade_interaction(interaction_count),
        '知识点覆盖': grade_kp(kp_counter),
        '核心互动': grade_core(len(core_interactions)),
        '拼写/语法': grade_spell(len(spelling_errors) + len(grammar_errors)),
        '词汇难度': grade_vocab(len(vocab_issues)),
        '标点/价值观': grade_punct(len(punct_issues) + len(value_issues)),
        '结构性问题': grade_struct(len(struct_issues)),
    }
    red_count = sum(1 for v in grades.values() if '🔴' in v)
    yellow_count = sum(1 for v in grades.values() if '🟡' in v)

    print(f"{'维度':<16} {'状态':<14} {'等级'}")
    print("-" * 44)
    for k, v in grades.items():
        print(f"{k:<16} {v:<14}")
    print(f"\n🔴维度: {red_count}, 🟡维度: {yellow_count}")
    if red_count >= 3:
        print("整体判定: 🔴 不合格(需大修)")
    elif red_count >= 1:
        print("整体判定: 🟡 需修改")
    else:
        print("整体判定: ✅ 合格")


if __name__ == "__main__":
    main()