#!/usr/bin/env python3
import argparse
import json
import os
import re
import sys
from pathlib import Path

import yaml
from openai import OpenAI

# Resolve project directories relative to this script's location.
BASE_DIR = Path(__file__).parent.parent
ASSETS_DIR = BASE_DIR / "assets"

# Load external configuration: every tunable parameter lives in a YAML file
# under the assets directory, so tuning never requires code changes.
try:
    # Base configuration
    with open(ASSETS_DIR / "sci_fi_map.yaml", "r", encoding="utf-8") as f:
        SCI_FI_WORD_MAP = yaml.safe_load(f)
    with open(ASSETS_DIR / "stage_config.yaml", "r", encoding="utf-8") as f:
        STAGE_CONFIG = yaml.safe_load(f)
    # Tuning configuration
    with open(ASSETS_DIR / "expression_map.yaml", "r", encoding="utf-8") as f:
        EXPRESSION_MAP = yaml.safe_load(f)
    with open(ASSETS_DIR / "prompt_config.yaml", "r", encoding="utf-8") as f:
        PROMPT_CONFIG = yaml.safe_load(f)
    with open(ASSETS_DIR / "validation_config.yaml", "r", encoding="utf-8") as f:
        VALIDATION_CONFIG = yaml.safe_load(f)
    # Word-list configuration: the file is JSON, so parse it with json.load
    # (stricter and intent-matching) rather than the YAML parser.
    with open(BASE_DIR / "references" / "l1_word_list.json", "r", encoding="utf-8") as f:
        L1_WORD_LIST = {word.lower() for word in json.load(f)}
except Exception as e:
    print(f"❌ 配置文件加载失败,请检查yaml格式是否正确: {str(e)}")
    sys.exit(1)

# Initialize the LLM client. Connection settings come from environment
# variables, with Volcano Engine Ark endpoints as the fallback defaults.
try:
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY", "your-api-key"),
        base_url=os.getenv("OPENAI_BASE_URL", "https://ark.cn-beijing.volces.com/api/v3")
    )
    # Model identifier, overridable via the OPENAI_MODEL environment variable.
    MODEL = os.getenv("OPENAI_MODEL", "volcengine/doubao-seed-2-0-pro-260215")
except Exception as e:
    print(f"❌ LLM客户端初始化失败: {str(e)}")
    sys.exit(1)

def load_input(input_path):
    """Load script content from a single file or batch-load a directory.

    For a file path, returns a one-element list; for a directory, loads
    every ``*.txt`` file inside it. Exits the process with status 1 when
    the path is missing, unsupported, or yields no scripts.

    Returns:
        A list of ``(filename, text)`` tuples.
    """
    input_path = Path(input_path)
    if not input_path.exists():
        print(f"❌ 输入路径不存在: {input_path}")
        sys.exit(1)

    if input_path.is_file():
        with open(input_path, "r", encoding="utf-8") as handle:
            return [(input_path.name, handle.read())]

    if input_path.is_dir():
        # Batch-load every .txt script under the directory.
        script_files = list(input_path.glob("*.txt"))
        if not script_files:
            print(f"❌ 目录下没有找到txt格式的剧本文件: {input_path}")
            sys.exit(1)
        loaded = []
        for script in script_files:
            with open(script, "r", encoding="utf-8") as handle:
                loaded.append((script.name, handle.read()))
        return loaded

    print(f"❌ 不支持的输入类型: {input_path}")
    sys.exit(1)

def get_prompt(input_text, stage):
    """Build the full generation prompt for one script.

    Every tunable rule is read from the external configuration
    (PROMPT_CONFIG / EXPRESSION_MAP / STAGE_CONFIG / SCI_FI_WORD_MAP),
    so prompt tuning never requires code changes.

    Args:
        input_text: Raw script text (Chinese, English, or mixed).
        stage: Target difficulty stage key (e.g. "S1") — must be a key
            of STAGE_CONFIG with "rules", "lexile" and "age" entries.

    Returns:
        The complete prompt string to send to the LLM.
    """
    sci_fi_map_str = "\n".join([f"{k} → {v}" for k, v in SCI_FI_WORD_MAP.items()])
    # Assemble rule snippets dynamically: each feature switch under
    # PROMPT_CONFIG['naturalization'] / ['script_fidelity'] selects between
    # an "enabled" rule text (built from EXPRESSION_MAP) and a fallback.
    emotion_map_rule = "优先使用以下映射匹配情绪词:" + "、".join([f"{k}→{v}" for k,v in EXPRESSION_MAP['emotion_map'].items()]) if PROMPT_CONFIG['naturalization']['enable_emotion_word'] else "不使用自定义情绪词映射"
    synonym_replace_rule = "可使用以下同义口语替换(不改变原意):" + "、".join([f"{k}→{v}" for k,v in EXPRESSION_MAP['synonym_replace'].items()]) if PROMPT_CONFIG['naturalization']['enable_synonym_replace'] else "不使用同义替换"
    split_rule = "包含2个及以上信息的句子拆成单信息短句" if PROMPT_CONFIG['naturalization']['enable_long_sentence_split'] else "不拆分长句"
    repeat_rule = "允许自然重复(比如It is dirty. Very dirty.)" if PROMPT_CONFIG['naturalization']['allow_repeat_expression'] else "不允许重复表达"
    exclamation_rule = "情绪强烈的句子可用感叹号" if PROMPT_CONFIG['naturalization']['enable_exclamation_mark'] else "统一使用句号"
    fidelity_rule = "100%忠于原剧本内容:禁止新增任何原剧本没有的信息、禁止删除任何原剧本已有的信息" if PROMPT_CONFIG['script_fidelity']['strictly_no_add'] and PROMPT_CONFIG['script_fidelity']['strictly_no_delete'] else "允许适当调整细节"

    # NOTE: the prompt body below is runtime text sent to the model — it is
    # deliberately kept in Chinese and must not be reformatted.
    return f"""
你是专为4-8岁儿童打造的英文台词生产专家,严格遵守以下所有规则生成内容,绝对不允许违反:
### 剧本忠实度规则(最高优先级,绝对不能违反)
{fidelity_rule}

### 第一步:输入归一
当前输入是:{input_text}
不管输入是纯中文/纯英文/中英混合,你首先统一转成标准中文「角色: 台词」格式,完整保留所有剧情、动作、角色关系、道具、事件触发点信息,不能丢失任何核心内容。

### 第二步:中文AR预处理
严格遵守4个保留机制(绝对不能改):
1. 保留完整事件动词链
2. 保留所有事件触发点
3. 保留完整道具逻辑链
4. 保留原有角色关系
按以下7条规则拆成单信息短句,1句仅表达1个信息,不改变剧情:
1. 复杂句拆成短句
2. 因果拆分,保留事实不保留连接词
3. 目的拆分,不删目的信息
4. 多步动作拆成单动作句
5. 条件+行为全拆分,去掉假设逻辑
6. 情绪与事实拆分,不修改情绪
7. 去复杂推理,只留可见事实

### 第三步:分级英文生成
目标Stage:{stage}
对应要求:{STAGE_CONFIG[stage]["rules"]}
蓝思值要求:{STAGE_CONFIG[stage]["lexile"]}

自然化要求(**严格遵守剧本忠实度规则,禁止新增/删减任何原剧本没有的内容**):
1. 情绪词映射规则:{emotion_map_rule}
2. 同义替换规则:{synonym_replace_rule}
3. 长句拆分规则:{split_rule}
4. 重复表达规则:{repeat_rule}
5. 标点规则:{exclamation_rule}
6. 绝对禁止成人化连接词(actually/in fact/however等)
7. 完全符合母语小朋友说话习惯,绝对不能有翻译腔
8. 科幻词汇自动按以下映射替换:
{sci_fi_map_str}

### 第四步:自动校验
生成后自行校验以下4项:
1. AR等级合规:S1禁止AR3/AR4,S2禁止AR4
2. 难度合规:词汇/句法/句长/蓝思值完全匹配对应Stage要求,无超纲
3. 自然度合规:无翻译腔,符合4-8岁儿童母语表达习惯
4. 内容合规:无敏感内容,无中式英语

### 输出格式(严格按照格式输出,不要其他内容)
【Stage {stage} 英文台词(适配{STAGE_CONFIG[stage]["age"]})】
角色A: 台词内容
角色B: 台词内容
...
【蓝思值】:[估算值]L
【校验结果】:通过/待优化
【优化建议】:无/具体建议
"""

def generate_single_script(input_text, stage):
    """Generate leveled English lines for one script via a single LLM call.

    Returns the model output, annotated with an out-of-vocabulary reminder
    for the lower stages when applicable, or an error string on failure.
    """
    try:
        prompt = get_prompt(input_text, stage)
        request_args = {
            "model": MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.3,
            "max_tokens": 2000,
            "timeout": 30,
        }
        response = client.chat.completions.create(**request_args)
        result = response.choices[0].message.content
        # Out-of-vocabulary check: only S1/S2 outputs get the reminder.
        oov_words = check_out_of_vocab(result, stage)
        if oov_words and stage in ("S1", "S2"):
            result += f"\n【超纲词提醒】:{', '.join(oov_words)}(请确认是否需要替换)"
        return result
    except Exception as e:
        return f"❌ 生成失败: {str(e)}"

def check_out_of_vocab(script_content, stage):
    """Check the generated script for out-of-vocabulary words.

    Rules come from VALIDATION_CONFIG; the check only applies to the
    "S1" and "S2" stages and can be disabled via `enable_OOV_remind`.

    Args:
        script_content: Generated script text to scan.
        stage: Target difficulty stage key.

    Returns:
        A sorted list of lowercase words absent from L1_WORD_LIST.
        Sorted output is deliberate: a bare set has nondeterministic
        iteration order across runs, which made the reminder text
        unreproducible for identical input.
    """
    if not VALIDATION_CONFIG['vocab_validation']['enable_OOV_remind'] or stage not in ("S1", "S2"):
        return []
    # Extract every English word; apostrophes are kept inside the match so
    # contractions survive, then stripped from the edges.
    words = [word.lower().strip("'") for word in re.findall(r"[a-zA-Z']+", script_content)]
    # Drop configured stop words and single-character tokens.
    stop_words = set(VALIDATION_CONFIG['vocab_validation']['stop_words'])
    candidates = (word for word in words if word not in stop_words and len(word) > 1)
    return sorted({word for word in candidates if word not in L1_WORD_LIST})

def save_result(output_dir, filename, content):
    """Save one generation result to ``<output_dir>/result_<filename>``.

    Bug fix: the output name was previously the hard-coded literal
    ``result_(unknown)``, ignoring `filename` entirely, so batch runs
    overwrote one another's results. The source filename is now embedded
    to keep each output distinct.

    Args:
        output_dir: Directory to write into (created if missing).
        filename: Source script filename, used to build the output name.
        content: Text to write (UTF-8).

    Returns:
        Path of the written file.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"result_{filename}"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(content)
    return output_file

def main():
    """CLI entry point: parse args, generate lines per script, optionally save results."""
    parser = argparse.ArgumentParser(description="4-8岁儿童英文台词标准化生产工具")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--input", type=str, help="直接输入待处理的剧本文本")
    group.add_argument("--path", type=str, help="待处理的单个剧本文件路径或包含多个剧本的目录路径")
    parser.add_argument("--stage", type=str, choices=["S1", "S2", "S3", "S4"], required=True, help="目标难度等级 S1/S2/S3/S4")
    parser.add_argument("--output", type=str, help="结果输出目录,不指定则直接打印到控制台")
    args = parser.parse_args()

    # Resolve input: either literal text or a file/directory path.
    if args.input:
        input_list = [("direct_input", args.input)]
    else:
        input_list = load_input(args.path)

    # Batch-generate, printing (and optionally saving) each result.
    results = []
    for filename, text in input_list:
        # Bug fix: the progress message printed the literal placeholder
        # "(unknown)" instead of the script filename being processed.
        print(f"\n🚀 正在处理: {filename}")
        result = generate_single_script(text, args.stage)
        results.append((filename, result))
        print(result)
        # Persist only when an output directory was requested.
        if args.output:
            save_path = save_result(args.output, filename, result)
            print(f"💾 结果已保存到: {save_path}")

    print(f"\n✅ 全部处理完成,共处理{len(results)}个剧本")

# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()