kids-english-script-product.../kids-english-script-production/scripts/gen_script.py

213 lines
9.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
import argparse
import sys
import os
import yaml
from openai import OpenAI
from pathlib import Path
# Locate the project root (two levels above this file) and its assets folder.
BASE_DIR = Path(__file__).parent.parent
ASSETS_DIR = BASE_DIR / "assets"


def _read_yaml(config_path):
    """Open one UTF-8 file and return whatever ``yaml.safe_load`` parses from it."""
    with open(config_path, "r", encoding="utf-8") as handle:
        return yaml.safe_load(handle)


# Every tunable parameter is read from external files so that tuning does not
# require a code change. Any failure while loading aborts the program.
try:
    # Base configuration
    SCI_FI_WORD_MAP = _read_yaml(ASSETS_DIR / "sci_fi_map.yaml")
    STAGE_CONFIG = _read_yaml(ASSETS_DIR / "stage_config.yaml")
    # Tuning configuration
    EXPRESSION_MAP = _read_yaml(ASSETS_DIR / "expression_map.yaml")
    PROMPT_CONFIG = _read_yaml(ASSETS_DIR / "prompt_config.yaml")
    VALIDATION_CONFIG = _read_yaml(ASSETS_DIR / "validation_config.yaml")
    # Word list: the JSON file is parsed with the YAML loader and every entry
    # is lower-cased into a set for membership tests.
    L1_WORD_LIST = {
        entry.lower()
        for entry in _read_yaml(BASE_DIR / "references" / "l1_word_list.json")
    }
except Exception as exc:
    print(f"❌ 配置文件加载失败请检查yaml格式是否正确: {exc}")
    sys.exit(1)
# Create the LLM client. The API key, base URL and model name are each read
# from an environment variable, falling back to the default given here.
try:
    _client_settings = {
        "api_key": os.getenv("OPENAI_API_KEY", "your-api-key"),
        "base_url": os.getenv("OPENAI_BASE_URL", "https://ark.cn-beijing.volces.com/api/v3"),
    }
    client = OpenAI(**_client_settings)
    MODEL = os.getenv("OPENAI_MODEL", "volcengine/doubao-seed-2-0-pro-260215")
except Exception as exc:
    print(f"❌ LLM客户端初始化失败: {exc}")
    sys.exit(1)
def load_input(input_path):
    """Load script text from a single file or from every ``*.txt`` file in a directory.

    Args:
        input_path: Path (``str`` or ``Path``) of a script file, or of a
            directory that contains ``*.txt`` script files.

    Returns:
        A list of ``(file_name, file_content)`` tuples. A single file yields a
        one-element list; a directory yields one entry per regular ``*.txt``
        file, ordered by file name.

    Exits:
        Prints an error and calls ``sys.exit(1)`` when the path does not
        exist, when the directory holds no ``*.txt`` files, or when the path
        is neither a regular file nor a directory.
    """
    input_path = Path(input_path)
    if not input_path.exists():
        print(f"❌ 输入路径不存在: {input_path}")
        sys.exit(1)

    if input_path.is_file():
        return [(input_path.name, input_path.read_text(encoding="utf-8"))]

    if input_path.is_dir():
        # glob() yields matches in no guaranteed order and also returns
        # directories whose name ends in ".txt". Keep regular files only and
        # sort by name so batch runs are reproducible and never try to read
        # a directory.
        script_files = sorted(
            (entry for entry in input_path.glob("*.txt") if entry.is_file()),
            key=lambda entry: entry.name,
        )
        if not script_files:
            print(f"❌ 目录下没有找到txt格式的剧本文件: {input_path}")
            sys.exit(1)
        return [
            (script.name, script.read_text(encoding="utf-8"))
            for script in script_files
        ]

    print(f"❌ 不支持的输入类型: {input_path}")
    sys.exit(1)
def get_prompt(input_text, stage):
    """Assemble the full LLM prompt for one script at the requested stage.

    Each switchable rule is chosen from the loaded configuration
    (``PROMPT_CONFIG``, ``EXPRESSION_MAP``, ``SCI_FI_WORD_MAP``) and the
    stage-specific fields come from ``STAGE_CONFIG[stage]``. The result is
    returned as a single string.
    """
    sci_fi_lines = [f"{source}{target}" for source, target in SCI_FI_WORD_MAP.items()]
    sci_fi_map_str = "\n".join(sci_fi_lines)

    naturalization = PROMPT_CONFIG['naturalization']

    # The two mapping rules only touch EXPRESSION_MAP when their switch is on.
    if naturalization['enable_emotion_word']:
        emotion_pairs = "".join(
            f"{source}{target}"
            for source, target in EXPRESSION_MAP['emotion_map'].items()
        )
        emotion_map_rule = "优先使用以下映射匹配情绪词:" + emotion_pairs
    else:
        emotion_map_rule = "不使用自定义情绪词映射"

    if naturalization['enable_synonym_replace']:
        synonym_pairs = "".join(
            f"{source}{target}"
            for source, target in EXPRESSION_MAP['synonym_replace'].items()
        )
        synonym_replace_rule = "可使用以下同义口语替换(不改变原意):" + synonym_pairs
    else:
        synonym_replace_rule = "不使用同义替换"

    # Plain on/off rules: (config switch, text when enabled, text when disabled).
    simple_switches = (
        ('enable_long_sentence_split', "包含2个及以上信息的句子拆成单信息短句", "不拆分长句"),
        ('allow_repeat_expression', "允许自然重复比如It is dirty. Very dirty.", "不允许重复表达"),
        ('enable_exclamation_mark', "情绪强烈的句子可用感叹号", "统一使用句号"),
    )
    split_rule, repeat_rule, exclamation_rule = (
        enabled_text if naturalization[switch] else disabled_text
        for switch, enabled_text, disabled_text in simple_switches
    )

    # Strict fidelity applies only when both the no-add and no-delete switches are on.
    fidelity = PROMPT_CONFIG['script_fidelity']
    if fidelity['strictly_no_add'] and fidelity['strictly_no_delete']:
        fidelity_rule = "100%忠于原剧本内容:禁止新增任何原剧本没有的信息、禁止删除任何原剧本已有的信息"
    else:
        fidelity_rule = "允许适当调整细节"

    stage_cfg = STAGE_CONFIG[stage]
    return f"""
你是专为4-8岁儿童打造的英文台词生产专家严格遵守以下所有规则生成内容绝对不允许违反
### 剧本忠实度规则(最高优先级,绝对不能违反)
{fidelity_rule}
### 第一步:输入归一
当前输入是:{input_text}
不管输入是纯中文/纯英文/中英混合,你首先统一转成标准中文「角色: 台词」格式,完整保留所有剧情、动作、角色关系、道具、事件触发点信息,不能丢失任何核心内容。
### 第二步中文AR预处理
严格遵守4个保留机制绝对不能改
1. 保留完整事件动词链
2. 保留所有事件触发点
3. 保留完整道具逻辑链
4. 保留原有角色关系
按以下7条规则拆成单信息短句1句仅表达1个信息不改变剧情
1. 复杂句拆成短句
2. 因果拆分,保留事实不保留连接词
3. 目的拆分,不删目的信息
4. 多步动作拆成单动作句
5. 条件+行为全拆分,去掉假设逻辑
6. 情绪与事实拆分,不修改情绪
7. 去复杂推理,只留可见事实
### 第三步:分级英文生成
目标Stage{stage}
对应要求:{stage_cfg["rules"]}
蓝思值要求:{stage_cfg["lexile"]}
自然化要求(**严格遵守剧本忠实度规则,禁止新增/删减任何原剧本没有的内容**
1. 情绪词映射规则:{emotion_map_rule}
2. 同义替换规则:{synonym_replace_rule}
3. 长句拆分规则:{split_rule}
4. 重复表达规则:{repeat_rule}
5. 标点规则:{exclamation_rule}
6. 绝对禁止成人化连接词actually/in fact/however等
7. 完全符合母语小朋友说话习惯,绝对不能有翻译腔
8. 科幻词汇自动按以下映射替换:
{sci_fi_map_str}
### 第四步:自动校验
生成后自行校验以下4项
1. AR等级合规S1禁止AR3/AR4S2禁止AR4
2. 难度合规:词汇/句法/句长/蓝思值完全匹配对应Stage要求无超纲
3. 自然度合规无翻译腔符合4-8岁儿童母语表达习惯
4. 内容合规:无敏感内容,无中式英语
### 输出格式(严格按照格式输出,不要其他内容)
【Stage {stage} 英文台词(适配{stage_cfg["age"]})】
角色A: 台词内容
角色B: 台词内容
...
【蓝思值】:[估算值]L
【校验结果】:通过/待优化
【优化建议】:无/具体建议
"""
def generate_single_script(input_text, stage):
    """Generate the dialogue for one script through the chat-completions API.

    The prompt from ``get_prompt`` is sent as a single user message. The
    reply text is returned; for stages S1/S2, when ``check_out_of_vocab``
    reports any words, a reminder line listing them is appended.

    No exception escapes this function: any failure is returned as a
    string that starts with "❌ 生成失败".
    """
    try:
        request = {
            "model": MODEL,
            "messages": [{"role": "user", "content": get_prompt(input_text, stage)}],
            "temperature": 0.3,
            "max_tokens": 2000,
            "timeout": 30,
        }
        completion = client.chat.completions.create(**request)
        script_text = completion.choices[0].message.content
        # Out-of-vocabulary check on the generated text
        flagged_words = check_out_of_vocab(script_text, stage)
        if not flagged_words or stage not in ["S1", "S2"]:
            return script_text
        reminder = f"\n【超纲词提醒】:{', '.join(flagged_words)}(请确认是否需要替换)"
        return script_text + reminder
    except Exception as exc:
        return f"❌ 生成失败: {exc}"
def check_out_of_vocab(script_content, stage):
    """Return the English words in ``script_content`` that are not in ``L1_WORD_LIST``.

    The check runs only when ``enable_OOV_remind`` is switched on in
    ``VALIDATION_CONFIG['vocab_validation']`` and ``stage`` is "S1" or "S2";
    otherwise an empty list is returned.

    Words are runs of ASCII letters and apostrophes, lower-cased, with
    leading/trailing apostrophes stripped. Single-character words and
    configured stop words are ignored.

    Args:
        script_content: Text to scan (the whole string is scanned).
        stage: Target stage identifier such as "S1".

    Returns:
        A list of distinct out-of-vocabulary words in alphabetical order, so
        the result is identical from run to run.
    """
    vocab_cfg = VALIDATION_CONFIG['vocab_validation']
    if not vocab_cfg['enable_OOV_remind'] or stage not in ["S1", "S2"]:
        return []

    import re

    # Candidates are lower-cased before comparison (and L1_WORD_LIST is
    # lower-cased on load), so stop words must be lower-cased too or entries
    # such as "I" would never match. A missing or empty list means no stop words.
    stop_words = {
        entry.lower()
        for entry in (vocab_cfg.get('stop_words') or ())
        if isinstance(entry, str)
    }

    candidates = {
        token.lower().strip("'")
        for token in re.findall(r"[a-zA-Z']+", script_content)
    }

    # sorted() instead of list(set(...)): set order for strings varies between
    # interpreter runs, which made the reminder line unstable.
    return sorted(
        word
        for word in candidates
        if len(word) > 1 and word not in stop_words and word not in L1_WORD_LIST
    )
def save_result(output_dir, filename, content):
    """Write ``content`` to ``<output_dir>/result_<filename>`` and return that path.

    The output directory, including any missing parents, is created first.
    An existing file with the same name is overwritten. The file is written
    as UTF-8.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    destination = target_dir / f"result_{filename}"
    with destination.open("w", encoding="utf-8") as handle:
        handle.write(content)
    return destination
def main():
    """Command-line entry point: parse arguments, generate each script, print and optionally save it.

    Exactly one of ``--input`` (inline script text) or ``--path`` (a script
    file or a directory of scripts) must be given, together with ``--stage``.
    Each result is printed; when ``--output`` is given it is also written
    there through ``save_result``.
    """
    parser = argparse.ArgumentParser(description="4-8岁儿童英文台词标准化生产工具")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--input", type=str, help="直接输入待处理的剧本文本")
    group.add_argument("--path", type=str, help="待处理的单个剧本文件路径或包含多个剧本的目录路径")
    parser.add_argument("--stage", type=str, choices=["S1", "S2", "S3", "S4"], required=True, help="目标难度等级 S1/S2/S3/S4")
    parser.add_argument("--output", type=str, help="结果输出目录,不指定则直接打印到控制台")
    args = parser.parse_args()

    # Resolve the input. Compare against None rather than truthiness:
    # `--input ""` is falsy, and treating it as "not given" would call
    # load_input(None), which fails inside Path(None).
    if args.input is not None:
        if not args.input.strip():
            print("❌ --input 内容不能为空")
            sys.exit(1)
        input_list = [("direct_input", args.input)]
    else:
        input_list = load_input(args.path)

    # Generate each script in turn
    results = []
    for filename, text in input_list:
        print(f"\n🚀 正在处理: {filename}")
        result = generate_single_script(text, args.stage)
        results.append((filename, result))
        print(result)
        # Save the result when an output directory was requested
        if args.output:
            save_path = save_result(args.output, filename, result)
            print(f"💾 结果已保存到: {save_path}")
    print(f"\n✅ 全部处理完成,共处理{len(results)}个剧本")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()