ai_member_xiaoyan/compare_l1_l2.py

72 lines
1.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
# Compare the L2 word list against the L1 word list and annotate every L2 row
# with whether its word also appears in L1 (plus the L1 row number, part of
# speech and meaning when it does). The L2 workbook is rewritten in place.

# Input workbooks. l2_file is also the output path.
l2_file = r'/root/.openclaw/workspace-xiaoyan/business_knowledge/L2单词表/L2知识库-三级+A2.xlsx'
l1_file = r'/root/.openclaw/workspace-xiaoyan/business_knowledge/L2单词表/L1完整.xlsx'

# Values written to the "是否为L1单词" column. They must differ from each other;
# with identical markers the column carries no information and the yes/no
# counts (and therefore the duplicate rate) are meaningless.
MARK_IN_L1 = "是"
MARK_NOT_IN_L1 = "否"

# Worksheet row 1 holds the header, so the first data row is row 2.
FIRST_DATA_ROW = 2


def normalize_word(value):
    """Return the case-insensitive lookup key for a cell, or None if blank.

    Missing cells (NaN/None) and whitespace-only cells yield None so that
    blank rows in one sheet never "match" blank rows in the other
    (``str(NaN)`` would otherwise turn every blank cell into the key "nan").
    """
    if pd.isna(value):
        return None
    key = str(value).strip().lower()
    return key or None


def build_l1_index(l1_df):
    """Build a lookup table from the L1 sheet.

    The first column is the word, the second (if present) the part of speech
    and the third (if present) the meaning.

    Returns:
        dict mapping the normalized word to
        ``{"row": worksheet row number, "pos": ..., "meaning": ...}``.
        When a word occurs more than once, the last occurrence wins.
    """
    n_cols = l1_df.shape[1]
    index = {}
    # Row numbers are derived from position rather than from the index
    # labels, so they stay correct even if the frame is not 0..n-1 indexed.
    rows = enumerate(l1_df.itertuples(index=False, name=None), start=FIRST_DATA_ROW)
    for row_number, values in rows:
        key = normalize_word(values[0])
        if key is None:
            continue
        index[key] = {
            "row": row_number,
            "pos": values[1] if n_cols > 1 else "",
            "meaning": values[2] if n_cols > 2 else "",
        }
    return index


def annotate_l2(l2_df, l1_index):
    """Return a copy of the L2 sheet with the four L1 comparison columns.

    Args:
        l2_df: L2 sheet; its first column holds the word.
        l1_index: lookup table produced by ``build_l1_index``.

    Returns:
        A new DataFrame with the columns "是否为L1单词", "在L1中的行数",
        "L1词性" and "L1词义" added (or overwritten if already present).
        Rows whose word is not in L1 get empty strings in the last three.
    """
    is_l1 = []
    l1_rows = []
    l1_pos = []
    l1_meaning = []
    # A sheet with no columns has no words to look up.
    words = l2_df.iloc[:, 0] if l2_df.shape[1] else []
    for value in words:
        info = l1_index.get(normalize_word(value))
        if info is None:
            is_l1.append(MARK_NOT_IN_L1)
            l1_rows.append("")
            l1_pos.append("")
            l1_meaning.append("")
        else:
            is_l1.append(MARK_IN_L1)
            l1_rows.append(info["row"])
            l1_pos.append(info["pos"])
            l1_meaning.append(info["meaning"])

    result = l2_df.copy()
    result["是否为L1单词"] = is_l1
    result["在L1中的行数"] = l1_rows
    result["L1词性"] = l1_pos
    result["L1词义"] = l1_meaning
    return result


def duplicate_rate(matched, total):
    """Return ``matched`` as a percentage of ``total`` (0.0 when total is 0)."""
    if not total:
        return 0.0
    return matched / total * 100


def main():
    """Read both workbooks, annotate L2, overwrite the L2 file, print stats."""
    # Load both sheets.
    l2_df = pd.read_excel(l2_file)
    l1_df = pd.read_excel(l1_file)
    print("L2文件列名:", l2_df.columns.tolist())
    print(f"L2行数: {len(l2_df)}")
    print("\nL1文件列名:", l1_df.columns.tolist())
    print(f"L1行数: {len(l1_df)}")

    # Look every L2 word up in L1 and add the result columns.
    l2_df = annotate_l2(l2_df, build_l1_index(l1_df))

    # Save: this overwrites the L2 input workbook.
    l2_df.to_excel(l2_file, index=False)

    # Summary statistics.
    count_yes = int((l2_df["是否为L1单词"] == MARK_IN_L1).sum())
    count_no = len(l2_df) - count_yes
    print("\n" + "="*50)
    print("比对完成!")
    print(f"L1单词总数: {len(l1_df)}")
    print(f"L2单词总数: {len(l2_df)}")
    print(f"重复单词数: {count_yes}")
    print(f"不重复单词数: {count_no}")
    print(f"重复率: {duplicate_rate(count_yes, len(l2_df)):.2f}%")
    print("="*50)
    print("\n保存成功!已更新 L2知识库-三级+A2.xlsx")


if __name__ == "__main__":
    main()