"""Flag words in one Excel workbook that also appear in a reference workbook.

The first column of each workbook is treated as the word column. Words are
compared after trimming surrounding whitespace and lower-casing. A new column
is added to the second workbook marking rows whose word also occurs in the
first workbook, and the second workbook is then overwritten in place.
"""
import pandas as pd

# Workbook paths.
file1_path = r'/root/.openclaw/workspace-xiaoyan/business_knowledge/新知识库初版/全包词汇/L2知识库-三级+A2.xlsx'
file2_path = r'/root/.openclaw/workspace-xiaoyan/business_knowledge/新知识库初版/全包词汇/KET和三级不重复.xlsx'

# Name of the column added to file 2 and the value written for a duplicate.
DUPLICATE_COLUMN = '是否与L2三级+A2重复'
DUPLICATE_MARK = '是'


def normalize_words(column: pd.Series) -> pd.Series:
    """Return *column* as stripped, lower-cased strings (same index)."""
    return column.astype(str).str.strip().str.lower()


def reference_words(df_reference: pd.DataFrame) -> set:
    """Collect the normalized, non-blank words from the first column."""
    # Drop missing cells first: astype(str) would otherwise turn NaN into the
    # literal word 'nan', which then matches every blank cell on the other side.
    present = df_reference.iloc[:, 0].dropna()
    words = set(normalize_words(present))
    words.discard('')  # whitespace-only cells are not words
    return words


def mark_duplicates(
    df_target: pd.DataFrame,
    df_reference: pd.DataFrame,
    column_name: str = DUPLICATE_COLUMN,
) -> pd.DataFrame:
    """Return a copy of *df_target* with a duplicate-marker column added.

    A row is marked with ``DUPLICATE_MARK`` when the word in its first column
    (trimmed, lower-cased) also occurs in the first column of *df_reference*;
    otherwise the marker cell is an empty string. Missing and blank cells are
    never marked as duplicates.
    """
    known = reference_words(df_reference)
    first_col = df_target.iloc[:, 0]
    is_duplicate = first_col.notna() & normalize_words(first_col).isin(known)

    result = df_target.copy()
    result[column_name] = [DUPLICATE_MARK if flag else '' for flag in is_duplicate]
    return result


def duplicate_rate(duplicate_count: int, total_count: int) -> float:
    """Return the duplicate percentage, or 0.0 when there are no rows."""
    if total_count == 0:
        return 0.0
    return duplicate_count / total_count * 100


def main() -> None:
    """Read both workbooks, mark duplicates in file 2 and save it in place."""
    # Read both files.
    df1 = pd.read_excel(file1_path)
    df2 = pd.read_excel(file2_path)

    print("文件1 (L2知识库-三级+A2.xlsx) 的列名:")
    print(df1.columns.tolist())
    print("\n文件1 的前5行:")
    print(df1.head())

    print("\n" + "=" * 80 + "\n")

    print("文件2 (KET和三级不重复.xlsx) 的列名:")
    print(df2.columns.tolist())
    print("\n文件2 的前5行:")
    print(df2.head())

    # Add the marker column to file 2 based on the words found in file 1.
    df2 = mark_duplicates(df2, df1)

    # Summary statistics.
    duplicate_count = (df2[DUPLICATE_COLUMN] == DUPLICATE_MARK).sum()
    total_count = len(df2)

    print("\n" + "=" * 80)
    print(f"总单词数:{total_count}")
    print(f"重复单词数:{duplicate_count}")
    print(f"重复率:{duplicate_rate(duplicate_count, total_count):.2f}%")

    # Save the result. NOTE: this overwrites the input workbook; only the
    # sheet that was read is written back.
    df2.to_excel(file2_path, index=False)
    print(f"\n已更新文件:{file2_path}")
    print(f"新增列名:{DUPLICATE_COLUMN}")


if __name__ == "__main__":
    main()