# File-viewer listing metadata (not part of the program): 42 lines, 1.4 KiB, Python
"""Flag words in one Excel word list that also occur in another.

The first column of each workbook is treated as the word column. Words are
compared after trimming surrounding whitespace and lower-casing. The result
is written back into the second workbook as an extra column.
"""
import pandas as pd

# File paths
file1_path = r'/root/.openclaw/workspace-xiaoyan/business_knowledge/新知识库初版/全包词汇/A2中独有的单词.xlsx'
file2_path = r'/root/.openclaw/workspace-xiaoyan/business_knowledge/新知识库初版/全包词汇/KET和三级不重复.xlsx'

# Name of the column added to file 2 and the value used to mark a duplicate.
DUPLICATE_COLUMN = '是否与A2独有单词重复'
DUPLICATE_MARK = '是'


def normalize_words(column):
    """Return *column* as stripped, lower-cased strings.

    Missing cells and cells that are blank after stripping come back as
    missing values, so they can never take part in a match. Without this,
    ``astype(str)`` would turn an empty cell into the literal word ``'nan'``.

    Args:
        column: A pandas Series holding the raw cell values.

    Returns:
        A Series with the same index as *column*.
    """
    cleaned = column.astype(str).str.strip().str.lower()
    usable = column.notna() & (cleaned != "")
    return cleaned.where(usable)


def mark_duplicates(words, reference_words):
    """Mark each entry of *words* that also occurs in *reference_words*.

    Args:
        words: Series of words to check (file 2's word column).
        reference_words: Series of words to check against (file 1's column).

    Returns:
        A Series aligned with *words* containing ``DUPLICATE_MARK`` for a
        match and an empty string otherwise. Missing/blank cells are never
        marked.
    """
    # Build the lookup set once; membership tests are then O(1) per word.
    reference = set(normalize_words(reference_words).dropna())
    is_duplicate = normalize_words(words).isin(reference)
    return is_duplicate.map({True: DUPLICATE_MARK, False: ''})


def duplicate_rate(duplicate_count, total_count):
    """Return the duplicate percentage, or 0.0 when there are no rows."""
    if total_count == 0:
        return 0.0
    return duplicate_count / total_count * 100


def main():
    """Read both workbooks, flag duplicates in file 2 and save it in place."""
    # Read both files
    df1 = pd.read_excel(file1_path)
    df2 = pd.read_excel(file2_path)

    print("文件1 (A2中独有的单词.xlsx) 的列名:")
    print(df1.columns.tolist())
    print("\n文件1 的前5行:")
    print(df1.head())

    print("\n" + "="*80 + "\n")

    print("文件2 (KET和三级不重复.xlsx) 的列名:")
    print(df2.columns.tolist())
    print("\n文件2 的前5行:")
    print(df2.head())

    # Add a column to file 2 marking words that also appear in the first
    # column of file 1.
    df2[DUPLICATE_COLUMN] = mark_duplicates(df2.iloc[:, 0], df1.iloc[:, 0])

    # Count the duplicates
    duplicate_count = int((df2[DUPLICATE_COLUMN] == DUPLICATE_MARK).sum())
    total_count = len(df2)

    print("\n" + "="*80)
    print(f"总单词数:{total_count}")
    print(f"重复单词数:{duplicate_count}")
    print(f"重复率:{duplicate_rate(duplicate_count, total_count):.2f}%")

    # Save the result. NOTE: this overwrites file 2 in place.
    df2.to_excel(file2_path, index=False)
    print(f"\n已更新文件:{file2_path}")
    print("新增列名:是否与A2独有单词重复")


if __name__ == "__main__":
    main()