ai_member_xiaoyan/compare_vocabulary2.py

42 lines
1.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
# Compare two vocabulary workbooks: flag every word in file 2 (first column)
# that also appears in file 1 (first column), report the overlap, and write
# the flagged table back to file 2.

# File paths
file1_path = r'/root/.openclaw/workspace-xiaoyan/business_knowledge/新知识库初版/全包词汇/A2中独有的单词.xlsx'
file2_path = r'/root/.openclaw/workspace-xiaoyan/business_knowledge/新知识库初版/全包词汇/KET和三级不重复.xlsx'

# Column added to file 2 and the values written into it.
# NOTE(review): the previous version wrote '' for BOTH duplicates and
# non-duplicates (the marker glyph appears to have been lost), so every row
# was counted as a duplicate. '是' is used as the duplicate marker here;
# change it if a different marker was intended.
DUPLICATE_COLUMN = '是否与A2独有单词重复'
DUPLICATE_MARK = '是'
NON_DUPLICATE_MARK = ''

# Read both workbooks
df1 = pd.read_excel(file1_path)
df2 = pd.read_excel(file2_path)

print("文件1 (A2中独有的单词.xlsx) 的列名:")
print(df1.columns.tolist())
print("\n文件1 的前5行")
print(df1.head())
print("\n" + "="*80 + "\n")
print("文件2 (KET和三级不重复.xlsx) 的列名:")
print(df2.columns.tolist())
print("\n文件2 的前5行")
print(df2.head())

# Build the lookup set from the first column of file 1. Words are compared
# case-insensitively with surrounding whitespace removed. Missing cells are
# dropped first: astype(str) would otherwise turn them into the text 'nan'
# and make blank cells in the two files match each other.
words_from_file1 = set(
    df1.iloc[:, 0].dropna().astype(str).str.strip().str.lower()
)
words_from_file1.discard('')  # whitespace-only cells are not words

# Flag each row of file 2 whose first-column word is in the lookup set.
# Blank cells in file 2 are never flagged.
file2_words = df2.iloc[:, 0]
is_duplicate = (
    file2_words.notna()
    & file2_words.astype(str).str.strip().str.lower().isin(words_from_file1)
)
df2[DUPLICATE_COLUMN] = is_duplicate.map(
    {True: DUPLICATE_MARK, False: NON_DUPLICATE_MARK}
)

# Overlap statistics
duplicate_count = int(is_duplicate.sum())
total_count = len(df2)
# Guard against an empty file 2, which would otherwise divide by zero.
duplicate_rate = duplicate_count / total_count * 100 if total_count else 0.0
print("\n" + "="*80)
print(f"总单词数:{total_count}")
print(f"重复单词数:{duplicate_count}")
print(f"重复率:{duplicate_rate:.2f}%")

# Save the result. This overwrites file 2 in place with the first sheet's
# data plus the new column.
# NOTE(review): to_excel writes a fresh single-sheet workbook, so any other
# sheets or cell formatting in the original file are not carried over -
# confirm that is acceptable.
df2.to_excel(file2_path, index=False)
print(f"\n已更新文件:{file2_path}")
print(f"新增列名{DUPLICATE_COLUMN}")