ai_member_xiaoyan/check_all_words_v2.py

46 lines
1.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
# Workbook locations: the 30-34 word list under test and the L2 knowledge base
# it is checked against.
file3034 = r'/root/.openclaw/workspace-xiaoyan/business_knowledge/L2单词表/30-34.xlsx'
fileL2 = r'/root/.openclaw/workspace-xiaoyan/business_knowledge/L2单词表/L2知识库-三级+A2.xlsx'


def normalize_word(word):
    """Return the comparison key for a cell value.

    The value is stringified, stripped of surrounding whitespace and
    lower-cased, so lookups ignore case and stray spaces.
    """
    # NOTE(review): a blank cell (NaN) becomes the key 'nan' here; confirm
    # whether blank rows should be skipped instead of compared.
    return str(word).strip().lower()


def build_lookup(words):
    """Return the set of normalized keys for every value in *words*."""
    return {normalize_word(word) for word in words}


def find_missing_words(words, lookup):
    """Return the original values from *words* whose key is not in *lookup*.

    Order and duplicates are preserved so the report mirrors the sheet.
    """
    return [word for word in words if normalize_word(word) not in lookup]


def status_mark(word, lookup):
    """Return "✅" when *word* is present in *lookup*, otherwise "❌"."""
    return "✅" if normalize_word(word) in lookup else "❌"


def coverage_percent(found, total):
    """Return found/total as a percentage, or 0.0 when *total* is zero."""
    # An empty word list would otherwise raise ZeroDivisionError.
    return found / total * 100 if total else 0.0


def main():
    """Compare the 30-34 word list with the L2 knowledge base and print a report.

    Reads both workbooks with pandas, reports every word from the '单词'
    column of the 30-34 sheet that is absent from the first column of the
    L2 sheet, lists the status of the first 30 words, and prints summary
    statistics.

    Raises:
        KeyError: if the 30-34 sheet has no '单词' column.
    """
    # Load both workbooks.
    df3034 = pd.read_excel(file3034)
    dfL2 = pd.read_excel(fileL2)
    print(f"30-34单词表: {len(df3034)} 个单词")
    print(f"L2知识库: {len(dfL2)} 个单词")

    # Lookup set built once from the first column of the L2 sheet.
    l2_words = build_lookup(dfL2.iloc[:, 0])

    # Only the '单词' column is needed, so read it directly instead of
    # iterating whole rows.
    words = df3034['单词'].tolist()
    missing_words = find_missing_words(words, l2_words)

    print("\n" + "="*60)
    if not missing_words:
        print("✅ 所有30-34中的单词都在L2知识库中")
    else:
        print(f"❌ 有 {len(missing_words)} 个单词不在L2知识库中")
        for word in missing_words:
            print(f" - {word}")

    print("\n详细检查结果前30个")
    # Number rows by position so the listing does not depend on the
    # DataFrame index being 0-based integers.
    for position, word in enumerate(words[:30], start=1):
        print(f"{position:3d}. {str(word):30s} {status_mark(word, l2_words)}")

    print("\n统计信息:")
    total = len(df3034)
    found = total - len(missing_words)
    print(f"总单词数: {total}")
    print(f"已在知识库: {found}")
    print(f"缺失单词: {len(missing_words)}")
    print(f"覆盖度: {coverage_percent(found, total):.1f}%")


if __name__ == "__main__":
    main()