# ai_member_xiaoyan/check_all_words.py


import pandas as pd

# File locations
file3034 = r'/root/.openclaw/workspace-xiaoyan/business_knowledge/L2单词表/30-34.xlsx'
fileL2 = r'/root/.openclaw/workspace-xiaoyan/business_knowledge/L2单词表/L2知识库-三级+A2.xlsx'

# Number of entries shown in the per-word detail listing.
DETAIL_PREVIEW_COUNT = 20


def normalize_word(value):
    """Return the comparison key for a cell: its text, trimmed and lower-cased."""
    return str(value).strip().lower()


def first_column_words(df):
    """Return the non-blank cells of the first column of *df*, in row order.

    Missing values (NaN/None) and whitespace-only cells are skipped so they
    are not mistaken for a word spelled "nan" or "". A frame without any
    columns (e.g. an empty sheet) yields an empty list instead of raising.

    NOTE(review): with its default settings pandas.read_excel also parses
    marker strings such as "NA" or "null" as missing values, so such cells
    are treated as blank here as well — confirm the word lists never contain
    them.
    """
    if df.shape[1] == 0:
        return []
    return [
        cell
        for cell in df.iloc[:, 0]
        if not pd.isna(cell) and normalize_word(cell)
    ]


def find_missing_words(words, known_keys):
    """Return the entries of *words* whose normalized form is not in *known_keys*.

    The original (un-normalized) cell values are returned so the report shows
    them exactly as they appear in the sheet.
    """
    return [word for word in words if normalize_word(word) not in known_keys]


def detail_line(position, word, known_keys):
    """Format one line of the detail listing: position, word, found/missing mark."""
    status = "✅" if normalize_word(word) in known_keys else "❌"
    # str() is required: the "s" format code rejects non-string cells such as
    # numbers, which would otherwise abort the report with a ValueError.
    return f"{position:3d}. {str(word):20s} {status}"


def coverage_percent(found, total):
    """Return found/total as a percentage; 0.0 when there are no words at all."""
    if not total:
        return 0.0
    return found / total * 100


def main():
    """Check that every word of the 30-34 list exists in the L2 knowledge base.

    Reads both Excel files, compares the first column of each
    (case-insensitively, ignoring surrounding whitespace) and prints the
    missing words, a preview of the per-word result and summary statistics.
    """
    # Load both sheets
    words = first_column_words(pd.read_excel(file3034))
    l2_cells = first_column_words(pd.read_excel(fileL2))

    print(f"30-34单词表: {len(words)} 个单词")
    print(f"L2知识库: {len(l2_cells)} 个单词")

    # Lookup set of normalized L2 words
    l2_words = {normalize_word(cell) for cell in l2_cells}

    # Check every word of the 30-34 list
    missing_words = find_missing_words(words, l2_words)

    print("\n" + "="*60)
    if not missing_words:
        print("✅ 所有30-34中的单词都在L2知识库中！")
    else:
        print(f"❌ 有 {len(missing_words)} 个单词不在L2知识库中：")
        for word in missing_words:
            print(f"   - {word}")

    print("\n详细检查结果（前20个）：")
    for position, word in enumerate(words[:DETAIL_PREVIEW_COUNT], start=1):
        print(detail_line(position, word, l2_words))

    print("\n统计信息：")
    total = len(words)
    found = total - len(missing_words)
    print(f"总单词数: {total}")
    print(f"已在知识库: {found}")
    print(f"缺失单词: {len(missing_words)}")
    print(f"覆盖度: {coverage_percent(found, total):.1f}%")


if __name__ == "__main__":
    main()