# Source listing metadata: 42 lines, 1.6 KiB, Python
import pandas as pd

# File paths.
# NOTE(review): the file names look mis-encoded; they are kept verbatim on the
# assumption that they match the names stored on disk -- verify before editing.
final_lib_file = "/root/.openclaw/media/inbound/â_¼ï_LV1-å_ç_å_è_åº_-ç¼_å_é_è_ç_è_é---1de9de11-1a6b-45c7-856a-4d69f9b26aa9.xlsx"  # finalized word library
difficulty_file = "/root/.openclaw/media/inbound/é_¾åº_æ_æ_å_è_ç³_æ_1.0---a5011ea1-5bef-47af-be44-633db83f822e.xlsx"  # difficulty table

# Load both workbooks into DataFrames. No sheet_name is passed, so pandas
# reads its default sheet (the first one) from each file.
df_final = pd.read_excel(final_lib_file)
df_diff = pd.read_excel(difficulty_file)

# Normalize the finalized-library words: drop null cells and non-string
# values (e.g. numbers), then lowercase so the comparison is case-insensitive.
final_words = [
    w.lower()
    for w in df_final['单词'].tolist()
    if pd.notna(w) and isinstance(w, str)
]
final_set = set(final_words)
print(f"定稿库有效单词(纯字符串,去空):{len(final_set)}个")
print(f"定稿库原始总条目数:{len(df_final)}")
# Subtract the list length (not the set size) so duplicated words are not
# miscounted as invalid entries.
print(f"定稿库非字符串/空值条目数:{len(df_final) - len(final_words)}")

# Normalize the difficulty-table words the same way: keep only non-null
# strings and lowercase them.
diff_words = [
    w.lower()
    for w in df_diff['单词'].tolist()
    if pd.notna(w) and isinstance(w, str)
]
diff_set = set(diff_words)
print(f"\n难度表有效单词:{len(diff_set)}个")
print(f"难度表原始总条目数:{len(df_diff)}")

# Difference statistics, taken from the difficulty table's point of view:
# how many of its distinct words appear in the finalized library, and how
# many do not.
match_count = len(diff_set & final_set)
unmatch_count = len(diff_set - final_set)
print(f"\n匹配上的单词数量:{match_count}")
print(f"未匹配的单词数量:{unmatch_count}")

# Show a sample of the finalized-library entries that are not valid words
# (null cells or non-string values), together with their Python type.
print("\n定稿库中不是有效单词的内容示例:")
for w in df_final['单词'].tolist():
    if pd.isna(w) or not isinstance(w, str):
        print(w, type(w))
        # Stop after the first invalid entry has been printed.
        break