ai_member_xiaoban/reclassify_word.py
2026-03-14 08:00:01 +08:00

53 lines
2.2 KiB
Python

import pandas as pd
from openpyxl import load_workbook
# Input workbooks and the location of the generated result workbook.
origin_file = "/root/.openclaw/media/inbound/é_¾åº_æ_æ_å_è_ç³_æ_1.0---8b762144-a4a3-481d-bdb8-b3b0dcbf875a.xlsx"
final_lib_file = "/root/.openclaw/media/inbound/â_¼ï_LV1-å_ç_å_è_åº_-ç¼_å_é_è_ç_è_é---23d539f8-33d6-4679-b9ae-91520114ae54.xlsx"
output_file = "/root/.openclaw/workspace-xiaoban/定稿版单词上下册分类结果.xlsx"
# Load the original word list (the one with the detailed fields) first, then
# the finalized word library. The generator keeps the two reads in that order.
df_origin, df_final = (
    pd.read_excel(workbook_path)
    for workbook_path in (origin_file, final_lib_file)
)
# Label each word in the finalized library as upper volume or lower volume
def get_category(unit):
    """Map a unit code to its textbook volume label.

    Args:
        unit: Raw cell value holding a unit code such as 'S0-...' or
            'S1-U<n>...'. May be missing (None/NaN) or a non-string value.

    Returns:
        '上册' (upper volume) when the code starts with 'S0-' or is an
        'S1-U<n>' code with n <= 6; '下册' (lower volume) for an 'S1-U<n>'
        code with n > 6; '不匹配' (no match) for everything else, including
        missing values, non-string cells, the marker '不常见', and 'S1-U'
        codes whose unit number cannot be parsed.
    """
    unmatched = '不匹配'
    # Missing cells (None/NaN) and any other non-text cell (e.g. a number read
    # from Excel) cannot carry a unit code; treat them as unmatched instead of
    # failing on .strip().
    if not isinstance(unit, str):
        return unmatched
    unit = unit.strip()
    if unit in ('', '不常见'):
        return unmatched
    if unit.startswith('S0-'):
        return '上册'
    if unit.startswith('S1-U'):
        # The second '-'-separated field looks like 'U<n>'; drop the 'U'.
        try:
            unit_num = int(unit.split('-')[1][1:])
        except ValueError:
            # Malformed code such as 'S1-U' or 'S1-Ux': no usable unit number.
            return unmatched
        return '上册' if unit_num <= 6 else '下册'
    return unmatched
# Tag every row of the finalized library with its volume label.
df_final['分类'] = df_final['占用情况'].apply(get_category)

# Build the word -> label lookup from rows that received a real label; when a
# word appears more than once, the first occurrence is the one kept.
labelled_rows = df_final[df_final['分类'] != '不匹配']
first_per_word = labelled_rows.drop_duplicates('单词')
word_category_map = first_per_word.set_index('单词')['分类'].to_dict()

# Look up each word of the original list; words missing from the lookup get NaN.
df_origin['分类'] = df_origin['单词'].map(word_category_map)

# Split the original list by label. The helper column is dropped from each part.
in_upper = df_origin['分类'] == '上册'
in_lower = df_origin['分类'] == '下册'
df_upper = df_origin[in_upper].drop(columns=['分类'])
df_lower = df_origin[in_lower].drop(columns=['分类'])
df_other = df_origin[~(in_upper | in_lower)].drop(columns=['分类'])

# Write one sheet per part; the "unmatched" sheet is only created when needed.
sheets_to_write = [
    ('上册单词(定稿版)', df_upper),
    ('下册单词(定稿版)', df_lower),
]
if len(df_other) > 0:
    sheets_to_write.append(('未匹配到定稿库的单词', df_other))
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    for sheet_title, sheet_frame in sheets_to_write:
        sheet_frame.to_excel(writer, sheet_name=sheet_title, index=False)

# Report where the result went and how many words landed in each part.
print(f"处理完成!结果已保存到:{output_file}")
print(f"上册匹配到单词数量:{len(df_upper)}")
print(f"下册匹配到单词数量:{len(df_lower)}")
print(f"未匹配到定稿库的单词数量:{len(df_other)}")