# Listing metadata: Python, 53 lines, 2.2 KiB
import pandas as pd
|
|
from openpyxl import load_workbook
|
|
|
|
# File locations: the two input workbooks and the workbook the results are
# written to.
output_file = "/root/.openclaw/workspace-xiaoban/定稿版单词上下册分类结果.xlsx"
origin_file = "/root/.openclaw/media/inbound/é_¾åº_æ_æ_å_è_ç³_æ_1.0---8b762144-a4a3-481d-bdb8-b3b0dcbf875a.xlsx"
final_lib_file = "/root/.openclaw/media/inbound/â_¼ï_LV1-å_ç_å_è_åº_-ç¼_å_é_è_ç_è_é---23d539f8-33d6-4679-b9ae-91520114ae54.xlsx"

# Load the original word list (the one carrying the detailed fields).
df_origin = pd.read_excel(io=origin_file)

# Load the finalized word library.
df_final = pd.read_excel(io=final_lib_file)
|
|
|
|
# Classify a unit code from the finalized library into a volume label.
def get_category(unit):
    """Map a unit code to its volume label.

    Args:
        unit: A cell value holding a unit code such as ``'S0-U1'`` or
            ``'S1-U7'``. May be missing (``None``/NaN) or a non-string
            value; non-strings are converted with ``str`` before matching.

    Returns:
        ``'上册'`` for codes starting with ``'S0-'`` and for ``'S1-U<n>'``
        with ``n <= 6``; ``'下册'`` for ``'S1-U<n>'`` with ``n > 6``;
        ``'不匹配'`` for missing, blank, ``'不常见'``, unrecognised, or
        malformed codes.
    """
    if pd.isna(unit):
        return '不匹配'

    # Cells read from Excel are not guaranteed to be strings (e.g. a purely
    # numeric cell), so normalise before calling string methods.
    unit = str(unit).strip()
    if unit == '' or unit == '不常见':
        return '不匹配'

    if unit.startswith('S0-'):
        return '上册'

    if unit.startswith('S1-U'):
        # Second dash-separated segment is 'U<n>'; drop the leading 'U'.
        number_text = unit.split('-')[1][1:]
        try:
            unit_num = int(number_text)
        except ValueError:
            # Malformed unit number (e.g. 'S1-U' or 'S1-Ux'): treat it as
            # unmatched instead of aborting the whole classification.
            return '不匹配'
        return '上册' if unit_num <= 6 else '下册'

    return '不匹配'
|
|
|
|
# Label every row of the finalized library with its volume, derived from the
# unit code in the '占用情况' column.
df_final['分类'] = df_final['占用情况'].apply(get_category)

# Build a word -> volume lookup from the rows that received a real volume
# label; when a word occurs more than once, its first occurrence is kept.
word_category_map = (
    df_final.loc[df_final['分类'] != '不匹配']
    .drop_duplicates(subset='单词')
    .set_index('单词')['分类']
    .to_dict()
)

# Attach the volume label to the original word list; words that are absent
# from the lookup end up as NaN.
df_origin['分类'] = df_origin['单词'].map(word_category_map)

# Split the original list by volume. The helper '分类' column is dropped from
# each part so it does not appear in the exported sheets.
_category = df_origin['分类']
df_upper = df_origin.loc[_category == '上册'].drop(columns='分类')
df_lower = df_origin.loc[_category == '下册'].drop(columns='分类')
df_other = df_origin.loc[~_category.isin(['上册', '下册'])].drop(columns='分类')

# Export: one sheet per volume, plus a sheet for the unmatched words only when
# there are any.
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    for _sheet_name, _frame in (
        ('上册单词(定稿版)', df_upper),
        ('下册单词(定稿版)', df_lower),
    ):
        _frame.to_excel(writer, sheet_name=_sheet_name, index=False)
    if not df_other.empty:
        df_other.to_excel(writer, sheet_name='未匹配到定稿库的单词', index=False)

# Summary of what was written.
print(f"处理完成!结果已保存到:{output_file}")
print(f"上册匹配到单词数量:{len(df_upper)}")
print(f"下册匹配到单词数量:{len(df_lower)}")
print(f"未匹配到定稿库的单词数量:{len(df_other)}")
|