ai_member_xiaoxi/scripts/welfare_excluded_export.py
2026-05-20 08:00:01 +08:00

158 lines
6.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""生成剔除数据汇总 Excel被剔除的低价订单 + 重复 trade_no"""
import csv
CHEAP_FILE = '/root/.openclaw/workspace/output/welfare_excluded_cheap.txt'
FINAL_OUTPUT = '/root/.openclaw/workspace/output/剔除数据明细.xlsx'
# 读取低价订单(可能无表头)
cheap_orders = []
with open(CHEAP_FILE, 'r') as f:
lines = [l.strip() for l in f.readlines() if l.strip()]
# 检查第一行是否是表头
# 表头: account_id|trade_no|goods_id|goods_name|key_from|to_char|?column?|order_status
reader = csv.DictReader(lines, delimiter='|')
for row in reader:
cheap_orders.append({
'用户ID': str(row.get('account_id') or ''),
'交易号': str(row.get('trade_no') or ''),
'商品ID': str(row.get('goods_id') or ''),
'商品名称': (row.get('goods_name') or '').strip(),
'渠道': (row.get('key_from') or '').strip(),
'购课日期': str(row.get('to_char') or ''),
'支付金额(元)': str(row.get('?column?') or '0'),
'订单状态': str(row.get('order_status') or ''),
})
print(f"低价订单数: {len(cheap_orders)}")
# 重复 trade_no 数据
dup_orders = [
{'用户ID': '14459', '交易号': '440002908556965', '商品ID': '31',
'商品名称': '瓦拉英语年包', '渠道': 'app-active-h5-0-0',
'购课日期': '2026-02-05 14:23:58', '支付金额(元)': '1999.0',
'订单状态': '3(已完成)', '原因': '主键ID=1969, 与ID=1970重复'},
{'用户ID': '14459', '交易号': '440002908556965', '商品ID': '31',
'商品名称': '瓦拉英语年包', '渠道': 'app-active-h5-0-0',
'购课日期': '2026-02-05 14:23:58', '支付金额(元)': '1999.0',
'订单状态': '4(已退款)', '原因': '主键ID=1970, 状态为已退款已保留此条去重时保留ID更小的'},
]
# 生成 Excel
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
wb = Workbook()
header_fill = PatternFill(start_color='4472C4', end_color='4472C4', fill_type='solid')
header_font = Font(color='FFFFFF', bold=True, size=11)
warn_fill = PatternFill(start_color='FFC000', end_color='FFC000', fill_type='solid')
thin_border = Border(
left=Side(style='thin'), right=Side(style='thin'),
top=Side(style='thin'), bottom=Side(style='thin')
)
# Sheet 1: 低价订单
ws1 = wb.active
ws1.title = "被剔除的低价订单"
cheap_headers = ['用户ID', '交易号', '商品ID', '商品名称', '渠道', '购课日期',
'支付金额(元)', '订单状态', '剔除原因']
for col, h in enumerate(cheap_headers, 1):
cell = ws1.cell(row=1, column=col, value=h)
cell.fill = header_fill
cell.font = header_font
cell.alignment = Alignment(horizontal='center', vertical='center')
cell.border = thin_border
# 订单状态映射
status_map = {'3': '3(已完成)', '4': '4(已退款)'}
for i, row in enumerate(cheap_orders, 2):
amount = float(row['支付金额(元)']) if row['支付金额(元)'] else 0
status_raw = row.get('订单状态', '').strip()
status_label = status_map.get(status_raw, status_raw)
values = [
row['用户ID'],
row['交易号'],
row['商品ID'],
row['商品名称'].strip() if '商品名称' in row else '',
row['渠道'].strip() if row.get('渠道') else '',
row['购课日期'] if row.get('购课日期') else '',
amount,
status_label,
f'金额过低({amount}元),非正常售价',
]
for col, val in enumerate(values, 1):
cell = ws1.cell(row=i, column=col, value=val)
cell.border = thin_border
cell.alignment = Alignment(horizontal='center')
if col == 5:
cell.alignment = Alignment(horizontal='left')
# 金额分布统计
from collections import Counter
amount_dist = Counter(float(r['支付金额(元)']) for r in cheap_orders if r.get('支付金额(元)'))
ws1.cell(row=len(cheap_orders)+3, column=1, value="金额分布").font = Font(bold=True)
for j, (amt, cnt) in enumerate(sorted(amount_dist.items()), len(cheap_orders)+4):
ws1.cell(row=j, column=1, value=f"{amt}")
ws1.cell(row=j, column=2, value=cnt)
col_widths = [10, 24, 8, 20, 36, 22, 14, 14, 26]
for i, w in enumerate(col_widths, 1):
from openpyxl.utils import get_column_letter
ws1.column_dimensions[get_column_letter(i)].width = w
ws1.freeze_panes = 'A2'
# Sheet 2: 重复 trade_no
ws2 = wb.create_sheet("重复trade_no")
dup_headers = ['用户ID', '交易号', '商品ID', '商品名称', '渠道', '购课日期',
'支付金额(元)', '订单状态', '重复原因']
for col, h in enumerate(dup_headers, 1):
cell = ws2.cell(row=1, column=col, value=h)
cell.fill = header_fill
cell.font = header_font
cell.alignment = Alignment(horizontal='center', vertical='center')
cell.border = thin_border
for i, row in enumerate(dup_orders, 2):
for col, val in enumerate(row.values(), 1):
cell = ws2.cell(row=i, column=col, value=val)
cell.border = thin_border
cell.alignment = Alignment(horizontal='center')
if col == 5:
cell.alignment = Alignment(horizontal='left')
for i, w in enumerate(col_widths, 1):
ws2.column_dimensions[get_column_letter(i)].width = w
ws2.freeze_panes = 'A2'
# Sheet 3: 说明
ws3 = wb.create_sheet("剔除说明")
ws3.column_dimensions['A'].width = 80
notes = [
"剔除数据说明",
"",
"一、被剔除的低价订单(共 {} 条)".format(len(cheap_orders)),
" • 筛选条件goods_id IN (31,60,61)已完成或已退款pay_amount_int < 100,000分<1,000元",
" • 这些订单的支付金额远低于正常售价level1/level2 = 1,999元level1+2 = 3,598元",
" • 来源渠道多为 newmedia-dianpu-*(店铺渠道),怀疑为测试/内部订单",
" • 若计入这些订单会产生大量虚假的「两个年包」用户(如 account 6691 共 9 笔 5 元订单)",
"",
"二、重复 trade_no共 1 条)",
" • trade_no: 440002908556965account_id: 14459",
" • bi_vala_order 表中存在两条记录id=1969 和 id=1970",
" • id=1969: order_status=3已完成; id=1970: order_status=4已退款",
" • 去重逻辑:按 (account_id, trade_no) 去重,保留先出现的记录",
"",
"三、筛选阈值说明",
" • 最低金额阈值设为 100,000 分1,000 元),正常年包价格 ≥ 1,999 元",
]
for i, note in enumerate(notes, 1):
cell = ws3.cell(row=i, column=1, value=note)
if i == 1:
cell.font = Font(bold=True, size=14)
wb.save(FINAL_OUTPUT)
print(f"✅ 已保存: {FINAL_OUTPUT}")