ai_member_xiaoxi/scripts/channel_role_refund_analysis.py
2026-05-21 08:00:01 +08:00

335 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
晚柠/老王/三人行/念妈 四个渠道的退费率分析
维度角色数、是否行课、行课课程数用户完成的去重chapter_id数
念妈:限制 2026-04-01 ~ 2026-05-19
其他三个渠道:全量
"""
import psycopg2
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from openpyxl.utils import get_column_letter
# Connection to the BI PostgreSQL database used by every query below.
# The password is deliberately NOT passed here: credentials must not be kept
# in source control. When the `password` parameter is omitted, libpq (which
# psycopg2 wraps) takes it from the PGPASSWORD environment variable or from
# the ~/.pgpass password file.
# NOTE(review): a password was previously committed in this file; it should be
# rotated on the server.
conn = psycopg2.connect(
    host="bj-postgres-16pob4sg.sql.tencentcdb.com",
    port=28591,
    user="ai_member",
    dbname="vala_bi",
)
# Channel definitions: channel display name -> list of the order `key_from`
# values that are attributed to that channel.
channel_keys = {
    '晚柠': ["newmedia-daren-xhs-晚柠也是个妈妈了-0"],
    '老王': [
        "newmedia-daren-douyin-学霸老王讲真话4月8日瓦拉英语首发-0",
        "newmedia-daren-xhs-学霸老王讲真话-0",
        "newmedia-daren-douyin-学霸老王讲真话-0",
        "newmedia-daren-wxxd-学霸老王讲真话-0",
    ],
    '三人行': [
        "newmedia-daren-douyin-学霸三人行4.8瓦拉英语首发)-0",
        "newmedia-daren-wxxd-学霸三人行-0",
        "newmedia-daren-xhs-学霸三人行-0",
        "newmedia-daren-douyin-学霸三人行-0",
        "newmedia-daren-wxxd001-学霸三人行-0",
    ],
    '念妈': [
        "newmedia-daren-douyin-念妈讲学习规划-0",
        "newmedia-daren-wxxd-念妈讲学习规划-0",
        "newmedia-daren-wxxd001-念妈讲学习规划-0",
        "newmedia-daren-xhs-念妈讲学习规划-0",
    ],
}

# Flat list of every key across all channels, in declaration order.
all_keys = [key for keys in channel_keys.values() for key in keys]

# Reverse lookup: key -> channel name. If a key were listed under two
# channels, the channel declared later would win.
key_to_channel = {
    key: channel
    for channel, keys in channel_keys.items()
    for key in keys
}
# ============================================================
# Step 1: fetch the target orders
# ============================================================
print("Step 1: 获取目标订单(剔除测试/删除账号和删除订单)")

# Optional half-open [start, end) window on pay_success_date, per channel.
# Channels that are not listed here are fetched without a date restriction.
channel_date_windows = {
    '念妈': ('2026-04-01', '2026-05-20'),
}

# One query shared by every channel.
# * Orders and accounts with a deleted_at value are skipped and only accounts
#   with status = 1 are kept (presumably this is what drops test accounts, as
#   the log message above says -- confirm against the account schema).
# * An order is flagged is_refunded = 1 when its order_status is 4 and at
#   least one refund record with status 3 exists for the same trade_no.
#   EXISTS is used rather than a LEFT JOIN so that an order with several
#   matching refund records still produces exactly one row; a join would
#   repeat the order and inflate order counts and GMV.
order_sql = """
    SELECT o.id AS order_id, o.account_id, o.key_from, o.pay_amount_int,
           o.order_status, o.trade_no,
           CASE WHEN o.order_status = 4 AND EXISTS (
                    SELECT 1
                    FROM bi_refund_order r
                    WHERE r.trade_no = o.trade_no AND r.status = 3
                )
                THEN 1 ELSE 0 END AS is_refunded
    FROM bi_vala_order o
    JOIN bi_vala_app_account a ON o.account_id = a.id
        AND a.status = 1 AND a.deleted_at IS NULL
    WHERE o.deleted_at IS NULL
      AND o.key_from = ANY(%s)
"""

orders_parts = []
for ch, ks in channel_keys.items():
    sql = order_sql
    params = [ks]
    window = channel_date_windows.get(ch)
    if window is not None:
        sql += "      AND o.pay_success_date >= %s AND o.pay_success_date < %s\n"
        params.extend(window)
    orders_parts.append(pd.read_sql(sql, conn, params=tuple(params)))

orders_df = pd.concat(orders_parts, ignore_index=True)
# Attach the channel name that each order's key_from belongs to.
orders_df['channel'] = orders_df['key_from'].map(key_to_channel)
print(f" 总订单数: {len(orders_df)}, 总用户数: {orders_df['account_id'].nunique()}")
for ch in channel_keys:
    cd = orders_df[orders_df['channel'] == ch]
    print(f" {ch}: {len(cd)}单, {cd['account_id'].nunique()}用户, {cd['is_refunded'].sum():.0f}退费单")
# ============================================================
# Step 2: count the characters (roles) of every target account
# ============================================================
print("\nStep 2: 统计每个用户的角色数")
target_account_ids = sorted(orders_df['account_id'].unique().tolist())
batch_size = 5000

# Number of non-deleted characters per account, for one batch of account ids.
role_count_sql = """
SELECT account_id, COUNT(*) as total_roles
FROM bi_vala_app_character
WHERE account_id = ANY(%s) AND deleted_at IS NULL
GROUP BY account_id
"""

# Query the accounts in slices of batch_size ids.
all_roles = [
    pd.read_sql(
        role_count_sql,
        conn,
        params=(target_account_ids[start:start + batch_size],),
    )
    for start in range(0, len(target_account_ids), batch_size)
]

if all_roles:
    roles_df = pd.concat(all_roles, ignore_index=True)
else:
    roles_df = pd.DataFrame(columns=['account_id', 'total_roles'])

# One row per target account; accounts without any character row get 0.
account_roles = pd.DataFrame({'account_id': target_account_ids}).merge(
    roles_df, on='account_id', how='left'
)
role_totals = account_roles['total_roles'].fillna(0).astype(int)
account_roles['total_roles'] = role_totals
print(f" 角色数分布: {account_roles['total_roles'].value_counts().sort_index().to_dict()}")
# ============================================================
# Step 3: per-account "courses taken" = distinct chapter_id with play_status=1
# ============================================================
print("\nStep 3: 统计每个用户的「行课课程数」完成课时去重chapter_id数")

# Fetch every non-deleted character that belongs to a target account.
all_characters = []
for i in range(0, len(target_account_ids), batch_size):
    batch = target_account_ids[i:i + batch_size]
    sql = """
        SELECT id as character_id, account_id
        FROM bi_vala_app_character
        WHERE account_id = ANY(%s) AND deleted_at IS NULL
    """
    all_characters.append(pd.read_sql(sql, conn, params=(batch,)))
# Keep the expected columns even when nothing was fetched, so the column
# lookups below do not raise KeyError on an empty frame.
if all_characters:
    char_df = pd.concat(all_characters, ignore_index=True)
else:
    char_df = pd.DataFrame(columns=['character_id', 'account_id'])
print(f" 目标用户总角色数: {len(char_df)}")

char_ids = sorted(char_df['character_id'].unique().tolist())
# character_id -> account_id. Built once: it does not change between batches.
char_to_account = dict(zip(char_df['character_id'], char_df['account_id']))

# The play records are spread over 8 shard tables. The same statement, with one
# sub-select per shard, is reused for every batch. The records' user_id column
# is matched against character ids. UNION removes duplicate
# (user_id, chapter_id) pairs across shards.
shard_count = 8
play_record_sql = " UNION ".join(
    f"""
        SELECT DISTINCT user_id, chapter_id
        FROM bi_user_chapter_play_record_{t}
        WHERE user_id = ANY(%s) AND play_status = 1 AND deleted_at IS NULL
    """
    for t in range(shard_count)
)

user_chapter_count = {}  # account_id -> set of chapter_id
for i in range(0, len(char_ids), batch_size):
    batch = char_ids[i:i + batch_size]
    # The batch is bound once per shard sub-select.
    df = pd.read_sql(play_record_sql, conn, params=tuple([batch] * shard_count))
    for user_id, chapter_id in zip(df['user_id'], df['chapter_id']):
        aid = char_to_account.get(user_id)
        # Compare with None explicitly so an account id of 0 is not dropped.
        if aid is not None:
            user_chapter_count.setdefault(aid, set()).add(chapter_id)

# Build the account-level table: chapters are de-duplicated across all
# characters of the same account because they share one set.
account_chapter_counts = [
    {
        'account_id': aid,
        'chapter_count': len(user_chapter_count.get(aid, ())),
        'has_learning': '有行课' if user_chapter_count.get(aid) else '无行课',
    }
    for aid in target_account_ids
]
chapter_df = pd.DataFrame(
    account_chapter_counts,
    columns=['account_id', 'chapter_count', 'has_learning'],
)

# Merge into account_roles.
account_roles = account_roles.merge(chapter_df, on='account_id', how='left')
account_roles['chapter_count'] = account_roles['chapter_count'].fillna(0).astype(int)
account_roles['has_learning'] = account_roles['has_learning'].fillna('无行课')
print(f" 用户行课状态: {account_roles['has_learning'].value_counts().to_dict()}")
print(" 行课课程数分布:")
# Only the 15 smallest chapter counts are printed.
cc_dist = account_roles['chapter_count'].value_counts().sort_index().head(15)
for k, v in cc_dist.items():
    print(f" {k}节课: {v}用户")
# ============================================================
# Step 4: join the orders with the account attributes, then compute refund rates
# ============================================================
print("\nStep 4: 计算各维度退费率")
# Left join: every order row is kept and receives its account's attributes.
analysis = pd.merge(orders_df, account_roles, how='left', on='account_id')
def calc_refund_rate(g):
    """Summarise order volume, refund rate and revenue for a group of orders.

    Args:
        g: DataFrame with one row per order and the columns ``account_id``,
            ``is_refunded`` (1 marks a refunded order) and ``pay_amount_int``.

    Returns:
        pandas.Series (object dtype, so the counts stay integers instead of
        being upcast to float) with the labels:
        ``订单数`` number of rows, ``用户数`` distinct account_id values,
        ``退费订单数`` sum of is_refunded, ``退费率`` refunded share in percent
        rounded to one decimal (0 for an empty group), ``GMV(元)`` sum of
        pay_amount_int / 100, ``退费金额(元)`` the same sum over refunded rows
        only, and ``GSV(元)`` GMV minus the refunded amount. The three amounts
        are rounded to whole numbers.
    """
    n = len(g)
    refunded = g['is_refunded'].sum()
    users = g['account_id'].nunique()
    # Guard against division by zero for an empty group.
    rate = refunded / n * 100 if n else 0
    # pay_amount_int is divided by 100 to obtain the amounts labelled 元
    # (presumably it is stored in fen -- confirm against the order schema).
    gmv = g['pay_amount_int'].sum() / 100
    refund_amt = g.loc[g['is_refunded'] == 1, 'pay_amount_int'].sum() / 100
    return pd.Series(
        {
            '订单数': n,
            '用户数': users,
            '退费订单数': int(refunded),
            '退费率': round(rate, 1),
            'GMV(元)': round(gmv, 0),
            '退费金额(元)': round(refund_amt, 0),
            'GSV(元)': round(gmv - refund_amt, 0),
        },
        dtype=object,
    )
# Channels are reported in this fixed order.
channel_order = ['晚柠', '老王', '三人行', '念妈']


def _tagged_stats(subset, tags):
    """Return calc_refund_rate(subset) with the label/value pairs of tags appended."""
    stats = calc_refund_rate(subset)
    for label, value in tags.items():
        stats[label] = value
    return stats


# --- Dimension 1: number of roles -> refund rate ---
print("\n--- 维度1: 不同角色数 → 退费率 ---")
result1 = []
for channel in channel_order:
    channel_orders = analysis[analysis['channel'] == channel]
    for role_count in sorted(channel_orders['total_roles'].unique()):
        by_role = channel_orders[channel_orders['total_roles'] == role_count]
        result1.append(
            _tagged_stats(by_role, {'渠道': channel, '角色数': role_count})
        )
r1 = pd.DataFrame(result1).sort_values(['渠道', '角色数']).reset_index(drop=True)
print(r1.to_string(index=False))

# --- Dimension 2: number of roles x has-learning flag -> refund rate ---
print("\n--- 维度2: 不同角色数 × 是否行课 → 退费率 ---")
result2 = []
for channel in channel_order:
    channel_orders = analysis[analysis['channel'] == channel]
    for role_count in sorted(channel_orders['total_roles'].unique()):
        by_role = channel_orders[channel_orders['total_roles'] == role_count]
        for learning_flag in ('有行课', '无行课'):
            subset = by_role[by_role['has_learning'] == learning_flag]
            # Combinations without any order are left out of the table.
            if subset.empty:
                continue
            result2.append(
                _tagged_stats(
                    subset,
                    {'渠道': channel, '角色数': role_count, '是否行课': learning_flag},
                )
            )
r2 = pd.DataFrame(result2).sort_values(['渠道', '角色数', '是否行课']).reset_index(drop=True)
print(r2.to_string(index=False))

# --- Dimension 3: number of roles x number of courses taken -> refund rate ---
print("\n--- 维度3: 不同角色数 × 行课课程数 → 退费率 ---")
result3 = []
for channel in channel_order:
    channel_orders = analysis[analysis['channel'] == channel]
    for role_count in sorted(channel_orders['total_roles'].unique()):
        by_role = channel_orders[channel_orders['total_roles'] == role_count]
        for course_count in sorted(by_role['chapter_count'].unique()):
            subset = by_role[by_role['chapter_count'] == course_count]
            if subset.empty:
                continue
            result3.append(
                _tagged_stats(
                    subset,
                    {'渠道': channel, '角色数': role_count, '行课课程数': course_count},
                )
            )
r3 = pd.DataFrame(result3).sort_values(['渠道', '角色数', '行课课程数']).reset_index(drop=True)
print(r3.to_string(index=False))

# --- Per-channel summary ---
print("\n--- 渠道汇总 ---")
summary = [
    _tagged_stats(analysis[analysis['channel'] == channel], {'渠道': channel})
    for channel in channel_order
]
s = pd.DataFrame(summary)
print(s.to_string(index=False))
# ============================================================
# Step 5: write the result tables to an Excel workbook
# ============================================================
print("\nStep 5: 写入Excel")
output_path = '/root/.openclaw/workspace/output/晚柠老王三人行念妈_角色数退费率分析.xlsx'

# Metric columns shared by every sheet; each sheet prefixes its own dimensions.
metric_cols = ['订单数', '用户数', '退费订单数', '退费率', 'GMV(元)', '退费金额(元)', 'GSV(元)']
cols_1 = ['渠道', '角色数'] + metric_cols
cols_2 = ['渠道', '角色数', '是否行课'] + metric_cols
cols_3 = ['渠道', '角色数', '行课课程数'] + metric_cols

# (sheet name, source table, column order), written in this order.
sheet_specs = [
    ('渠道汇总', s, ['渠道'] + metric_cols),
    ('角色数-退费率', r1, cols_1),
    ('角色数×是否行课-退费率', r2, cols_2),
    ('角色数×行课课程数-退费率', r3, cols_3),
]
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
    for sheet_name, table, ordered_cols in sheet_specs:
        table[ordered_cols].to_excel(writer, sheet_name=sheet_name, index=False)
# Re-open the workbook that was just written and apply the formatting.
wb = load_workbook(output_path)
header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
header_font = Font(bold=True, size=11, color="FFFFFF")
edge = Side(style='thin')
thin_border = Border(left=edge, right=edge, top=edge, bottom=edge)
center = Alignment(horizontal='center', vertical='center')
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")

for sn in wb.sheetnames:
    ws = wb[sn]

    # Header row: bold white text on a blue fill, centred, with borders.
    for header_cell in ws[1]:
        header_cell.font = header_font
        header_cell.fill = header_fill
        header_cell.alignment = center
        header_cell.border = thin_border

    # Data rows: centred, with borders.
    for data_row in ws.iter_rows(min_row=2, max_row=ws.max_row, max_col=ws.max_column):
        for data_cell in data_row:
            data_cell.alignment = center
            data_cell.border = thin_border

    # Column widths: widest cell text plus 4, capped at 25. Characters above
    # code point 127 count as two units.
    for col_idx in range(1, ws.max_column + 1):
        widest = 0
        for row_idx in range(1, ws.max_row + 1):
            text = str(ws.cell(row=row_idx, column=col_idx).value or '')
            text_width = sum(1 if ord(ch_) <= 127 else 2 for ch_ in text)
            if text_width > widest:
                widest = text_width
        ws.column_dimensions[get_column_letter(col_idx)].width = min(widest + 4, 25)

    # Highlight refund rates above 50 in the first column headed '退费率'.
    rate_col = next(
        (
            col_idx
            for col_idx in range(1, ws.max_column + 1)
            if ws.cell(row=1, column=col_idx).value == '退费率'
        ),
        None,
    )
    if rate_col is not None:
        for row_idx in range(2, ws.max_row + 1):
            rate_cell = ws.cell(row=row_idx, column=rate_col)
            rate_value = rate_cell.value
            if rate_value and isinstance(rate_value, (int, float)) and rate_value > 50:
                rate_cell.fill = red_fill

wb.save(output_path)
print(f"\n✅ 报告已生成: {output_path}")
conn.close()