335 lines
14 KiB
Python
335 lines
14 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
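Refund-rate analysis for four channels: 晚柠 / 老王 / 三人行 / 念妈.

Dimensions: number of characters (roles) per account, whether the account has
taken any lesson, and the number of lessons taken (count of distinct
chapter_id values the user has completed).

念妈: restricted to 2026-04-01 ~ 2026-05-19.
The other three channels: all data.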
|
||
"""
|
||
import os

import pandas as pd
import psycopg2
from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from openpyxl.utils import get_column_letter
|
||
|
||
# Database connection.
# The password is read from the environment so that no credential is stored in
# the script; the non-secret settings keep their previous values as defaults
# and can be overridden the same way.
db_password = os.environ.get("VALA_BI_DB_PASSWORD")
if not db_password:
    raise SystemExit(
        "VALA_BI_DB_PASSWORD is not set; export the database password "
        "before running this script"
    )

conn = psycopg2.connect(
    host=os.environ.get("VALA_BI_DB_HOST", "bj-postgres-16pob4sg.sql.tencentcdb.com"),
    port=int(os.environ.get("VALA_BI_DB_PORT", "28591")),
    user=os.environ.get("VALA_BI_DB_USER", "ai_member"),
    password=db_password,
    dbname=os.environ.get("VALA_BI_DB_NAME", "vala_bi"),
)
|
||
|
||
# Channel definitions: channel name -> the `key_from` values that belong to it.
channel_keys = {
    '晚柠': [
        "newmedia-daren-xhs-晚柠也是个妈妈了-0",
    ],
    '老王': [
        "newmedia-daren-douyin-学霸老王讲真话(4月8日瓦拉英语首发)-0",
        "newmedia-daren-xhs-学霸老王讲真话-0",
        "newmedia-daren-douyin-学霸老王讲真话-0",
        "newmedia-daren-wxxd-学霸老王讲真话-0",
    ],
    '三人行': [
        "newmedia-daren-douyin-学霸三人行(4.8瓦拉英语首发)-0",
        "newmedia-daren-wxxd-学霸三人行-0",
        "newmedia-daren-xhs-学霸三人行-0",
        "newmedia-daren-douyin-学霸三人行-0",
        "newmedia-daren-wxxd001-学霸三人行-0",
    ],
    '念妈': [
        "newmedia-daren-douyin-念妈讲学习规划-0",
        "newmedia-daren-wxxd-念妈讲学习规划-0",
        "newmedia-daren-wxxd001-念妈讲学习规划-0",
        "newmedia-daren-xhs-念妈讲学习规划-0",
    ],
}

# Every key of every channel, flattened in definition order.
all_keys = [key for keys in channel_keys.values() for key in keys]

# Reverse lookup: key_from value -> channel name.
key_to_channel = {
    key: channel
    for channel, keys in channel_keys.items()
    for key in keys
}
|
||
|
||
# ============================================================
# Step 1: fetch the target orders
# ============================================================
print("Step 1: 获取目标订单(剔除测试/删除账号和删除订单)")

# Exactly one row per order. The refund flag is computed with EXISTS instead
# of a LEFT JOIN on bi_refund_order: with the join, an order that has several
# refund records with status = 3 comes back once per record, which inflates
# the order count and GMV downstream.
order_sql = """
    SELECT o.id as order_id, o.account_id, o.key_from, o.pay_amount_int,
           o.order_status, o.trade_no,
           CASE WHEN o.order_status = 4 AND EXISTS (
                    SELECT 1
                    FROM bi_refund_order r
                    WHERE r.trade_no = o.trade_no AND r.status = 3
                ) THEN 1 ELSE 0 END as is_refunded
    FROM bi_vala_order o
    JOIN bi_vala_app_account a ON o.account_id = a.id
        AND a.status = 1 AND a.deleted_at IS NULL
    WHERE o.deleted_at IS NULL
      AND o.key_from = ANY(%s)
"""
order_date_filter = """
      AND o.pay_success_date >= %s
      AND o.pay_success_date < %s
"""

# Half-open pay_success_date window [start, end) per channel.
# Channels that are not listed here are queried without a date restriction.
channel_date_windows = {'念妈': ('2026-04-01', '2026-05-20')}

orders_parts = []
for ch, ks in channel_keys.items():
    sql = order_sql
    params = [ks]
    window = channel_date_windows.get(ch)
    if window is not None:
        sql += order_date_filter
        params.extend(window)
    df = pd.read_sql(sql, conn, params=tuple(params))
    orders_parts.append(df)

orders_df = pd.concat(orders_parts, ignore_index=True)
orders_df['channel'] = orders_df['key_from'].map(key_to_channel)

print(f" 总订单数: {len(orders_df)}, 总用户数: {orders_df['account_id'].nunique()}")
for ch in ['晚柠', '老王', '三人行', '念妈']:
    cd = orders_df[orders_df['channel'] == ch]
    print(f" {ch}: {len(cd)}单, {cd['account_id'].nunique()}用户, {cd['is_refunded'].sum():.0f}退费单")
|
||
|
||
# ============================================================
# Step 2: count the characters (roles) of every account
# ============================================================
print("\nStep 2: 统计每个用户的角色数")
target_account_ids = sorted(orders_df['account_id'].unique().tolist())
batch_size = 5000

# Non-deleted characters per account, queried one batch of account ids at a time.
sql = """
    SELECT account_id, COUNT(*) as total_roles
    FROM bi_vala_app_character
    WHERE account_id = ANY(%s) AND deleted_at IS NULL
    GROUP BY account_id
"""
all_roles = [
    pd.read_sql(sql, conn, params=(target_account_ids[start:start + batch_size],))
    for start in range(0, len(target_account_ids), batch_size)
]

if all_roles:
    roles_df = pd.concat(all_roles, ignore_index=True)
else:
    roles_df = pd.DataFrame(columns=['account_id','total_roles'])

# Left-merge onto the full account list so accounts without any character
# end up with a count of 0 instead of being dropped.
account_roles = pd.DataFrame({'account_id': target_account_ids}).merge(
    roles_df, on='account_id', how='left'
)
account_roles['total_roles'] = account_roles['total_roles'].fillna(0).astype(int)

role_distribution = account_roles['total_roles'].value_counts().sort_index().to_dict()
print(f" 角色数分布: {role_distribution}")
|
||
|
||
# ============================================================
# Step 3: count the lessons taken by every account
#         (distinct chapter_id with play_status = 1)
# ============================================================
print("\nStep 3: 统计每个用户的「行课课程数」(完成课时去重chapter_id数)")

# Fetch every non-deleted character that belongs to a target account.
all_characters = []
for i in range(0, len(target_account_ids), batch_size):
    batch = target_account_ids[i:i+batch_size]
    sql = """
        SELECT id as character_id, account_id
        FROM bi_vala_app_character
        WHERE account_id = ANY(%s) AND deleted_at IS NULL
    """
    df = pd.read_sql(sql, conn, params=(batch,))
    all_characters.append(df)

# Keep the expected columns even when nothing was fetched, so the column
# lookups below do not raise KeyError on an empty frame.
if all_characters:
    char_df = pd.concat(all_characters, ignore_index=True)
else:
    char_df = pd.DataFrame(columns=['character_id', 'account_id'])
print(f" 目标用户总角色数: {len(char_df)}")

# Fetch the lessons completed by those characters (distinct chapter_id).
char_ids = sorted(char_df['character_id'].unique().tolist())
user_chapter_count = {}  # account_id -> set of chapter_id

# character_id -> account_id. Built once: it does not depend on the batch.
char_to_account = dict(zip(char_df['character_id'], char_df['account_id']))

# Union of the 8 shard tables, distinct (user_id, chapter_id) where
# play_status = 1. The statement is identical for every batch, so it is
# assembled once; only the bound id list changes.
play_sql = " UNION ".join(
    f"""
        SELECT DISTINCT user_id, chapter_id
        FROM bi_user_chapter_play_record_{t}
        WHERE user_id = ANY(%s) AND play_status = 1 AND deleted_at IS NULL
    """
    for t in range(8)
)

for i in range(0, len(char_ids), batch_size):
    batch = char_ids[i:i+batch_size]
    # One copy of the id list per shard sub-query.
    params = tuple([batch] * 8)
    df = pd.read_sql(play_sql, conn, params=params)

    # The play records' user_id is looked up as a character_id here.
    for user_id, chapter_id in zip(df['user_id'], df['chapter_id']):
        aid = char_to_account.get(user_id)
        # Compare against None rather than relying on truthiness, so an
        # account whose id is 0 is not silently skipped.
        if aid is not None:
            user_chapter_count.setdefault(aid, set()).add(chapter_id)

# Build the account-level lesson count; accounts without any record get 0.
account_chapter_counts = []
for aid in target_account_ids:
    ch_set = user_chapter_count.get(aid, set())
    account_chapter_counts.append({
        'account_id': aid,
        'chapter_count': len(ch_set),
        'has_learning': '有行课' if len(ch_set) > 0 else '无行课'
    })

# Explicit columns keep the merge key present even when the list is empty.
chapter_df = pd.DataFrame(
    account_chapter_counts,
    columns=['account_id', 'chapter_count', 'has_learning'],
)

# Merge into account_roles
account_roles = account_roles.merge(chapter_df, on='account_id', how='left')
account_roles['chapter_count'] = account_roles['chapter_count'].fillna(0).astype(int)
account_roles['has_learning'] = account_roles['has_learning'].fillna('无行课')

print(f" 用户行课状态: {account_roles['has_learning'].value_counts().to_dict()}")
print(" 行课课程数分布:")
# Only the 15 smallest lesson counts are printed.
cc_dist = account_roles['chapter_count'].value_counts().sort_index().head(15)
for k, v in cc_dist.items():
    print(f" {k}节课: {v}用户")
|
||
|
||
# ============================================================
# Step 4: combine the data and compute refund rates
# ============================================================
print("\nStep 4: 计算各维度退费率")

# Attach the per-account attributes (character count, lesson statistics)
# to every order row; orders are kept even without a matching account row.
analysis = pd.merge(orders_df, account_roles, how='left', on='account_id')
|
||
|
||
def calc_refund_rate(g):
    """Summarise order volume, refund rate and revenue for a set of orders.

    Args:
        g: DataFrame with one row per order and the columns ``account_id``,
            ``is_refunded`` (compared against 1 to pick refunded orders) and
            ``pay_amount_int`` (divided by 100 for the amounts labelled 元;
            presumably stored in cents — confirm against the schema).

    Returns:
        A ``pd.Series`` with the labels 订单数, 用户数, 退费订单数, 退费率
        (percentage, one decimal), GMV(元), 退费金额(元) and GSV(元)
        (GMV minus refunded amount). An empty ``g`` yields a rate of 0.
    """
    n = len(g)
    refunded = g['is_refunded'].sum()
    users = g['account_id'].nunique()
    # Guard the division so an empty group reports 0 instead of raising.
    rate = refunded / n * 100 if n > 0 else 0
    gmv = g['pay_amount_int'].sum() / 100
    refund_amt = g[g['is_refunded'] == 1]['pay_amount_int'].sum() / 100
    # dtype=object keeps the counts as integers; without it pandas coerces the
    # mixed int/float values to float64 and the counts are reported as e.g. 3.0.
    return pd.Series({
        '订单数': n,
        '用户数': users,
        '退费订单数': int(refunded),
        '退费率': round(rate, 1),
        'GMV(元)': round(gmv, 0),
        '退费金额(元)': round(refund_amt, 0),
        'GSV(元)': round(gmv - refund_amt, 0)
    }, dtype=object)
|
||
|
||
# --- Dimension 1: character count -> refund rate ---
print("\n--- 维度1: 不同角色数 → 退费率 ---")
result1 = []
for channel_name in ['晚柠', '老王', '三人行', '念妈']:
    channel_orders = analysis.loc[analysis['channel'] == channel_name]
    for role_count in sorted(channel_orders['total_roles'].unique()):
        stats = calc_refund_rate(
            channel_orders.loc[channel_orders['total_roles'] == role_count]
        )
        stats['渠道'] = channel_name
        stats['角色数'] = role_count
        result1.append(stats)
r1 = pd.DataFrame(result1)
r1 = r1.sort_values(['渠道','角色数']).reset_index(drop=True)
print(r1.to_string(index=False))

# --- Dimension 2: character count x has-taken-lessons -> refund rate ---
print("\n--- 维度2: 不同角色数 × 是否行课 → 退费率 ---")
result2 = []
for channel_name in ['晚柠', '老王', '三人行', '念妈']:
    channel_orders = analysis.loc[analysis['channel'] == channel_name]
    for role_count in sorted(channel_orders['total_roles'].unique()):
        role_mask = channel_orders['total_roles'] == role_count
        for learning_flag in ['有行课', '无行课']:
            matching = channel_orders.loc[
                role_mask & (channel_orders['has_learning'] == learning_flag)
            ]
            # Combinations without any order produce no output row.
            if len(matching) > 0:
                stats = calc_refund_rate(matching)
                stats['渠道'] = channel_name
                stats['角色数'] = role_count
                stats['是否行课'] = learning_flag
                result2.append(stats)
r2 = pd.DataFrame(result2)
r2 = r2.sort_values(['渠道','角色数','是否行课']).reset_index(drop=True)
print(r2.to_string(index=False))

# --- Dimension 3: character count x number of lessons -> refund rate ---
print("\n--- 维度3: 不同角色数 × 行课课程数 → 退费率 ---")
result3 = []
for channel_name in ['晚柠', '老王', '三人行', '念妈']:
    channel_orders = analysis.loc[analysis['channel'] == channel_name]
    for role_count in sorted(channel_orders['total_roles'].unique()):
        role_mask = channel_orders['total_roles'] == role_count
        lesson_counts = channel_orders.loc[role_mask, 'chapter_count'].unique()
        for lesson_count in sorted(lesson_counts):
            matching = channel_orders.loc[
                role_mask & (channel_orders['chapter_count'] == lesson_count)
            ]
            if len(matching) > 0:
                stats = calc_refund_rate(matching)
                stats['渠道'] = channel_name
                stats['角色数'] = role_count
                stats['行课课程数'] = lesson_count
                result3.append(stats)
r3 = pd.DataFrame(result3)
r3 = r3.sort_values(['渠道','角色数','行课课程数']).reset_index(drop=True)
print(r3.to_string(index=False))

# --- Per-channel summary ---
print("\n--- 渠道汇总 ---")
summary = []
for channel_name in ['晚柠', '老王', '三人行', '念妈']:
    stats = calc_refund_rate(analysis.loc[analysis['channel'] == channel_name])
    stats['渠道'] = channel_name
    summary.append(stats)
s = pd.DataFrame(summary)
print(s.to_string(index=False))
|
||
|
||
# ============================================================
# Step 5: write the Excel report
# ============================================================
print("\nStep 5: 写入Excel")
output_path = '/root/.openclaw/workspace/output/晚柠老王三人行念妈_角色数退费率分析.xlsx'
# Create the target folder first; writing the workbook fails when the
# directory does not exist yet.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Column order of each sheet.
cols_1 = ['渠道','角色数','订单数','用户数','退费订单数','退费率','GMV(元)','退费金额(元)','GSV(元)']
cols_2 = ['渠道','角色数','是否行课','订单数','用户数','退费订单数','退费率','GMV(元)','退费金额(元)','GSV(元)']
cols_3 = ['渠道','角色数','行课课程数','订单数','用户数','退费订单数','退费率','GMV(元)','退费金额(元)','GSV(元)']

with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
    s[['渠道','订单数','用户数','退费订单数','退费率','GMV(元)','退费金额(元)','GSV(元)']].to_excel(writer, sheet_name='渠道汇总', index=False)
    r1[cols_1].to_excel(writer, sheet_name='角色数-退费率', index=False)
    r2[cols_2].to_excel(writer, sheet_name='角色数×是否行课-退费率', index=False)
    r3[cols_3].to_excel(writer, sheet_name='角色数×行课课程数-退费率', index=False)

# Format: reopen the saved workbook and style every sheet the same way.
wb = load_workbook(output_path)
header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
header_font = Font(bold=True, size=11, color="FFFFFF")
thin_border = Border(left=Side(style='thin'), right=Side(style='thin'), top=Side(style='thin'), bottom=Side(style='thin'))
center = Alignment(horizontal='center', vertical='center')
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")

for sn in wb.sheetnames:
    ws = wb[sn]

    # Header row.
    for cell in ws[1]:
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = center
        cell.border = thin_border

    # Data rows.
    for row_cells in ws.iter_rows(min_row=2, max_row=ws.max_row, max_col=ws.max_column):
        for cell in row_cells:
            cell.alignment = center
            cell.border = thin_border

    # Column widths: widest cell plus padding, capped at 25.
    for col in range(1, ws.max_column+1):
        max_len = 0
        for row in range(1, ws.max_row+1):
            value = ws.cell(row=row, column=col).value
            # Only a missing value counts as empty; a numeric 0 is still
            # one character wide.
            text = '' if value is None else str(value)
            # Non-ASCII characters are counted as two columns wide.
            text_width = sum(2 if ord(c) > 127 else 1 for c in text)
            max_len = max(max_len, text_width)
        ws.column_dimensions[get_column_letter(col)].width = min(max_len + 4, 25)

    # Highlight high refund rates (> 50) in the first '退费率' column.
    for col in range(1, ws.max_column+1):
        if ws.cell(row=1, column=col).value == '退费率':
            for row in range(2, ws.max_row+1):
                cell = ws.cell(row=row, column=col)
                if cell.value and isinstance(cell.value, (int,float)) and cell.value > 50:
                    cell.fill = red_fill
            break

wb.save(output_path)
print(f"\n✅ 报告已生成: {output_path}")
conn.close()
|