#!/usr/bin/env python3
"""Refund-rate analysis for four influencer channels: 晚柠 / 老王 / 三人行 / 念妈.

Dimensions: number of characters (roles) per account, whether the account has
any completed lesson, and the number of completed lessons (distinct
``chapter_id`` values with ``play_status = 1`` across the account's
characters).

Scope: 念妈 is limited to orders paid from 2026-04-01 through 2026-05-19;
the other three channels use all orders.

The database password is read from the ``VALA_BI_DB_PASSWORD`` environment
variable so that no credential lives in source control.
"""
import os
from collections import Counter, defaultdict
from contextlib import closing

import pandas as pd

# Channel name -> list of order.key_from values attributed to that channel.
CHANNEL_KEYS = {
    '晚柠': ["newmedia-daren-xhs-晚柠也是个妈妈了-0"],
    '老王': [
        "newmedia-daren-douyin-学霸老王讲真话(4月8日瓦拉英语首发)-0",
        "newmedia-daren-xhs-学霸老王讲真话-0",
        "newmedia-daren-douyin-学霸老王讲真话-0",
        "newmedia-daren-wxxd-学霸老王讲真话-0",
    ],
    '三人行': [
        "newmedia-daren-douyin-学霸三人行(4.8瓦拉英语首发)-0",
        "newmedia-daren-wxxd-学霸三人行-0",
        "newmedia-daren-xhs-学霸三人行-0",
        "newmedia-daren-douyin-学霸三人行-0",
        "newmedia-daren-wxxd001-学霸三人行-0",
    ],
    '念妈': [
        "newmedia-daren-douyin-念妈讲学习规划-0",
        "newmedia-daren-wxxd-念妈讲学习规划-0",
        "newmedia-daren-wxxd001-念妈讲学习规划-0",
        "newmedia-daren-xhs-念妈讲学习规划-0",
    ],
}
KEY_TO_CHANNEL = {key: ch for ch, keys in CHANNEL_KEYS.items() for key in keys}

# Optional half-open [start, end) window on pay_success_date, per channel.
CHANNEL_DATE_WINDOWS = {'念妈': ('2026-04-01', '2026-05-20')}

BATCH_SIZE = 5000            # ids sent per ANY(%s) query
PLAY_RECORD_SHARDS = 8       # bi_user_chapter_play_record_0 .. _7
HIGH_REFUND_RATE_PCT = 50    # refund-rate cells above this are highlighted
MAX_COLUMN_WIDTH = 25

OUTPUT_PATH = '/root/.openclaw/workspace/output/晚柠老王三人行念妈_角色数退费率分析.xlsx'

METRIC_COLS = ['订单数', '用户数', '退费订单数', '退费率', 'GMV(元)', '退费金额(元)', 'GSV(元)']
# Analysis column -> report column label.
DIM_LABELS = {
    'channel': '渠道',
    'total_roles': '角色数',
    'has_learning': '是否行课',
    'chapter_count': '行课课程数',
}

# Orders from live (status = 1, not deleted) accounts; deleted orders excluded.
ORDER_SQL = """
    SELECT o.id as order_id, o.account_id, o.key_from, o.pay_amount_int,
           o.order_status, o.trade_no,
           CASE WHEN r.status = 3 AND o.order_status = 4 THEN 1 ELSE 0 END as is_refunded
    FROM bi_vala_order o
    JOIN bi_vala_app_account a
      ON o.account_id = a.id AND a.status = 1 AND a.deleted_at IS NULL
    LEFT JOIN bi_refund_order r
      ON o.trade_no = r.trade_no AND r.status = 3
    WHERE o.deleted_at IS NULL
      AND o.key_from = ANY(%s)
"""
DATE_WINDOW_SQL = """
      AND o.pay_success_date >= %s
      AND o.pay_success_date < %s
"""
CHARACTER_SQL = """
    SELECT id as character_id, account_id
    FROM bi_vala_app_character
    WHERE account_id = ANY(%s) AND deleted_at IS NULL
"""
# One SELECT per shard table, UNIONed; built once instead of once per batch.
PLAY_RECORD_SQL = " UNION ".join(
    f"""
    SELECT DISTINCT user_id, chapter_id
    FROM bi_user_chapter_play_record_{shard}
    WHERE user_id = ANY(%s) AND play_status = 1 AND deleted_at IS NULL
    """
    for shard in range(PLAY_RECORD_SHARDS)
)


def _db_config() -> dict:
    """Return psycopg2 connection kwargs, taking the password from the environment.

    Raises:
        RuntimeError: if ``VALA_BI_DB_PASSWORD`` is unset or empty.
    """
    password = os.environ.get('VALA_BI_DB_PASSWORD')
    if not password:
        raise RuntimeError(
            "Set the VALA_BI_DB_PASSWORD environment variable before running."
        )
    return {
        'host': os.environ.get('VALA_BI_DB_HOST', "bj-postgres-16pob4sg.sql.tencentcdb.com"),
        'port': int(os.environ.get('VALA_BI_DB_PORT', 28591)),
        'user': os.environ.get('VALA_BI_DB_USER', "ai_member"),
        'password': password,
        'dbname': os.environ.get('VALA_BI_DB_NAME', "vala_bi"),
    }


def _read_batched(conn, sql: str, ids: list, columns: list, n_params: int = 1) -> pd.DataFrame:
    """Run ``sql`` once per BATCH_SIZE slice of ``ids`` and concatenate the results.

    ``sql`` must contain ``n_params`` ``%s`` placeholders, each of which
    receives the same id batch. When ``ids`` is empty no query is issued and
    an empty frame with ``columns`` is returned, so callers can always index
    the expected columns.
    """
    frames = [
        pd.read_sql(sql, conn, params=(ids[start:start + BATCH_SIZE],) * n_params)
        for start in range(0, len(ids), BATCH_SIZE)
    ]
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)


def _load_orders(conn) -> pd.DataFrame:
    """Fetch the target orders of every channel and tag each with its channel."""
    parts = []
    for channel, keys in CHANNEL_KEYS.items():
        sql, params = ORDER_SQL, [keys]
        window = CHANNEL_DATE_WINDOWS.get(channel)
        if window is not None:
            sql += DATE_WINDOW_SQL
            params.extend(window)
        parts.append(pd.read_sql(sql, conn, params=tuple(params)))
    orders = pd.concat(parts, ignore_index=True)
    # The LEFT JOIN emits one row per matching refund record, so an order with
    # several status-3 refund rows would be counted several times.
    # NOTE(review): assumes order_id uniquely identifies an order - confirm.
    orders = orders.drop_duplicates(subset='order_id', ignore_index=True)
    orders['channel'] = orders['key_from'].map(KEY_TO_CHANNEL)
    return orders


def build_account_features(account_ids, char_df: pd.DataFrame, play_df: pd.DataFrame) -> pd.DataFrame:
    """Compute per-account role count and completed-lesson count.

    Args:
        account_ids: account ids to report on; one output row per id, in order.
        char_df: columns ``character_id`` and ``account_id``.
        play_df: columns ``user_id`` (matched against ``character_id``) and
            ``chapter_id``; rows whose ``user_id`` is unknown are ignored.

    Returns:
        DataFrame with columns ``account_id``, ``total_roles``,
        ``chapter_count`` (distinct chapter ids across all of the account's
        characters) and ``has_learning`` ('有行课' / '无行课').
    """
    account_ids = list(account_ids)
    role_counts = Counter(char_df['account_id'])
    char_to_account = dict(zip(char_df['character_id'], char_df['account_id']))

    chapters_by_account = defaultdict(set)
    for user_id, chapter_id in zip(play_df['user_id'], play_df['chapter_id']):
        account_id = char_to_account.get(user_id)
        # Compare against None explicitly so a falsy id such as 0 is kept.
        if account_id is not None:
            chapters_by_account[account_id].add(chapter_id)

    chapter_counts = [len(chapters_by_account.get(aid, ())) for aid in account_ids]
    return pd.DataFrame({
        'account_id': account_ids,
        'total_roles': [role_counts.get(aid, 0) for aid in account_ids],
        'chapter_count': chapter_counts,
        'has_learning': ['有行课' if n > 0 else '无行课' for n in chapter_counts],
    })


def calc_refund_rate(g: pd.DataFrame) -> pd.Series:
    """Summarise one group of order rows.

    Args:
        g: order rows with ``is_refunded``, ``account_id`` and
            ``pay_amount_int`` (divided by 100 to obtain 元).

    Returns:
        Series keyed by the report metric names. The refund rate is
        order-based (refunded orders / orders, in percent, one decimal) and
        is 0 for an empty group.
    """
    n = len(g)
    refunded = g['is_refunded'].sum()
    rate = refunded / n * 100 if n > 0 else 0
    gmv = g['pay_amount_int'].sum() / 100
    refund_amt = g[g['is_refunded'] == 1]['pay_amount_int'].sum() / 100
    # dtype=object keeps the counts as ints instead of upcasting them to float.
    return pd.Series({
        '订单数': n,
        '用户数': g['account_id'].nunique(),
        '退费订单数': int(refunded),
        '退费率': round(rate, 1),
        'GMV(元)': round(gmv, 0),
        '退费金额(元)': round(refund_amt, 0),
        'GSV(元)': round(gmv - refund_amt, 0),
    }, dtype=object)


def breakdown(analysis: pd.DataFrame, dims: list) -> pd.DataFrame:
    """Refund metrics for every observed combination of ``dims``.

    Args:
        analysis: order rows joined with account features.
        dims: analysis column names (keys of ``DIM_LABELS``) to group by.

    Returns:
        One row per non-empty group, sorted by ``dims``, with the translated
        dimension columns first and the metric columns after. Empty input
        yields an empty frame that still has all columns.
    """
    labels = [DIM_LABELS[d] for d in dims]
    rows = []
    for key, group in analysis.groupby(dims, sort=True):
        if not isinstance(key, tuple):  # older pandas yields a scalar for one key
            key = (key,)
        rows.append({**dict(zip(labels, key)), **calc_refund_rate(group).to_dict()})
    return pd.DataFrame(rows, columns=labels + METRIC_COLS)


def channel_summary(analysis: pd.DataFrame) -> pd.DataFrame:
    """Refund metrics per channel, in ``CHANNEL_KEYS`` order (zero rows kept)."""
    rows = [
        {'渠道': ch, **calc_refund_rate(analysis[analysis['channel'] == ch]).to_dict()}
        for ch in CHANNEL_KEYS
    ]
    return pd.DataFrame(rows, columns=['渠道'] + METRIC_COLS)


def display_width(value) -> int:
    """Approximate rendered width of a cell: non-ASCII characters count double."""
    text = '' if value is None else str(value)
    return sum(2 if ord(c) > 127 else 1 for c in text)


def write_report(output_path: str, sheets: dict) -> None:
    """Write ``sheets`` (sheet name -> DataFrame) to an .xlsx file and style it.

    Styling: coloured bold header, centred bordered cells, auto-sized columns
    (capped at MAX_COLUMN_WIDTH) and a red fill on '退费率' cells above
    HIGH_REFUND_RATE_PCT.
    """
    # Imported here so the aggregation helpers can be imported without openpyxl.
    from openpyxl import load_workbook
    from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
    from openpyxl.utils import get_column_letter

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        for sheet_name, frame in sheets.items():
            frame.to_excel(writer, sheet_name=sheet_name, index=False)

    header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
    header_font = Font(bold=True, size=11, color="FFFFFF")
    side = Side(style='thin')
    thin_border = Border(left=side, right=side, top=side, bottom=side)
    center = Alignment(horizontal='center', vertical='center')
    red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")

    wb = load_workbook(output_path)
    for ws in wb.worksheets:
        for cell in ws[1]:
            cell.font = header_font
            cell.fill = header_fill
            cell.alignment = center
            cell.border = thin_border
        for row in ws.iter_rows(min_row=2, max_row=ws.max_row, max_col=ws.max_column):
            for cell in row:
                cell.alignment = center
                cell.border = thin_border

        for col in range(1, ws.max_column + 1):
            widest = max(
                (display_width(ws.cell(row=r, column=col).value)
                 for r in range(1, ws.max_row + 1)),
                default=0,
            )
            ws.column_dimensions[get_column_letter(col)].width = min(widest + 4, MAX_COLUMN_WIDTH)

        # Highlight high refund rates in the first column headed '退费率'.
        for col in range(1, ws.max_column + 1):
            if ws.cell(row=1, column=col).value != '退费率':
                continue
            for r in range(2, ws.max_row + 1):
                cell = ws.cell(row=r, column=col)
                if isinstance(cell.value, (int, float)) and cell.value > HIGH_REFUND_RATE_PCT:
                    cell.fill = red_fill
            break
    wb.save(output_path)


def main() -> None:
    """Run the analysis end to end: query, aggregate, print, export."""
    # Imported here so the aggregation helpers can be imported without psycopg2.
    import psycopg2

    # closing() guarantees the connection is released even if a query fails.
    with closing(psycopg2.connect(**_db_config())) as conn:
        # ---- Step 1: target orders -------------------------------------
        print("Step 1: 获取目标订单(剔除测试/删除账号和删除订单)")
        orders_df = _load_orders(conn)
        print(f" 总订单数: {len(orders_df)}, 总用户数: {orders_df['account_id'].nunique()}")
        for ch in CHANNEL_KEYS:
            cd = orders_df[orders_df['channel'] == ch]
            print(f" {ch}: {len(cd)}单, {cd['account_id'].nunique()}用户, {cd['is_refunded'].sum():.0f}退费单")

        # ---- Step 2 / 3 data: characters and completed lessons ---------
        target_account_ids = sorted(orders_df['account_id'].unique().tolist())
        char_df = _read_batched(
            conn, CHARACTER_SQL, target_account_ids, ['character_id', 'account_id'])
        char_ids = sorted(char_df['character_id'].unique().tolist())
        play_df = _read_batched(
            conn, PLAY_RECORD_SQL, char_ids, ['user_id', 'chapter_id'],
            n_params=PLAY_RECORD_SHARDS)

    account_roles = build_account_features(target_account_ids, char_df, play_df)

    print("\nStep 2: 统计每个用户的角色数")
    print(f" 角色数分布: {account_roles['total_roles'].value_counts().sort_index().to_dict()}")

    print("\nStep 3: 统计每个用户的「行课课程数」(完成课时去重chapter_id数)")
    print(f" 目标用户总角色数: {len(char_df)}")
    print(f" 用户行课状态: {account_roles['has_learning'].value_counts().to_dict()}")
    print(f" 行课课程数分布:")
    cc_dist = account_roles['chapter_count'].value_counts().sort_index().head(15)
    for k, v in cc_dist.items():
        print(f" {k}节课: {v}用户")

    # ---- Step 4: refund rates per dimension ----------------------------
    print("\nStep 4: 计算各维度退费率")
    analysis = orders_df.merge(account_roles, on='account_id', how='left')

    print("\n--- 维度1: 不同角色数 → 退费率 ---")
    r1 = breakdown(analysis, ['channel', 'total_roles'])
    print(r1.to_string(index=False))

    print("\n--- 维度2: 不同角色数 × 是否行课 → 退费率 ---")
    r2 = breakdown(analysis, ['channel', 'total_roles', 'has_learning'])
    print(r2.to_string(index=False))

    print("\n--- 维度3: 不同角色数 × 行课课程数 → 退费率 ---")
    r3 = breakdown(analysis, ['channel', 'total_roles', 'chapter_count'])
    print(r3.to_string(index=False))

    print("\n--- 渠道汇总 ---")
    s = channel_summary(analysis)
    print(s.to_string(index=False))

    # ---- Step 5: Excel export ------------------------------------------
    print("\nStep 5: 写入Excel")
    write_report(OUTPUT_PATH, {
        '渠道汇总': s,
        '角色数-退费率': r1,
        '角色数×是否行课-退费率': r2,
        '角色数×行课课程数-退费率': r3,
    })
    print(f"\n✅ 报告已生成: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()