#!/usr/bin/env python3
"""
Lesson progress vs. refund rate analysis.

What the script produces:
1. Refund rate per bucket of completed-lesson count (one row per paying account).
2. Completed-lesson distribution of the accounts that have a refunded order.
3. Lesson progress comparison between refunded and non-refunded accounts.
4. Overall totals, plus an Excel workbook with the per-account details.
"""

import os
import sys
from collections import defaultdict

import pandas as pd
import psycopg2

# ── Database ──
DB_HOST = "bj-postgres-16pob4sg.sql.tencentcdb.com"
DB_PORT = 28591
DB_USER = "ai_member"
DB_NAME = "vala_bi"

# Batch sizes for the `IN (...)` lookups and the number of sharded
# bi_user_chapter_play_record_<n> tables that are scanned.
_TRADE_NO_BATCH = 500
_ACCOUNT_BATCH = 1000
_CHARACTER_BATCH = 2000
_LESSON_TABLE_COUNT = 8

# Display order of the completed-lesson buckets in every report table.
_BUCKET_ORDER = ["0课时", "1-3课时", "4-7课时", "8-15课时", "16-30课时", "31-60课时", "60课时以上"]


def get_password():
    """Return the database password.

    A non-empty ``PG_ONLINE_PASSWORD`` environment variable takes precedence;
    otherwise the first ``PG_ONLINE_PASSWORD=`` line of ``../secrets.env``
    (relative to this file) is used, with surrounding quotes stripped.

    Raises:
        RuntimeError: if neither source provides the password.
    """
    pw = os.environ.get("PG_ONLINE_PASSWORD", "")
    if pw:
        return pw
    secrets_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "secrets.env")
    if os.path.exists(secrets_path):
        with open(secrets_path, encoding="utf-8") as f:
            for line in f:
                if line.startswith("PG_ONLINE_PASSWORD="):
                    return line.strip().split("=", 1)[1].strip("'\"")
    raise RuntimeError("PG_ONLINE_PASSWORD not found")


def get_conn():
    """Open a new psycopg2 connection to the BI database (60 s connect timeout)."""
    return psycopg2.connect(
        host=DB_HOST,
        port=DB_PORT,
        user=DB_USER,
        password=get_password(),
        dbname=DB_NAME,
        connect_timeout=60,
    )


def _chunks(items, size):
    """Yield consecutive slices of *items* holding at most *size* elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def _placeholders(batch):
    """Return a comma-separated ``%s`` list with one placeholder per batch item."""
    return ",".join(["%s"] * len(batch))


def _fetch_orders(cur):
    """Return paid, non-deleted orders (order_status 3 or 4) of accounts with status = 1."""
    # NOTE(review): the original comment says this excludes test accounts;
    # presumably that is what `a.status = 1` does — confirm against the schema.
    cur.execute("""
        SELECT o.account_id, o.trade_no, o.pay_success_date, o.pay_amount_int,
               o.order_status, o.goods_name, o.key_from
        FROM bi_vala_order o
        INNER JOIN bi_vala_app_account a ON o.account_id = a.id AND a.status = 1
        WHERE o.deleted_at IS NULL
          AND o.pay_success_date IS NOT NULL
          AND o.order_status IN (3, 4)
        ORDER BY o.account_id, o.pay_success_date
    """)
    return cur.fetchall()


def _fetch_refunded_trade_nos(cur, trade_nos):
    """Return the subset of *trade_nos* that has a refund row with status = 3."""
    refund_set = set()
    for batch in _chunks(trade_nos, _TRADE_NO_BATCH):
        cur.execute(f"""
            SELECT DISTINCT trade_no
            FROM bi_refund_order
            WHERE trade_no IN ({_placeholders(batch)})
              AND status = 3
              AND deleted_at IS NULL
        """, batch)
        refund_set.update(tn for (tn,) in cur.fetchall())
    return refund_set


def _fetch_characters(cur, account_ids):
    """Return (character_id, account_id, nickname) rows for named, non-deleted characters."""
    all_chars = []
    for batch in _chunks(account_ids, _ACCOUNT_BATCH):
        cur.execute(f"""
            SELECT id AS character_id, account_id, nickname
            FROM bi_vala_app_character
            WHERE account_id IN ({_placeholders(batch)})
              AND nickname IS NOT NULL
              AND nickname != ''
              AND deleted_at IS NULL
        """, batch)
        all_chars.extend(cur.fetchall())
    return all_chars


def _fetch_lesson_progress(conn, cur, char_ids):
    """Scan every play-record shard and summarise completed chapters per character.

    Returns a tuple ``(lesson_count, first_done, last_done)`` of dicts keyed by
    character id: number of (table, chapter) groups with ``play_status = 1``,
    earliest ``created_at`` and latest ``created_at``.

    A failing query is reported and skipped; the transaction is rolled back so
    that the remaining batches and tables can still be queried.
    """
    char_lesson_count = defaultdict(int)
    char_first_done = {}
    char_last_done = {}
    for tbl_idx in range(_LESSON_TABLE_COUNT):
        table = f"bi_user_chapter_play_record_{tbl_idx}"
        for batch in _chunks(char_ids, _CHARACTER_BATCH):
            try:
                cur.execute(f"""
                    SELECT user_id, chapter_id, MIN(created_at), MAX(created_at), COUNT(*)
                    FROM {table}
                    WHERE user_id IN ({_placeholders(batch)})
                      AND play_status = 1
                      AND deleted_at IS NULL
                    GROUP BY user_id, chapter_id
                """, batch)
                records = cur.fetchall()
            except psycopg2.Error as e:
                # Without a rollback the connection stays in the aborted
                # transaction and every later statement would fail as well.
                conn.rollback()
                print(f" 警告: {table} 查询失败: {e}")
                continue
            for user_id, _chapter_id, first_at, last_at, _plays in records:
                # One completed chapter per (table, user, chapter) group.
                char_lesson_count[user_id] += 1
                if first_at is not None and (
                    user_id not in char_first_done or first_at < char_first_done[user_id]
                ):
                    char_first_done[user_id] = first_at
                if last_at is not None and (
                    user_id not in char_last_done or last_at > char_last_done[user_id]
                ):
                    char_last_done[user_id] = last_at
    return char_lesson_count, char_first_done, char_last_done


def _build_user_rows(orders, refund_set, account_ids, account_chars,
                     char_lesson_count, char_first_done, char_last_done):
    """Aggregate orders and lesson progress into one report row (dict) per account.

    Rows follow the order of *account_ids*; accounts without orders are skipped.
    """
    user_orders = defaultdict(list)  # account_id -> list of order dicts
    for aid, tn, pay_date, amt, status, goods_name, key_from in orders:
        user_orders[aid].append({
            "trade_no": tn,
            "pay_date": pay_date,
            # presumably the integer amount is in cents — TODO confirm;
            # a NULL amount counts as 0 instead of raising.
            "amount": (amt or 0) / 100.0,
            "order_status": status,
            "goods_name": goods_name,
            "key_from": key_from,
            "is_refunded": tn in refund_set,
        })

    rows = []
    for aid in account_ids:
        my_orders = user_orders.get(aid, [])
        if not my_orders:
            continue

        # Lesson progress is summed over every character of the account.
        my_chars = account_chars.get(aid, [])
        total_lessons = sum(char_lesson_count.get(cid, 0) for cid in my_chars)
        first_lesson = min(
            (char_first_done[cid] for cid in my_chars if cid in char_first_done),
            default=None,
        )
        last_lesson = max(
            (char_last_done[cid] for cid in my_chars if cid in char_last_done),
            default=None,
        )

        # Order totals.
        total_orders = len(my_orders)
        refunded_orders = sum(1 for o in my_orders if o["is_refunded"])
        total_gmv = sum(o["amount"] for o in my_orders)
        total_refund = sum(o["amount"] for o in my_orders if o["is_refunded"])
        gsv = total_gmv - total_refund
        all_refunded = refunded_orders == total_orders and total_orders > 0
        first_pay = min((o["pay_date"] for o in my_orders if o["pay_date"]), default=None)

        # NOTE: "lessons completed before the refund" is not derived here. It
        # needs the refund timestamp and per-record completion times, neither
        # of which is loaded by this script.

        rows.append({
            "用户ID": aid,
            "角色数": len(my_chars),
            "总完成课时数": total_lessons,
            "首次行课时间": first_lesson,
            "最近行课时间": last_lesson,
            "首次购买时间": first_pay,
            "订单数": total_orders,
            "退款订单数": refunded_orders,
            "GMV": round(total_gmv, 2),
            "GSV": round(gsv, 2),
            "退款金额": round(total_refund, 2),
            "是否全部退款": "是" if all_refunded else "否",
            "是否退过款": "是" if refunded_orders > 0 else "否",
        })
    return rows


def _collect_user_rows(conn, cur):
    """Run the five data-loading steps and return the per-account report rows."""
    # ── 1. Paid orders ──
    print("\n[1/5] 获取付费用户...")
    orders = _fetch_orders(cur)
    print(f" 共 {len(orders)} 条订单")

    # ── 2. Refunds ──
    print("\n[2/5] 获取退款信息...")
    trade_nos = list({o[1] for o in orders if o[1]})
    refund_set = _fetch_refunded_trade_nos(cur, trade_nos)
    print(f" 退款订单 trade_no 数: {len(refund_set)}")

    # ── 3. Characters of the paying accounts ──
    print("\n[3/5] 获取用户角色...")
    account_ids = list({o[0] for o in orders})
    all_chars = _fetch_characters(cur, account_ids)
    print(f" 共 {len(all_chars)} 个角色")
    account_chars = defaultdict(list)  # account_id -> [character_id, ...]
    for cid, aid, _nickname in all_chars:
        account_chars[aid].append(cid)

    # ── 4. Completed lessons (all sharded tables) ──
    print("\n[4/5] 获取课时完成记录...")
    char_ids = list({c[0] for c in all_chars})
    char_lesson_count, char_first_done, char_last_done = _fetch_lesson_progress(
        conn, cur, char_ids
    )
    print(f" 有行课记录的角色: {len(char_lesson_count)}")

    # ── 5. Per-account aggregation ──
    print("\n[5/5] 构建分析数据...")
    return _build_user_rows(
        orders, refund_set, account_ids, account_chars,
        char_lesson_count, char_first_done, char_last_done,
    )


def _lesson_bucket(n):
    """Map a completed-lesson count to its bucket label."""
    if n == 0:
        return "0课时"
    elif n <= 3:
        return "1-3课时"
    elif n <= 7:
        return "4-7课时"
    elif n <= 15:
        return "8-15课时"
    elif n <= 30:
        return "16-30课时"
    elif n <= 60:
        return "31-60课时"
    else:
        return "60课时以上"


def _bucket_stats(df):
    """Return per-bucket account counts, refund counts/rates and GMV/GSV totals."""
    stats = df.groupby("课时桶").agg(
        用户数=("用户ID", "count"),
        退款用户数=("是否退过款", lambda x: (x == "是").sum()),
        全部退款用户数=("是否全部退款", lambda x: (x == "是").sum()),
        平均完成课时=("总完成课时数", "mean"),
        GMV总额=("GMV", "sum"),
        GSV总额=("GSV", "sum"),
    ).reindex(_BUCKET_ORDER).fillna(0)
    stats["退款率(用户)"] = (stats["退款用户数"] / stats["用户数"] * 100).round(1)
    stats["全部退款率(用户)"] = (stats["全部退款用户数"] / stats["用户数"] * 100).round(1)
    stats["平均完成课时"] = stats["平均完成课时"].round(1)
    stats["GMV总额"] = stats["GMV总额"].round(2)
    stats["GSV总额"] = stats["GSV总额"].round(2)
    return stats


def _print_report(df, bucket_stats, refund_users):
    """Print the four console report sections for a non-empty account frame."""
    print("\n" + "=" * 60)
    print("分析结果")
    print("=" * 60)

    # Section 1: refund rate per completed-lesson bucket.
    print("\n【维度1】按完成课时数分桶的退款率")
    print(bucket_stats.to_string())

    # Section 2: lesson progress of accounts with a refunded order.
    lessons = refund_users['总完成课时数']
    print("\n\n【维度2】退款用户的行课完成情况")
    print(f"退款用户总数: {len(refund_users)}")
    print(f" 其中0课时的: {(lessons == 0).sum()} 人")
    print(f" 其中1-3课时的: {((lessons >= 1) & (lessons <= 3)).sum()} 人")
    print(f" 其中4-7课时的: {((lessons >= 4) & (lessons <= 7)).sum()} 人")
    print(f" 其中8-15课时的: {((lessons >= 8) & (lessons <= 15)).sum()} 人")
    print(f" 其中16课时以上的: {(lessons >= 16).sum()} 人")
    print(f" 退款用户平均完成课时: {lessons.mean():.1f}")

    # Section 3: refunded vs. non-refunded accounts.
    print("\n\n【维度3】退款 vs 未退款用户行课对比")
    no_refund = df[df["是否退过款"] == "否"]
    fully_refunded = df[df['是否全部退款'] == '是']
    print(f" 未退款用户数: {len(no_refund)}, 平均完成课时: {no_refund['总完成课时数'].mean():.1f}")
    print(f" 退款用户数: {len(refund_users)}, 平均完成课时: {lessons.mean():.1f}")
    print(f" 全部退款用户数: {len(fully_refunded)}, 平均完成课时: {fully_refunded['总完成课时数'].mean():.1f}")

    # Section 4: overall totals. The caller guarantees len(df) > 0.
    print("\n\n【维度4】整体统计")
    total_users = len(df)
    total_refund_users = len(refund_users)
    print(f" 总付费用户: {total_users}")
    print(f" 总退款用户: {total_refund_users} ({total_refund_users/total_users*100:.1f}%)")
    print(f" 全部退款用户: {len(fully_refunded)}")
    print(f" 平均完成课时: {df['总完成课时数'].mean():.1f}")
    print(f" 总GMV: {df['GMV'].sum():.2f}")
    print(f" 总GSV: {df['GSV'].sum():.2f}")


def _write_excel(df, bucket_stats, refund_users):
    """Write the three-sheet workbook to ``../output`` and return its path."""
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "行课进度与退款率分析.xlsx")

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        # Sheet 1: per-account details, with any timezone info dropped from
        # the datetime columns before writing.
        df_out = df.drop(columns=["课时桶"], errors="ignore")
        for col in ["首次行课时间", "最近行课时间", "首次购买时间"]:
            if col in df_out.columns:
                df_out[col] = pd.to_datetime(df_out[col]).dt.tz_localize(None)
        df_out.to_excel(writer, sheet_name="用户明细", index=False)

        # Sheet 2: bucket statistics over all paying accounts.
        bucket_stats.to_excel(writer, sheet_name="分桶统计")

        # Sheet 3: bucket distribution of the accounts with a refund.
        refund_bucket = refund_users.groupby("课时桶").agg(
            用户数=("用户ID", "count"),
            平均完成课时=("总完成课时数", "mean"),
            GMV=("GMV", "sum"),
        ).reindex(_BUCKET_ORDER).fillna(0)
        refund_bucket["占比"] = (refund_bucket["用户数"] / len(refund_users) * 100).round(1)
        refund_bucket.to_excel(writer, sheet_name="退款用户分布")
    return output_path


def main():
    """Load the data, print the console report and write the Excel workbook.

    The connection and cursor are always released, even when a step fails.
    When no paying account is found the script reports that and exits
    without producing a workbook.
    """
    conn = get_conn()
    try:
        with conn.cursor() as cur:
            print("=" * 60)
            print("行课进度与退款率关系分析")
            print("=" * 60)
            rows = _collect_user_rows(conn, cur)
    finally:
        conn.close()

    if not rows:
        # An empty frame has no columns, so the analysis below cannot run.
        print("\n没有可分析的付费用户,未生成报表")
        return

    df = pd.DataFrame(rows)
    df["课时桶"] = df["总完成课时数"].apply(_lesson_bucket)
    bucket_stats = _bucket_stats(df)
    refund_users = df[df["是否退过款"] == "是"]

    _print_report(df, bucket_stats, refund_users)
    output_path = _write_excel(df, bucket_stats, refund_users)
    print(f"\n✅ 报表已生成: {output_path}")


if __name__ == "__main__":
    main()