#!/usr/bin/env python3
"""
Lesson progress vs. refund rate analysis (lesson progress matched to the purchased level).

Rules:
- Bought the L1+L2 bundle (goods_id=61)           -> only L1 lesson progress is counted
- Bought an L2 package (goods_id in 31,32,33,54)  -> only L2 lesson progress is counted
- Bought an L1 package (goods_id in 57,60,63)     -> only L1 lesson progress is counted
- Bought several different types -> the user is placed in each matching category
  (one user may appear in more than one category)

Lesson completion: the chapter_id sets of L1 and L2 are read from bi_level_unit_lesson.
"""

import os
import sys
import psycopg2
import pandas as pd
from collections import defaultdict
from datetime import datetime

# ── Database ──
DB_HOST = "bj-postgres-16pob4sg.sql.tencentcdb.com"
DB_PORT = 28591
DB_USER = "ai_member"
DB_NAME = "vala_bi"


def get_password():
    """Return the database password.

    The PG_ONLINE_PASSWORD environment variable wins; otherwise the value is
    read from a ``PG_ONLINE_PASSWORD=...`` line in ``../secrets.env`` (relative
    to this file), with surrounding quotes stripped.

    Raises:
        RuntimeError: if neither source provides the password.
    """
    pw = os.environ.get("PG_ONLINE_PASSWORD", "")
    if pw:
        return pw
    secrets_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "..", "secrets.env"
    )
    if os.path.exists(secrets_path):
        with open(secrets_path, encoding="utf-8") as f:
            for line in f:
                if line.startswith("PG_ONLINE_PASSWORD="):
                    return line.strip().split("=", 1)[1].strip("'\"")
    raise RuntimeError("PG_ONLINE_PASSWORD not found")


def get_conn():
    """Open a new psycopg2 connection using the module-level DB_* settings."""
    return psycopg2.connect(
        host=DB_HOST,
        port=DB_PORT,
        user=DB_USER,
        password=get_password(),
        dbname=DB_NAME,
        connect_timeout=60,
    )


# ── Goods classification ──
L1_GOODS = {57, 60, 63}
L2_GOODS = {31, 32, 33, 54}
L1L2_GOODS = {61}  # bundle -> L1 lesson progress is used

# Category labels that get a bucket breakdown (console output and Excel sheets).
# Rows labelled "联报+仅L2" are produced by classify_user but only appear in the
# detail sheet, because they are not listed here.
_REPORT_CATEGORIES = ["仅L1", "仅L2", "L1+L2联报"]


def classify_user(goods_ids):
    """Map the goods a user bought to a list of (category label, lesson level).

    Args:
        goods_ids: iterable of goods_id values from the user's orders.

    Returns:
        A list of ``(label, level)`` tuples, where level is "L1" or "L2".
        Empty when none of the ids belong to a known goods group.
    """
    goods_set = set(goods_ids)
    result = []
    has_l1l2 = bool(goods_set & L1L2_GOODS)
    has_l1 = bool(goods_set & L1_GOODS)
    has_l2 = bool(goods_set & L2_GOODS)
    if has_l1l2:
        result.append(("L1+L2联报", "L1"))
    if has_l1 and not has_l1l2:
        # L1 only (no bundle)
        result.append(("仅L1", "L1"))
    if has_l2 and not has_l1l2:
        # L2 only (no bundle)
        result.append(("仅L2", "L2"))
    if has_l1l2 and has_l2:
        # bundle + L2: the bundle row looks at L1, this extra row looks at L2
        result.append(("联报+仅L2", "L2"))
    return result


def lesson_bucket(n):
    """Return the bucket label for a completed-lesson count ``n``."""
    if n == 0:
        return "0课时"
    elif n <= 3:
        return "1-3课时"
    elif n <= 7:
        return "4-7课时"
    elif n <= 15:
        return "8-15课时"
    elif n <= 30:
        return "16-30课时"
    elif n <= 60:
        return "31-60课时"
    else:
        return "60课时以上"


BUCKET_ORDER = ["0课时", "1-3课时", "4-7课时", "8-15课时", "16-30课时", "31-60课时", "60课时以上"]


def _load_analysis_rows(conn, cur):
    """Run every query and return one dict per (account, purchase category)."""
    print("=" * 60)
    print("行课进度与退款率关系分析(按购买等级匹配行课)")
    print("=" * 60)

    # ── 0. Chapter ids of L1 / L2 ──
    print("\n[0/6] 获取课程结构...")
    cur.execute("""
        SELECT id AS chapter_id, course_level
        FROM bi_level_unit_lesson
        WHERE course_level IN ('L1', 'L2')
    """)
    level_chapters = defaultdict(set)  # "L1" -> {chapter_ids}, "L2" -> {chapter_ids}
    for ch_id, lv in cur.fetchall():
        level_chapters[lv].add(ch_id)
    l1_chapters = level_chapters["L1"]
    l2_chapters = level_chapters["L2"]
    print(f" L1 chapter数: {len(l1_chapters)}")
    print(f" L2 chapter数: {len(l2_chapters)}")

    # ── 1. Paid orders (joined to accounts with status = 1) ──
    print("\n[1/6] 获取付费订单...")
    cur.execute("""
        SELECT o.id AS order_id, o.account_id, o.trade_no, o.pay_success_date,
               o.pay_amount_int, o.order_status, o.goods_id, o.goods_name, o.key_from
        FROM bi_vala_order o
        INNER JOIN bi_vala_app_account a ON o.account_id = a.id AND a.status = 1
        WHERE o.deleted_at IS NULL
          AND o.pay_success_date IS NOT NULL
          AND o.order_status IN (3, 4)
        ORDER BY o.account_id, o.pay_success_date
    """)
    orders = cur.fetchall()
    print(f" 共 {len(orders)} 条订单")

    # ── 2. Refunds ──
    print("\n[2/6] 获取退款信息...")
    trade_nos = list({o[2] for o in orders if o[2]})
    refund_set = set()
    for i in range(0, len(trade_nos), 500):
        batch = trade_nos[i:i + 500]
        ph = ",".join(["%s"] * len(batch))
        cur.execute(f"""
            SELECT DISTINCT trade_no
            FROM bi_refund_order
            WHERE trade_no IN ({ph}) AND status = 3 AND deleted_at IS NULL
        """, batch)
        refund_set.update(tn for (tn,) in cur.fetchall())
    print(f" 退款 trade_no: {len(refund_set)}")

    # ── 3. Group orders per account and classify ──
    print("\n[3/6] 用户分类...")
    user_orders = defaultdict(list)
    for _, aid, tn, pay_date, amt, status, gid, gname, key_from in orders:
        user_orders[aid].append({
            "trade_no": tn,
            "pay_date": pay_date,
            "amount": amt / 100.0,
            "order_status": status,
            "goods_id": gid,
            "goods_name": gname,
            "key_from": key_from,
            "is_refunded": tn in refund_set,
        })

    # account_id -> [(category label, lesson level), ...]
    user_classifications = {
        aid: classify_user([o["goods_id"] for o in my_orders])
        for aid, my_orders in user_orders.items()
    }

    cat_count = defaultdict(int)
    for cats in user_classifications.values():
        for cat, _lv in cats:
            cat_count[cat] += 1
    print(" 用户分类统计:")
    for k, v in sorted(cat_count.items()):
        print(f" {k}: {v} 人")

    # ── 4. Characters of those accounts ──
    print("\n[4/6] 获取角色...")
    all_account_ids = list(user_orders.keys())
    all_chars = []
    for i in range(0, len(all_account_ids), 1000):
        batch = all_account_ids[i:i + 1000]
        ph = ",".join(["%s"] * len(batch))
        cur.execute(f"""
            SELECT id AS character_id, account_id
            FROM bi_vala_app_character
            WHERE account_id IN ({ph})
              AND nickname IS NOT NULL AND nickname != ''
              AND deleted_at IS NULL
        """, batch)
        all_chars.extend(cur.fetchall())
    print(f" 共 {len(all_chars)} 个角色")

    account_chars = defaultdict(list)
    all_char_ids = []
    for cid, aid in all_chars:
        account_chars[aid].append(cid)
        all_char_ids.append(cid)

    # ── 5. Completed-lesson records (split by level via chapter_id) ──
    print("\n[5/6] 获取课时完成记录...")
    # Completed-record count per character and level. Every matching record
    # adds one, so repeated records for the same chapter are counted repeatedly.
    char_lesson_l1 = defaultdict(int)
    char_lesson_l2 = defaultdict(int)
    for tbl_idx in range(8):
        table = f"bi_user_chapter_play_record_{tbl_idx}"
        for i in range(0, len(all_char_ids), 2000):
            batch = all_char_ids[i:i + 2000]
            ph = ",".join(["%s"] * len(batch))
            try:
                cur.execute(f"""
                    SELECT user_id, chapter_id
                    FROM {table}
                    WHERE user_id IN ({ph}) AND play_status = 1 AND deleted_at IS NULL
                """, batch)
                records = cur.fetchall()
            except psycopg2.Error as e:
                # A failed statement leaves the transaction aborted; without a
                # rollback every later query on this connection fails as well,
                # which would silently drop all remaining batches and shards.
                conn.rollback()
                print(f" 警告: {table} 查询失败: {e}")
                continue
            for user_id, ch_id in records:
                if ch_id in l1_chapters:
                    char_lesson_l1[user_id] += 1
                if ch_id in l2_chapters:
                    char_lesson_l2[user_id] += 1
    print(f" 有L1行课的角色: {len(char_lesson_l1)}")
    print(f" 有L2行课的角色: {len(char_lesson_l2)}")

    # ── 6. One row per (account_id, category, level) ──
    print("\n[6/6] 构建分析...")
    rows = []
    for aid, my_orders in user_orders.items():
        classifications = user_classifications.get(aid, [])
        if not my_orders or not classifications:
            continue
        my_chars = account_chars.get(aid, [])

        # Completed lessons per level, summed over the account's characters.
        total_l1 = sum(char_lesson_l1.get(cid, 0) for cid in my_chars)
        total_l2 = sum(char_lesson_l2.get(cid, 0) for cid in my_chars)

        # Order figures are account-wide: a user listed under several
        # categories carries the same totals in each of those rows.
        total_orders = len(my_orders)
        refunded_orders = sum(1 for o in my_orders if o["is_refunded"])
        total_gmv = sum(o["amount"] for o in my_orders)
        total_refund = sum(o["amount"] for o in my_orders if o["is_refunded"])
        all_refunded = refunded_orders == total_orders and total_orders > 0
        has_any_refund = refunded_orders > 0

        for cat_label, watch_level in classifications:
            lesson_count = total_l1 if watch_level == "L1" else total_l2
            rows.append({
                "用户ID": aid,
                "购买分类": cat_label,
                "行课等级": watch_level,
                "完成课时数": lesson_count,
                "订单数": total_orders,
                "退款订单数": refunded_orders,
                "GMV": round(total_gmv, 2),
                "GSV": round(total_gmv - total_refund, 2),
                "退款金额": round(total_refund, 2),
                "是否退过款": "是" if has_any_refund else "否",
                "是否全部退款": "是" if all_refunded else "否",
            })
    return rows


def _bucket_stats(df_cat):
    """Aggregate one category's rows per lesson bucket, in BUCKET_ORDER, dropping empty buckets."""
    stats = df_cat.groupby("课时桶").agg(
        用户数=("用户ID", "count"),
        退款用户数=("是否退过款", lambda x: (x == "是").sum()),
        全部退款用户数=("是否全部退款", lambda x: (x == "是").sum()),
        平均完成课时=("完成课时数", "mean"),
        GMV总额=("GMV", "sum"),
        GSV总额=("GSV", "sum"),
    ).reindex(BUCKET_ORDER).fillna(0)
    # Drop buckets without users before computing ratios (avoids 0/0 rows).
    stats = stats[stats["用户数"] > 0].copy()
    stats["退款率"] = (stats["退款用户数"] / stats["用户数"] * 100).round(1)
    stats["全部退款率"] = (stats["全部退款用户数"] / stats["用户数"] * 100).round(1)
    stats["平均完成课时"] = stats["平均完成课时"].round(1)
    stats["GMV总额"] = stats["GMV总额"].round(2)
    stats["GSV总额"] = stats["GSV总额"].round(2)
    return stats


def main():
    """Query the database, print the per-category bucket analysis and write the Excel report."""
    conn = get_conn()
    try:
        with conn.cursor() as cur:
            rows = _load_analysis_rows(conn, cur)
    finally:
        # Release the connection even when a query or the row building fails.
        conn.close()

    df = pd.DataFrame(rows)

    # ── Analysis output ──
    print("\n" + "=" * 60)
    print("分析结果")
    print("=" * 60)

    if df.empty:
        # Without rows the frame has no columns, so the column lookups below
        # would raise KeyError.
        print("\n无可分析数据,未生成报表")
        return

    df["课时桶"] = df["完成课时数"].apply(lesson_bucket)

    # ── Per purchase category ──
    stats_by_cat = {}
    for cat_label in _REPORT_CATEGORIES:
        df_cat = df[df["购买分类"] == cat_label]
        if len(df_cat) == 0:
            print(f"\n【{cat_label}】无数据")
            continue

        print(f"\n{'='*50}")
        print(f"【{cat_label}】({len(df_cat)} 用户)")
        print(f"{'='*50}")

        bucket_stats = _bucket_stats(df_cat)
        stats_by_cat[cat_label] = bucket_stats
        print(bucket_stats.to_string())

        # Refunded vs. non-refunded users
        refund_users = df_cat[df_cat["是否退过款"] == "是"]
        no_refund_users = df_cat[df_cat["是否退过款"] == "否"]
        print(f"\n 退款用户: {len(refund_users)} 人, 平均完成课时: {refund_users['完成课时数'].mean():.1f}")
        print(f" 未退款用户: {len(no_refund_users)} 人, 平均完成课时: {no_refund_users['完成课时数'].mean():.1f}")
        print(f" 整体退费率: {len(refund_users)/len(df_cat)*100:.1f}%")

    # ── Excel output ──
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "行课进度与退款率分析_按等级.xlsx")

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        # Sheet 1: per-user detail (all categories, bucket column removed)
        df_out = df.drop(columns=["课时桶"], errors="ignore")
        df_out.to_excel(writer, sheet_name="用户明细", index=False)

        # Following sheets: bucket table of every category that has data
        for cat_label, bucket_stats in stats_by_cat.items():
            sheet_name = cat_label[:31]  # Excel sheet name max 31 chars
            bucket_stats.to_excel(writer, sheet_name=sheet_name)

    print(f"\n✅ 报表已生成: {output_path}")


if __name__ == "__main__":
    main()