# ai_member_xiaoxi/scripts/course_progress_refund_v2.py

#!/usr/bin/env python3
"""
行课进度与退款率关系分析（按购买等级匹配行课）

规则：
- 用户购买 L1+L2 联报课包(goods_id=61) → 只看 L1 行课
- 用户购买 L2 课包(goods_id IN 31,32,33,54) → 只看 L2 行课
- 用户购买 L1 课包(goods_id IN 57,60,63) → 只看 L1 行课
- 同时买了不同类型的 → 分别归入对应分类（一个用户可出现在多个分类）

行课完成：通过 bi_level_unit_lesson 获取 L1/L2 各自的 chapter_id 列表
"""

import os
import sys
import psycopg2
import pandas as pd
from collections import defaultdict
from datetime import datetime

# ── Database ──
# Connection settings passed to psycopg2.connect() by get_conn().
# The password is not stored here; get_password() resolves it at connect time.
DB_HOST = "bj-postgres-16pob4sg.sql.tencentcdb.com"
DB_PORT = 28591
DB_USER = "ai_member"
DB_NAME = "vala_bi"

def get_password():
    """Return the database password.

    Lookup order:
      1. The ``PG_ONLINE_PASSWORD`` environment variable, when non-empty.
      2. The first non-empty ``PG_ONLINE_PASSWORD=...`` line of ``secrets.env``
         located one directory above this script.

    Returns:
        The password string. When read from the file, surrounding whitespace
        and single/double quotes are removed.

    Raises:
        RuntimeError: If neither source provides a non-empty value.
    """
    pw = os.environ.get("PG_ONLINE_PASSWORD", "")
    if pw:
        return pw

    key = "PG_ONLINE_PASSWORD="
    secrets_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "secrets.env")
    if os.path.exists(secrets_path):
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(secrets_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line.startswith(key):
                    continue
                value = line[len(key):].strip().strip("'\"")
                # An empty assignment counts as "not configured", matching the
                # environment-variable branch above; keep scanning.
                if value:
                    return value
    raise RuntimeError("PG_ONLINE_PASSWORD not found")

def get_conn():
    """Open and return a new psycopg2 connection built from the module-level DB settings."""
    conn_kwargs = {
        "host": DB_HOST,
        "port": DB_PORT,
        "user": DB_USER,
        "password": get_password(),
        "dbname": DB_NAME,
        "connect_timeout": 60,
    }
    return psycopg2.connect(**conn_kwargs)

# ── Goods classification ──
# goods_id values grouped by the course level the package covers.
L1_GOODS = {57, 60, 63}
L2_GOODS = {31, 32, 33, 54}
L1L2_GOODS = {61}  # L1+L2 bundle; matched against L1 lessons


def classify_user(goods_ids):
    """Map the goods a user bought to ``(category label, lesson level)`` pairs.

    Args:
        goods_ids: Iterable of goods_id values from the user's orders.

    Returns:
        A list of ``(label, level)`` tuples, possibly empty when none of the
        ids belongs to a known group:
          * bundle buyers get ``("L1+L2联报", "L1")``, plus
            ``("联报+仅L2", "L2")`` when they also bought an L2 package;
          * otherwise ``("仅L1", "L1")`` and/or ``("仅L2", "L2")`` depending on
            which stand-alone packages were bought.
    """
    purchased = set(goods_ids)
    bought_l1 = not purchased.isdisjoint(L1_GOODS)
    bought_l2 = not purchased.isdisjoint(L2_GOODS)

    # A bundle purchase takes precedence over a stand-alone L1 purchase.
    if not purchased.isdisjoint(L1L2_GOODS):
        labels = [("L1+L2联报", "L1")]
        if bought_l2:
            # Bundle is tracked on L1, the extra L2 package on L2.
            labels.append(("联报+仅L2", "L2"))
        return labels

    labels = []
    if bought_l1:
        labels.append(("仅L1", "L1"))
    if bought_l2:
        labels.append(("仅L2", "L2"))
    return labels

def lesson_bucket(n):
    """Return the display label of the lesson-count bucket that ``n`` falls into.

    Exactly zero maps to "0课时"; any other value maps to the first bucket
    whose inclusive upper bound is >= n, and to "60课时以上" when it exceeds
    every bound.
    """
    if n == 0:
        return "0课时"

    # (inclusive upper bound, label), checked in ascending order.
    upper_bounds = (
        (3, "1-3课时"),
        (7, "4-7课时"),
        (15, "8-15课时"),
        (30, "16-30课时"),
        (60, "31-60课时"),
    )
    for limit, label in upper_bounds:
        if n <= limit:
            return label
    return "60课时以上"

# Labels produced by lesson_bucket(), listed from fewest to most lessons.
# main() reindexes the per-bucket tables with this list to fix the row order.
BUCKET_ORDER = ["0课时", "1-3课时", "4-7课时", "8-15课时", "16-30课时", "31-60课时", "60课时以上"]

# Column order of the per-user detail rows. Passing it to the DataFrame
# constructor keeps the expected columns even when no rows were produced.
_DETAIL_COLUMNS = [
    "用户ID", "购买分类", "行课等级", "完成课时数", "订单数", "退款订单数",
    "GMV", "GSV", "退款金额", "是否退过款", "是否全部退款",
]

# Purchase categories that get a bucket table on stdout and an Excel sheet.
# Rows labelled "联报+仅L2" only appear in the detail sheet.
_REPORT_CATEGORIES = ["仅L1", "仅L2", "L1+L2联报"]


def _chunks(items, size):
    """Yield consecutive slices of ``items`` holding at most ``size`` elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def _bucket_stats(df_cat):
    """Aggregate one category's rows per lesson bucket and add refund-rate columns.

    Returns a DataFrame indexed by bucket label in BUCKET_ORDER order, with
    buckets that contain no users removed.
    """
    stats = df_cat.groupby("课时桶").agg(
        用户数=("用户ID", "count"),
        退款用户数=("是否退过款", lambda x: (x == "是").sum()),
        全部退款用户数=("是否全部退款", lambda x: (x == "是").sum()),
        平均完成课时=("完成课时数", "mean"),
        GMV总额=("GMV", "sum"),
        GSV总额=("GSV", "sum"),
    ).reindex(BUCKET_ORDER).fillna(0)

    stats["退款率"] = (stats["退款用户数"] / stats["用户数"] * 100).round(1)
    stats["全部退款率"] = (stats["全部退款用户数"] / stats["用户数"] * 100).round(1)
    stats["平均完成课时"] = stats["平均完成课时"].round(1)
    stats["GMV总额"] = stats["GMV总额"].round(2)
    stats["GSV总额"] = stats["GSV总额"].round(2)

    # Drop buckets introduced by the reindex that have no users.
    return stats[stats["用户数"] > 0]


def main():
    """Run the lesson-progress vs. refund analysis and write the Excel report.

    Loads L1/L2 chapter ids, paid orders, refunded trade numbers, characters
    and completed-lesson records from the database, builds one row per
    (account, purchase category), prints per-bucket refund statistics, and
    saves the detail and bucket tables to ``../output`` relative to this
    script.
    """
    conn = get_conn()
    try:
        cur = conn.cursor()

        print("=" * 60)
        print("行课进度与退款率关系分析（按购买等级匹配行课）")
        print("=" * 60)

        # ── 0. Chapter ids belonging to L1 / L2 ──
        print("\n[0/6] 获取课程结构...")
        cur.execute("""
            SELECT id AS chapter_id, course_level
            FROM bi_level_unit_lesson
            WHERE course_level IN ('L1', 'L2')
        """)
        level_chapters = defaultdict(set)  # "L1" -> {chapter_ids}, "L2" -> {chapter_ids}
        for ch_id, lv in cur.fetchall():
            level_chapters[lv].add(ch_id)
        l1_chapters = level_chapters["L1"]
        l2_chapters = level_chapters["L2"]
        print(f"  L1 chapter数: {len(l1_chapters)}")
        print(f"  L2 chapter数: {len(l2_chapters)}")

        # ── 1. Paid orders (the account join is meant to exclude test accounts) ──
        print("\n[1/6] 获取付费订单...")
        cur.execute("""
            SELECT o.id AS order_id, o.account_id, o.trade_no, o.pay_success_date,
                   o.pay_amount_int, o.order_status, o.goods_id, o.goods_name, o.key_from
            FROM bi_vala_order o
            INNER JOIN bi_vala_app_account a ON o.account_id = a.id AND a.status = 1
            WHERE o.deleted_at IS NULL
              AND o.pay_success_date IS NOT NULL
              AND o.order_status IN (3, 4)
            ORDER BY o.account_id, o.pay_success_date
        """)
        orders = cur.fetchall()
        print(f"  共 {len(orders)} 条订单")

        # ── 2. Refunded trade numbers ──
        print("\n[2/6] 获取退款信息...")
        trade_nos = list({o[2] for o in orders if o[2]})
        refund_set = set()
        for batch in _chunks(trade_nos, 500):
            ph = ",".join(["%s"] * len(batch))
            cur.execute(f"""
                SELECT DISTINCT trade_no
                FROM bi_refund_order
                WHERE trade_no IN ({ph}) AND status = 3 AND deleted_at IS NULL
            """, batch)
            refund_set.update(tn for (tn,) in cur.fetchall())
        print(f"  退款 trade_no: {len(refund_set)}")

        # ── 3. Group orders per account and classify each account ──
        print("\n[3/6] 用户分类...")
        user_orders = defaultdict(list)
        for _, aid, tn, pay_date, amt, status, gid, gname, key_from in orders:
            user_orders[aid].append({
                "trade_no": tn,
                "pay_date": pay_date,
                # Amount is divided by 100; a NULL amount is treated as 0
                # instead of aborting the whole run.
                "amount": (amt or 0) / 100.0,
                "order_status": status,
                "goods_id": gid,
                "goods_name": gname,
                "key_from": key_from,
                "is_refunded": tn in refund_set,
            })

        # account_id -> [(category label, lesson level), ...]
        user_classifications = {
            aid: classify_user([o["goods_id"] for o in my_orders])
            for aid, my_orders in user_orders.items()
        }

        cat_count = defaultdict(int)
        for cats in user_classifications.values():
            for cat, _ in cats:
                cat_count[cat] += 1
        print("  用户分类统计:")
        for k, v in sorted(cat_count.items()):
            print(f"    {k}: {v} 人")

        # ── 4. Characters of the paying accounts ──
        print("\n[4/6] 获取角色...")
        all_account_ids = list(user_orders.keys())
        all_chars = []
        for batch in _chunks(all_account_ids, 1000):
            ph = ",".join(["%s"] * len(batch))
            cur.execute(f"""
                SELECT id AS character_id, account_id
                FROM bi_vala_app_character
                WHERE account_id IN ({ph})
                  AND nickname IS NOT NULL AND nickname != ''
                  AND deleted_at IS NULL
            """, batch)
            all_chars.extend(cur.fetchall())
        print(f"  共 {len(all_chars)} 个角色")

        account_chars = defaultdict(list)
        all_char_ids = []
        for cid, aid in all_chars:
            account_chars[aid].append(cid)
            all_char_ids.append(cid)

        # ── 5. Completed-lesson records, split by level via chapter_id ──
        print("\n[5/6] 获取课时完成记录...")
        # NOTE(review): every matching record is counted, so several records
        # for the same chapter count more than once — confirm whether distinct
        # chapters were intended.
        char_lesson_l1 = defaultdict(int)
        char_lesson_l2 = defaultdict(int)

        for tbl_idx in range(8):
            table = f"bi_user_chapter_play_record_{tbl_idx}"
            for batch in _chunks(all_char_ids, 2000):
                ph = ",".join(["%s"] * len(batch))
                try:
                    cur.execute(f"""
                        SELECT user_id, chapter_id
                        FROM {table}
                        WHERE user_id IN ({ph})
                          AND play_status = 1
                          AND deleted_at IS NULL
                    """, batch)
                    records = cur.fetchall()
                except Exception as e:
                    print(f"  警告: {table} 查询失败: {e}")
                    # A failed statement leaves the psycopg2 transaction
                    # aborted; without a rollback every later query on this
                    # connection would fail as well.
                    conn.rollback()
                    continue
                for user_id, ch_id in records:
                    if ch_id in l1_chapters:
                        char_lesson_l1[user_id] += 1
                    if ch_id in l2_chapters:
                        char_lesson_l2[user_id] += 1

        print(f"  有L1行课的角色: {len(char_lesson_l1)}")
        print(f"  有L2行课的角色: {len(char_lesson_l2)}")
    finally:
        # Release the connection even when a query above raised.
        conn.close()

    # ── 6. Build one row per (account_id, category, level) ──
    print("\n[6/6] 构建分析...")

    rows = []
    for aid, my_orders in user_orders.items():
        classifications = user_classifications.get(aid, [])
        if not classifications:
            continue
        my_chars = account_chars.get(aid, [])

        # Completed lessons per level, summed over all of the account's characters.
        total_l1 = sum(char_lesson_l1.get(cid, 0) for cid in my_chars)
        total_l2 = sum(char_lesson_l2.get(cid, 0) for cid in my_chars)

        # NOTE(review): order/refund figures cover ALL of the account's paid
        # orders, not only those of the category being reported.
        total_orders = len(my_orders)
        refunded_orders = sum(1 for o in my_orders if o["is_refunded"])
        total_gmv = sum(o["amount"] for o in my_orders)
        total_refund = sum(o["amount"] for o in my_orders if o["is_refunded"])
        all_refunded = refunded_orders == total_orders and total_orders > 0
        has_any_refund = refunded_orders > 0

        for cat_label, watch_level in classifications:
            lesson_count = total_l1 if watch_level == "L1" else total_l2
            rows.append({
                "用户ID": aid,
                "购买分类": cat_label,
                "行课等级": watch_level,
                "完成课时数": lesson_count,
                "订单数": total_orders,
                "退款订单数": refunded_orders,
                "GMV": round(total_gmv, 2),
                "GSV": round(total_gmv - total_refund, 2),
                "退款金额": round(total_refund, 2),
                "是否退过款": "是" if has_any_refund else "否",
                "是否全部退款": "是" if all_refunded else "否",
            })

    # Explicit columns so an empty result set does not raise KeyError below.
    df = pd.DataFrame(rows, columns=_DETAIL_COLUMNS)

    # ── Console report ──
    print("\n" + "=" * 60)
    print("分析结果")
    print("=" * 60)

    df["课时桶"] = df["完成课时数"].apply(lesson_bucket)

    category_stats = {}  # category label -> bucket table, reused for the Excel sheets
    for cat_label in _REPORT_CATEGORIES:
        df_cat = df[df["购买分类"] == cat_label]
        if len(df_cat) == 0:
            print(f"\n【{cat_label}】无数据")
            continue

        print(f"\n{'='*50}")
        print(f"【{cat_label}】({len(df_cat)} 用户)")
        print(f"{'='*50}")

        bucket_stats = _bucket_stats(df_cat)
        category_stats[cat_label] = bucket_stats
        print(bucket_stats.to_string())

        # Refunded vs. non-refunded users within the category.
        refund_users = df_cat[df_cat["是否退过款"] == "是"]
        no_refund_users = df_cat[df_cat["是否退过款"] == "否"]
        print(f"\n  退款用户: {len(refund_users)} 人, 平均完成课时: {refund_users['完成课时数'].mean():.1f}")
        print(f"  未退款用户: {len(no_refund_users)} 人, 平均完成课时: {no_refund_users['完成课时数'].mean():.1f}")
        print(f"  整体退费率: {len(refund_users)/len(df_cat)*100:.1f}%")

    # ── Excel output ──
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "行课进度与退款率分析_按等级.xlsx")

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        # Sheet 1: per-user detail rows (without the helper bucket column).
        df_out = df.drop(columns=["课时桶"], errors="ignore")
        df_out.to_excel(writer, sheet_name="用户明细", index=False)

        # Following sheets: one bucket table per non-empty category.
        for cat_label, bucket_stats in category_stats.items():
            sheet_name = cat_label[:31]  # Excel sheet names are limited to 31 characters
            bucket_stats.to_excel(writer, sheet_name=sheet_name)

    print(f"\n✅ 报表已生成: {output_path}")


# Run the analysis only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()