ai_member_xiaoxi/scripts/course_progress_refund_analysis.py
2026-06-02 08:00:01 +08:00

333 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Analysis of the relationship between course progress and refund rate.

Analysis dimensions:
1. Bucket users by number of completed lessons and compute each bucket's refund rate
2. Distribution of lessons completed by refunding users before their refund
3. Course progress vs. refund rate for different course levels (L1/L2)

NOTE(review): the per-course-level (L1/L2) breakdown listed above is not
performed by main() as it appears in this file -- confirm whether it is
still planned.
"""
import os
import sys
import psycopg2
import pandas as pd
from collections import defaultdict
# ── Database connection settings ──
# These values are passed to psycopg2.connect() by get_conn(); the password is
# resolved separately by get_password() and is not stored in this file.
DB_HOST = "bj-postgres-16pob4sg.sql.tencentcdb.com"  # server host name
DB_PORT = 28591  # server TCP port
DB_USER = "ai_member"  # login role
DB_NAME = "vala_bi"  # database name
def get_password():
    """Return the database password.

    Lookup order:
      1. The ``PG_ONLINE_PASSWORD`` environment variable, when non-empty.
      2. The first non-empty ``PG_ONLINE_PASSWORD=`` entry in ``secrets.env``
         located one directory above this script. Surrounding single or
         double quotes are stripped from the value.

    Returns:
        str: the non-empty password.

    Raises:
        RuntimeError: if neither source yields a non-empty password.
    """
    pw = os.environ.get("PG_ONLINE_PASSWORD", "")
    if pw:
        return pw
    secrets_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "secrets.env")
    if os.path.exists(secrets_path):
        # Explicit encoding so parsing does not depend on the host locale.
        with open(secrets_path, encoding="utf-8") as f:
            for line in f:
                if line.startswith("PG_ONLINE_PASSWORD="):
                    value = line.strip().split("=", 1)[1].strip("'\"")
                    # An empty entry is treated as "not set", exactly like an
                    # empty environment variable, instead of being returned.
                    if value:
                        return value
    raise RuntimeError("PG_ONLINE_PASSWORD not found")
def get_conn():
    """Open and return a new psycopg2 connection built from the module-level
    DB_* settings and the password supplied by get_password()."""
    params = {
        "host": DB_HOST,
        "port": DB_PORT,
        "user": DB_USER,
        "password": get_password(),
        "dbname": DB_NAME,
        "connect_timeout": 60,
    }
    return psycopg2.connect(**params)
# Labels stored in the yes/no flag columns. They must be distinct, otherwise
# the report filters cannot tell refunded users apart from everyone else.
_FLAG_YES = "是"
_FLAG_NO = "否"

# Display order of the completed-lesson buckets.
_BUCKET_ORDER = ["0课时", "1-3课时", "4-7课时", "8-15课时", "16-30课时", "31-60课时", "60课时以上"]

# Number of bi_user_chapter_play_record_<n> shard tables that are scanned.
_PLAY_RECORD_SHARDS = 8


def _chunks(items, size):
    """Yield consecutive slices of *items* holding at most *size* elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def _placeholders(count):
    """Return *count* comma-separated ``%s`` markers for a SQL ``IN (...)`` list."""
    return ",".join(["%s"] * count)


def _lesson_bucket(n):
    """Map a completed-lesson count to its bucket label from _BUCKET_ORDER."""
    if n == 0:
        return "0课时"
    for upper, label in ((3, "1-3课时"), (7, "4-7课时"), (15, "8-15课时"),
                         (30, "16-30课时"), (60, "31-60课时")):
        if n <= upper:
            return label
    return "60课时以上"


def _fetch_orders(cur):
    """Fetch order rows with status 3 or 4 that have a payment date, joined to accounts with status = 1."""
    cur.execute("""
        SELECT o.account_id, o.trade_no, o.pay_success_date, o.pay_amount_int, o.order_status,
               o.goods_name, o.key_from
        FROM bi_vala_order o
        INNER JOIN bi_vala_app_account a ON o.account_id = a.id AND a.status = 1
        WHERE o.deleted_at IS NULL
          AND o.pay_success_date IS NOT NULL
          AND o.order_status IN (3, 4)
        ORDER BY o.account_id, o.pay_success_date
    """)
    return cur.fetchall()


def _fetch_refunded_trade_nos(cur, orders):
    """Return the set of trade_nos from *orders* that have a refund row with status = 3."""
    trade_nos = list({row[1] for row in orders if row[1]})
    refunded = set()
    for batch in _chunks(trade_nos, 500):
        cur.execute(f"""
            SELECT DISTINCT trade_no
            FROM bi_refund_order
            WHERE trade_no IN ({_placeholders(len(batch))})
              AND status = 3
              AND deleted_at IS NULL
        """, batch)
        refunded.update(tn for (tn,) in cur.fetchall())
    return refunded


def _fetch_characters(cur, account_ids):
    """Return (character_id, account_id, nickname) rows for the given accounts."""
    characters = []
    for batch in _chunks(account_ids, 1000):
        cur.execute(f"""
            SELECT id AS character_id, account_id, nickname
            FROM bi_vala_app_character
            WHERE account_id IN ({_placeholders(len(batch))})
              AND nickname IS NOT NULL AND nickname != ''
              AND deleted_at IS NULL
        """, batch)
        characters.extend(cur.fetchall())
    return characters


def _fetch_lesson_stats(conn, cur, char_ids):
    """Scan every play-record shard and return (lesson_count, first_done, last_done) keyed by character id."""
    lesson_count = defaultdict(int)  # character_id -> number of distinct completed chapters
    first_done = {}                  # character_id -> earliest completion timestamp
    last_done = {}                   # character_id -> latest completion timestamp
    for shard in range(_PLAY_RECORD_SHARDS):
        table = f"bi_user_chapter_play_record_{shard}"
        for batch in _chunks(char_ids, 2000):
            try:
                cur.execute(f"""
                    SELECT user_id, chapter_id, MIN(created_at), MAX(created_at), COUNT(*)
                    FROM {table}
                    WHERE user_id IN ({_placeholders(len(batch))})
                      AND play_status = 1
                      AND deleted_at IS NULL
                    GROUP BY user_id, chapter_id
                """, batch)
                records = cur.fetchall()
            except psycopg2.Error as e:
                print(f" 警告: {table} 查询失败: {e}")
                # A failed statement leaves the transaction aborted; without a
                # rollback every later query on this connection would fail too.
                conn.rollback()
                continue
            for user_id, _chapter_id, first_at, last_at, _plays in records:
                # One row per (user, chapter): each row is one completed lesson.
                lesson_count[user_id] += 1
                if first_at is not None and (user_id not in first_done or first_at < first_done[user_id]):
                    first_done[user_id] = first_at
                if last_at is not None and (user_id not in last_done or last_at > last_done[user_id]):
                    last_done[user_id] = last_at
    return lesson_count, first_done, last_done


def _group_orders(orders, refund_set):
    """Group raw order rows by account id, tagging each order with its refund state."""
    user_orders = defaultdict(list)
    for aid, tn, pay_date, amt, status, goods_name, key_from in orders:
        user_orders[aid].append({
            "trade_no": tn,
            "pay_date": pay_date,
            # pay_amount_int is divided by 100 to get the reported amount;
            # a NULL amount is counted as 0 instead of raising.
            "amount": (amt or 0) / 100.0,
            "order_status": status,
            "goods_name": goods_name,
            "key_from": key_from,
            "is_refunded": tn in refund_set,
        })
    return user_orders


def _build_user_rows(user_orders, account_chars, lesson_count, first_done, last_done):
    """Build one summary dict per account combining order, refund and lesson figures."""
    rows = []
    for aid, my_orders in user_orders.items():
        my_chars = account_chars.get(aid, [])
        # Lessons are summed over every character owned by the account.
        total_lessons = sum(lesson_count.get(cid, 0) for cid in my_chars)
        first_lesson = min((first_done[cid] for cid in my_chars if cid in first_done), default=None)
        last_lesson = max((last_done[cid] for cid in my_chars if cid in last_done), default=None)

        total_orders = len(my_orders)
        refunded_orders = sum(1 for o in my_orders if o["is_refunded"])
        total_gmv = sum(o["amount"] for o in my_orders)
        total_refund = sum(o["amount"] for o in my_orders if o["is_refunded"])
        gsv = total_gmv - total_refund
        all_refunded = total_orders > 0 and refunded_orders == total_orders
        first_pay = min((o["pay_date"] for o in my_orders if o["pay_date"]), default=None)

        rows.append({
            "用户ID": aid,
            "角色数": len(my_chars),
            "总完成课时数": total_lessons,
            "首次行课时间": first_lesson,
            "最近行课时间": last_lesson,
            "首次购买时间": first_pay,
            "订单数": total_orders,
            "退款订单数": refunded_orders,
            "GMV": round(total_gmv, 2),
            "GSV": round(gsv, 2),
            "退款金额": round(total_refund, 2),
            "是否全部退款": _FLAG_YES if all_refunded else _FLAG_NO,
            "是否退过款": _FLAG_YES if refunded_orders > 0 else _FLAG_NO,
        })
    return rows


def _bucket_stats(df):
    """Aggregate per-bucket user counts, refund counts/rates and GMV/GSV totals."""
    stats = df.groupby("课时桶").agg(
        用户数=("用户ID", "count"),
        退款用户数=("是否退过款", lambda x: (x == _FLAG_YES).sum()),
        全部退款用户数=("是否全部退款", lambda x: (x == _FLAG_YES).sum()),
        平均完成课时=("总完成课时数", "mean"),
        GMV总额=("GMV", "sum"),
        GSV总额=("GSV", "sum"),
    ).reindex(_BUCKET_ORDER).fillna(0)
    stats["退款率(用户)"] = (stats["退款用户数"] / stats["用户数"] * 100).round(1)
    stats["全部退款率(用户)"] = (stats["全部退款用户数"] / stats["用户数"] * 100).round(1)
    stats["平均完成课时"] = stats["平均完成课时"].round(1)
    stats["GMV总额"] = stats["GMV总额"].round(2)
    stats["GSV总额"] = stats["GSV总额"].round(2)
    return stats


def _print_report(df, bucket_stats, refund_users):
    """Print the four console report sections for the per-user frame *df*."""
    print("\n" + "=" * 60)
    print("分析结果")
    print("=" * 60)

    print("\n【维度1】按完成课时数分桶的退款率")
    print(bucket_stats.to_string())

    print("\n\n【维度2】退款用户的行课完成情况")
    done = refund_users["总完成课时数"]
    print(f"退款用户总数: {len(refund_users)}")
    print(f" 其中0课时的: {(done == 0).sum()}")
    print(f" 其中1-3课时的: {((done >= 1) & (done <= 3)).sum()}")
    print(f" 其中4-7课时的: {((done >= 4) & (done <= 7)).sum()}")
    print(f" 其中8-15课时的: {((done >= 8) & (done <= 15)).sum()}")
    print(f" 其中16课时以上的: {(done >= 16).sum()}")
    print(f" 退款用户平均完成课时: {done.mean():.1f}")

    print("\n\n【维度3】退款 vs 未退款用户行课对比")
    no_refund = df[df["是否退过款"] == _FLAG_NO]
    fully_refunded = df[df["是否全部退款"] == _FLAG_YES]
    print(f" 未退款用户数: {len(no_refund)}, 平均完成课时: {no_refund['总完成课时数'].mean():.1f}")
    print(f" 退款用户数: {len(refund_users)}, 平均完成课时: {done.mean():.1f}")
    print(f" 全部退款用户数: {len(fully_refunded)}, 平均完成课时: {fully_refunded['总完成课时数'].mean():.1f}")

    print("\n\n【维度4】整体统计")
    total_users = len(df)
    total_refund_users = len(refund_users)
    print(f" 总付费用户: {total_users}")
    print(f" 总退款用户: {total_refund_users} ({total_refund_users/total_users*100:.1f}%)")
    print(f" 全部退款用户: {len(fully_refunded)}")
    print(f" 平均完成课时: {df['总完成课时数'].mean():.1f}")
    print(f" 总GMV: {df['GMV'].sum():.2f}")
    print(f" 总GSV: {df['GSV'].sum():.2f}")


def _export_excel(df, bucket_stats, refund_users, output_path):
    """Write the user detail, bucket statistics and refund distribution sheets to *output_path*."""
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        # Sheet 1: per-user detail (the helper bucket column is dropped).
        df_out = df.drop(columns=["课时桶"], errors="ignore")
        for col in ["首次行课时间", "最近行课时间", "首次购买时间"]:
            if col in df_out.columns:
                # Time zone info is removed before writing the sheet.
                df_out[col] = pd.to_datetime(df_out[col]).dt.tz_localize(None)
        df_out.to_excel(writer, sheet_name="用户明细", index=False)

        # Sheet 2: per-bucket statistics.
        bucket_stats.to_excel(writer, sheet_name="分桶统计")

        # Sheet 3: bucket distribution of users with at least one refund.
        refund_bucket = refund_users.groupby("课时桶").agg(
            用户数=("用户ID", "count"),
            平均完成课时=("总完成课时数", "mean"),
            GMV=("GMV", "sum"),
        ).reindex(_BUCKET_ORDER).fillna(0)
        refund_bucket["占比"] = (refund_bucket["用户数"] / len(refund_users) * 100).round(1)
        refund_bucket.to_excel(writer, sheet_name="退款用户分布")


def main():
    """Run the course-progress vs. refund-rate analysis end to end.

    Steps:
      1. Load paid orders, refunded trade numbers, characters and completed
         lessons from the database.
      2. Build one summary row per account (orders, GMV/GSV, refunds, lessons).
      3. Print the bucketed refund report to stdout.
      4. Write the report to ``../output/行课进度与退款率分析.xlsx`` relative to
         this script.

    The database connection is always closed, even when a step fails. If no
    paid user is found, a message is printed and no report is produced.

    Raises:
        RuntimeError: propagated from get_password() when no password is configured.
        psycopg2.Error: for database failures other than a failed shard query,
            which is reported as a warning and skipped.
    """
    conn = get_conn()
    try:
        with conn.cursor() as cur:
            print("=" * 60)
            print("行课进度与退款率关系分析")
            print("=" * 60)

            print("\n[1/5] 获取付费用户...")
            orders = _fetch_orders(cur)
            print(f"{len(orders)} 条订单")

            print("\n[2/5] 获取退款信息...")
            refund_set = _fetch_refunded_trade_nos(cur, orders)
            print(f" 退款订单 trade_no 数: {len(refund_set)}")

            print("\n[3/5] 获取用户角色...")
            # Grouping keeps the ORDER BY account order, so output is deterministic.
            user_orders = _group_orders(orders, refund_set)
            all_chars = _fetch_characters(cur, list(user_orders))
            print(f"{len(all_chars)} 个角色")
            account_chars = defaultdict(list)  # account_id -> [character_id, ...]
            for cid, aid, _nickname in all_chars:
                account_chars[aid].append(cid)

            print("\n[4/5] 获取课时完成记录...")
            char_ids = list({row[0] for row in all_chars})
            lesson_count, first_done, last_done = _fetch_lesson_stats(conn, cur, char_ids)
            print(f" 有行课记录的角色: {len(lesson_count)}")
    finally:
        conn.close()

    print("\n[5/5] 构建分析数据...")
    rows = _build_user_rows(user_orders, account_chars, lesson_count, first_done, last_done)
    if not rows:
        # An empty frame has no columns and would break every step below.
        print("没有符合条件的付费用户,未生成报表")
        return

    df = pd.DataFrame(rows)
    df["课时桶"] = df["总完成课时数"].apply(_lesson_bucket)
    bucket_stats = _bucket_stats(df)
    refund_users = df[df["是否退过款"] == _FLAG_YES]
    _print_report(df, bucket_stats, refund_users)

    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "行课进度与退款率分析.xlsx")
    _export_excel(df, bucket_stats, refund_users, output_path)
    print(f"\n✅ 报表已生成: {output_path}")
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()