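#!/usr/bin/env python3
"""
Analysis of the relationship between lesson progress and refund rate.

Analysis dimensions:
1. Bucket users by number of completed lessons and compute the refund rate
   of each bucket.
2. Distribution of completed lessons among users who refunded (reported from
   each user's total completed-lesson count).
3. Lesson progress vs. refund rate per course level (L1/L2).
   NOTE: a per-level breakdown is not produced by main() in this file.
"""
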
import os
import sys
from collections import defaultdict
from contextlib import closing

import pandas as pd
import psycopg2

# ── Database connection settings ──
# The password is intentionally not stored here; see get_password().
DB_HOST = "bj-postgres-16pob4sg.sql.tencentcdb.com"
DB_PORT = 28591
DB_USER = "ai_member"
DB_NAME = "vala_bi"


def get_password():
    """Return the database password.

    Lookup order:

    1. The ``PG_ONLINE_PASSWORD`` environment variable, when non-empty.
    2. A ``PG_ONLINE_PASSWORD=<value>`` line in ``secrets.env`` located one
       directory above this file. Whitespace around the key and value and a
       pair of surrounding quotes are stripped.

    Raises:
        RuntimeError: if neither source provides a non-empty password.
    """
    pw = os.environ.get("PG_ONLINE_PASSWORD", "")
    if pw:
        return pw

    secrets_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "..", "secrets.env"
    )
    if os.path.exists(secrets_path):
        # Explicit encoding so the result does not depend on the platform locale.
        with open(secrets_path, encoding="utf-8") as f:
            for line in f:
                key, sep, value = line.strip().partition("=")
                if not sep or key.strip() != "PG_ONLINE_PASSWORD":
                    continue
                value = value.strip().strip("'\"")
                # An empty entry is treated like a missing one, mirroring how
                # an empty environment variable is handled above.
                if value:
                    return value

    raise RuntimeError("PG_ONLINE_PASSWORD not found")


def get_conn():
    """Open and return a new psycopg2 connection to the BI database.

    Uses the module-level ``DB_*`` settings and the password returned by
    ``get_password()``; the connection attempt times out after 60 seconds.
    The caller is responsible for closing the returned connection.
    """
    return psycopg2.connect(
        host=DB_HOST,
        port=DB_PORT,
        user=DB_USER,
        password=get_password(),
        dbname=DB_NAME,
        connect_timeout=60,
    )


# Lesson-count bucket labels in display order (produced by _lesson_bucket).
_BUCKET_ORDER = [
    "0课时", "1-3课时", "4-7课时", "8-15课时", "16-30课时", "31-60课时", "60课时以上",
]

# Number of play-record tables queried: bi_user_chapter_play_record_0 .. _7.
_PLAY_RECORD_SHARDS = 8


def _batched(items, size):
    """Yield consecutive slices of *items* holding at most *size* elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def _placeholders(count):
    """Return *count* comma-separated ``%s`` placeholders for an ``IN (...)`` list."""
    return ",".join(["%s"] * count)


def _lesson_bucket(n):
    """Map a completed-lesson count to its bucket label."""
    if n == 0:
        return "0课时"
    for upper, label in (
        (3, "1-3课时"),
        (7, "4-7课时"),
        (15, "8-15课时"),
        (30, "16-30课时"),
        (60, "31-60课时"),
    ):
        if n <= upper:
            return label
    return "60课时以上"


def _fetch_orders(cur):
    """Return paid orders (order_status 3 or 4) of accounts with status = 1.

    Each row is (account_id, trade_no, pay_success_date, pay_amount_int,
    order_status, goods_name, key_from), ordered by account and pay date.
    The account status filter is meant to exclude test accounts.
    """
    cur.execute("""
        SELECT o.account_id, o.trade_no, o.pay_success_date, o.pay_amount_int, o.order_status,
               o.goods_name, o.key_from
        FROM bi_vala_order o
        INNER JOIN bi_vala_app_account a ON o.account_id = a.id AND a.status = 1
        WHERE o.deleted_at IS NULL
          AND o.pay_success_date IS NOT NULL
          AND o.order_status IN (3, 4)
        ORDER BY o.account_id, o.pay_success_date
    """)
    return cur.fetchall()


def _fetch_refunded_trade_nos(cur, trade_nos):
    """Return the subset of *trade_nos* that has a refund record with status = 3."""
    refunded = set()
    for batch in _batched(trade_nos, 500):
        cur.execute(
            f"""
            SELECT DISTINCT trade_no
            FROM bi_refund_order
            WHERE trade_no IN ({_placeholders(len(batch))})
              AND status = 3
              AND deleted_at IS NULL
            """,
            batch,
        )
        refunded.update(tn for (tn,) in cur.fetchall())
    return refunded


def _fetch_characters(cur, account_ids):
    """Return (character_id, account_id, nickname) rows for the given accounts.

    Only non-deleted characters with a non-empty nickname are returned.
    """
    characters = []
    for batch in _batched(account_ids, 1000):
        cur.execute(
            f"""
            SELECT id AS character_id, account_id, nickname
            FROM bi_vala_app_character
            WHERE account_id IN ({_placeholders(len(batch))})
              AND nickname IS NOT NULL AND nickname != ''
              AND deleted_at IS NULL
            """,
            batch,
        )
        characters.extend(cur.fetchall())
    return characters


def _fetch_lesson_progress(conn, cur, char_ids):
    """Collect per-character lesson completion data from all play-record shards.

    Returns a tuple ``(lesson_count, first_done, last_done)`` of dicts keyed by
    character id: number of distinct completed chapters (summed over shards),
    earliest completion time and latest completion time.

    A failing query is reported as a warning and skipped. The transaction is
    rolled back first: psycopg2 otherwise leaves the connection in an aborted
    transaction and every following query would fail as well.
    """
    lesson_count = defaultdict(int)
    first_done = {}
    last_done = {}

    for shard in range(_PLAY_RECORD_SHARDS):
        table = f"bi_user_chapter_play_record_{shard}"
        for batch in _batched(char_ids, 2000):
            try:
                cur.execute(
                    f"""
                    SELECT user_id, chapter_id, MIN(created_at), MAX(created_at), COUNT(*)
                    FROM {table}
                    WHERE user_id IN ({_placeholders(len(batch))})
                      AND play_status = 1
                      AND deleted_at IS NULL
                    GROUP BY user_id, chapter_id
                    """,
                    batch,
                )
                records = cur.fetchall()
            except psycopg2.Error as e:
                conn.rollback()
                print(f" 警告: {table} 查询失败: {e}")
                continue

            # One row per (user_id, chapter_id): each row is one completed lesson.
            for user_id, _chapter_id, first_at, last_at, _plays in records:
                lesson_count[user_id] += 1
                if first_at is not None and (
                    user_id not in first_done or first_at < first_done[user_id]
                ):
                    first_done[user_id] = first_at
                if last_at is not None and (
                    user_id not in last_done or last_at > last_done[user_id]
                ):
                    last_done[user_id] = last_at

    return lesson_count, first_done, last_done


def _build_user_rows(orders, refund_set, account_chars, lesson_count, first_done, last_done):
    """Aggregate orders and lesson progress into one report row per account.

    Rows follow the order in which accounts first appear in *orders*.
    """
    user_orders = defaultdict(list)
    for aid, trade_no, pay_date, amount_int, _status, _goods, _key_from in orders:
        user_orders[aid].append({
            "pay_date": pay_date,
            # presumably pay_amount_int is in 1/100 currency units — TODO confirm
            "amount": amount_int / 100.0,
            "is_refunded": trade_no in refund_set,
        })

    rows = []
    for aid, my_orders in user_orders.items():
        my_chars = account_chars.get(aid, [])

        # Lessons are summed over all characters of the account.
        total_lessons = sum(lesson_count.get(cid, 0) for cid in my_chars)
        first_lesson = min(
            (first_done[cid] for cid in my_chars if cid in first_done), default=None
        )
        last_lesson = max(
            (last_done[cid] for cid in my_chars if cid in last_done), default=None
        )

        refunded = [o for o in my_orders if o["is_refunded"]]
        total_gmv = sum(o["amount"] for o in my_orders)
        total_refund = sum(o["amount"] for o in refunded)
        first_pay = min(
            (o["pay_date"] for o in my_orders if o["pay_date"]), default=None
        )

        rows.append({
            "用户ID": aid,
            "角色数": len(my_chars),
            "总完成课时数": total_lessons,
            "首次行课时间": first_lesson,
            "最近行课时间": last_lesson,
            "首次购买时间": first_pay,
            "订单数": len(my_orders),
            "退款订单数": len(refunded),
            "GMV": round(total_gmv, 2),
            "GSV": round(total_gmv - total_refund, 2),
            "退款金额": round(total_refund, 2),
            "是否全部退款": "是" if len(refunded) == len(my_orders) else "否",
            "是否退过款": "是" if refunded else "否",
        })
    return rows


def _summarize_buckets(df):
    """Return per-bucket user counts, refund rates and GMV/GSV totals."""
    stats = df.groupby("课时桶").agg(
        用户数=("用户ID", "count"),
        退款用户数=("是否退过款", lambda x: (x == "是").sum()),
        全部退款用户数=("是否全部退款", lambda x: (x == "是").sum()),
        平均完成课时=("总完成课时数", "mean"),
        GMV总额=("GMV", "sum"),
        GSV总额=("GSV", "sum"),
    ).reindex(_BUCKET_ORDER).fillna(0)

    stats["退款率(用户)"] = (stats["退款用户数"] / stats["用户数"] * 100).round(1)
    stats["全部退款率(用户)"] = (stats["全部退款用户数"] / stats["用户数"] * 100).round(1)
    stats["平均完成课时"] = stats["平均完成课时"].round(1)
    stats["GMV总额"] = stats["GMV总额"].round(2)
    stats["GSV总额"] = stats["GSV总额"].round(2)
    return stats


def _print_report(df, bucket_stats, refund_users):
    """Print the four analysis sections to stdout. *df* must not be empty."""
    print("\n" + "=" * 60)
    print("分析结果")
    print("=" * 60)

    # Dimension 1: refund rate per completed-lesson bucket.
    print("\n【维度1】按完成课时数分桶的退款率")
    print(bucket_stats.to_string())

    # Dimension 2: completed lessons of users who refunded.
    done = refund_users["总完成课时数"]
    print("\n\n【维度2】退款用户的行课完成情况")
    print(f"退款用户总数: {len(refund_users)}")
    print(f" 其中0课时的: {(done == 0).sum()} 人")
    print(f" 其中1-3课时的: {((done >= 1) & (done <= 3)).sum()} 人")
    print(f" 其中4-7课时的: {((done >= 4) & (done <= 7)).sum()} 人")
    print(f" 其中8-15课时的: {((done >= 8) & (done <= 15)).sum()} 人")
    print(f" 其中16课时以上的: {(done >= 16).sum()} 人")
    print(f" 退款用户平均完成课时: {done.mean():.1f}")

    # Dimension 3: refunded vs. non-refunded users.
    no_refund = df[df["是否退过款"] == "否"]
    full_refund = df[df["是否全部退款"] == "是"]
    print("\n\n【维度3】退款 vs 未退款用户行课对比")
    print(f" 未退款用户数: {len(no_refund)}, 平均完成课时: {no_refund['总完成课时数'].mean():.1f}")
    print(f" 退款用户数: {len(refund_users)}, 平均完成课时: {done.mean():.1f}")
    print(f" 全部退款用户数: {len(full_refund)}, 平均完成课时: {full_refund['总完成课时数'].mean():.1f}")

    # Dimension 4: overall totals.
    total_users = len(df)
    total_refund_users = len(refund_users)
    print("\n\n【维度4】整体统计")
    print(f" 总付费用户: {total_users}")
    print(f" 总退款用户: {total_refund_users} ({total_refund_users/total_users*100:.1f}%)")
    print(f" 全部退款用户: {len(full_refund)}")
    print(f" 平均完成课时: {df['总完成课时数'].mean():.1f}")
    print(f" 总GMV: {df['GMV'].sum():.2f}")
    print(f" 总GSV: {df['GSV'].sum():.2f}")


def _write_excel(df, bucket_stats, refund_users):
    """Write the three report sheets to ../output and return the file path."""
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "行课进度与退款率分析.xlsx")

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        # Sheet 1: per-user detail. Time zone info is dropped before writing.
        df_out = df.drop(columns=["课时桶"], errors="ignore")
        for col in ["首次行课时间", "最近行课时间", "首次购买时间"]:
            if col in df_out.columns:
                df_out[col] = pd.to_datetime(df_out[col]).dt.tz_localize(None)
        df_out.to_excel(writer, sheet_name="用户明细", index=False)

        # Sheet 2: per-bucket statistics.
        bucket_stats.to_excel(writer, sheet_name="分桶统计")

        # Sheet 3: refunded users per bucket, with each bucket's share.
        refund_bucket = refund_users.groupby("课时桶").agg(
            用户数=("用户ID", "count"),
            平均完成课时=("总完成课时数", "mean"),
            GMV=("GMV", "sum"),
        ).reindex(_BUCKET_ORDER).fillna(0)
        refund_bucket["占比"] = (refund_bucket["用户数"] / len(refund_users) * 100).round(1)
        refund_bucket.to_excel(writer, sheet_name="退款用户分布")

    return output_path


def main():
    """Run the lesson-progress vs. refund-rate analysis end to end.

    Loads paid orders, refund records, characters and lesson completions from
    the database, aggregates them per account, prints the analysis to stdout
    and writes an Excel workbook to ``../output``.

    The connection and cursor are always closed, also when a query raises.
    When no paid order is found, a message is printed and no workbook is
    written.

    NOTE(review): an earlier "lessons completed before refund" estimate was
    computed but never reported, and it used the order's pay date in place of
    a refund date; it has been removed rather than published as-is.
    """
    # psycopg2's own connection context manager only ends the transaction;
    # closing() is what actually closes the connection.
    with closing(get_conn()) as conn, conn.cursor() as cur:
        print("=" * 60)
        print("行课进度与退款率关系分析")
        print("=" * 60)

        # 1. Paid orders.
        print("\n[1/5] 获取付费用户...")
        orders = _fetch_orders(cur)
        print(f" 共 {len(orders)} 条订单")

        # 2. Refunded trade numbers.
        print("\n[2/5] 获取退款信息...")
        trade_nos = list({o[1] for o in orders if o[1]})
        refund_set = _fetch_refunded_trade_nos(cur, trade_nos)
        print(f" 退款订单 trade_no 数: {len(refund_set)}")

        # 3. Characters of the paying accounts.
        print("\n[3/5] 获取用户角色...")
        account_ids = list({o[0] for o in orders})
        all_chars = _fetch_characters(cur, account_ids)
        print(f" 共 {len(all_chars)} 个角色")

        account_chars = defaultdict(list)  # account_id -> [character_id, ...]
        for cid, aid, _nick in all_chars:
            account_chars[aid].append(cid)

        # 4. Lesson completion records from every shard table.
        print("\n[4/5] 获取课时完成记录...")
        char_ids = list({c[0] for c in all_chars})
        lesson_count, first_done, last_done = _fetch_lesson_progress(conn, cur, char_ids)
        print(f" 有行课记录的角色: {len(lesson_count)}")

    # 5. Per-account aggregation (no database access needed from here on).
    print("\n[5/5] 构建分析数据...")
    rows = _build_user_rows(
        orders, refund_set, account_chars, lesson_count, first_done, last_done
    )
    if not rows:
        # An empty frame has no columns; the report below would fail on it.
        print("\n未找到付费订单,未生成报表")
        return

    df = pd.DataFrame(rows)
    df["课时桶"] = df["总完成课时数"].apply(_lesson_bucket)
    bucket_stats = _summarize_buckets(df)
    refund_users = df[df["是否退过款"] == "是"]

    # 6. Console report.
    _print_report(df, bucket_stats, refund_users)

    # 7. Excel export.
    output_path = _write_excel(df, bucket_stats, refund_users)
    print(f"\n✅ 报表已生成: {output_path}")


# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()