ai_member_xiaoxi/scripts/course_progress_refund_v2.py
2026-06-02 08:00:01 +08:00

353 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
行课进度与退款率关系分析(按购买等级匹配行课)
规则:
- 用户购买 L1+L2 联报课包(goods_id=61) → 只看 L1 行课
- 用户购买 L2 课包(goods_id IN 31,32,33,54) → 只看 L2 行课
- 用户购买 L1 课包(goods_id IN 57,60,63) → 只看 L1 行课
- 同时买了不同类型的 → 分别归入对应分类(一个用户可出现在多个分类)
行课完成:通过 bi_level_unit_lesson 获取 L1/L2 各自的 chapter_id 列表
"""
import os
import sys
import psycopg2
import pandas as pd
from collections import defaultdict
from datetime import datetime
# ── Database connection settings ──
# The password is not stored here; get_password() resolves it at runtime.
DB_HOST = "bj-postgres-16pob4sg.sql.tencentcdb.com"
DB_PORT = 28591
DB_USER = "ai_member"
DB_NAME = "vala_bi"
def get_password():
    """Return the database password.

    Lookup order:
    1. the ``PG_ONLINE_PASSWORD`` environment variable (ignored when empty);
    2. a ``PG_ONLINE_PASSWORD=...`` line in ``../secrets.env`` relative to
       this script, with surrounding single/double quotes removed.

    Raises:
        RuntimeError: if neither source yields a non-empty value.
    """
    key = "PG_ONLINE_PASSWORD"

    env_value = os.environ.get(key, "")
    if env_value:
        return env_value

    script_dir = os.path.dirname(os.path.abspath(__file__))
    secrets_path = os.path.join(script_dir, "..", "secrets.env")
    if os.path.exists(secrets_path):
        # Explicit encoding so the result does not depend on the host locale.
        with open(secrets_path, encoding="utf-8") as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line.startswith(key + "="):
                    continue
                value = line.split("=", 1)[1].strip().strip("'\"")
                # An empty entry is treated as missing, matching how an
                # empty environment variable is handled above.
                if value:
                    return value

    raise RuntimeError("PG_ONLINE_PASSWORD not found")
def get_conn():
    """Open and return a new psycopg2 connection to the BI database.

    Uses the module-level DB_* settings and the password from
    get_password(); the connect timeout is 60 seconds. The caller is
    responsible for closing the connection.
    """
    return psycopg2.connect(
        host=DB_HOST,
        port=DB_PORT,
        user=DB_USER,
        password=get_password(),
        dbname=DB_NAME,
        connect_timeout=60,
    )
# ── Goods classification ──
L1_GOODS = {57, 60, 63}
L2_GOODS = {31, 32, 33, 54}
L1L2_GOODS = {61}  # L1+L2 bundle: progress is measured on L1 lessons


def classify_user(goods_ids):
    """Map the goods a user bought to ``[(category label, lesson level)]``.

    Args:
        goods_ids: iterable of purchased goods ids (duplicates allowed).

    Returns:
        A list of ``(label, level)`` tuples, possibly empty, in this order:
        - bundle bought            -> ("L1+L2联报", "L1")
        - L1 bought, no bundle     -> ("仅L1", "L1")
        - L2 bought, no bundle     -> ("仅L2", "L2")
        - bundle and L2 bought     -> ("联报+仅L2", "L2")
        A user may therefore appear in more than one category. Goods ids
        outside the three known sets contribute nothing.
    """
    goods_set = set(goods_ids)
    has_bundle = bool(goods_set & L1L2_GOODS)
    has_l1 = bool(goods_set & L1_GOODS)
    has_l2 = bool(goods_set & L2_GOODS)

    result = []
    if has_bundle:
        result.append(("L1+L2联报", "L1"))
    if has_l1 and not has_bundle:
        # Standalone L1 purchase without the bundle.
        result.append(("仅L1", "L1"))
    if has_l2 and not has_bundle:
        # Standalone L2 purchase without the bundle.
        result.append(("仅L2", "L2"))
    if has_bundle and has_l2:
        # Bundle plus a separate L2 purchase: the bundle is tracked on L1
        # (above) and the extra L2 purchase is tracked on L2.
        result.append(("联报+仅L2", "L2"))
    return result
# Ordered (inclusive upper bound, label) pairs. Counts above the last bound
# fall into _LESSON_BUCKET_OVERFLOW. Labels are defined once here so that
# lesson_bucket() and BUCKET_ORDER cannot drift apart.
_LESSON_BUCKETS = [
    (0, "0课时"),
    (3, "1-3课时"),
    (7, "4-7课时"),
    (15, "8-15课时"),
    (30, "16-30课时"),
    (60, "31-60课时"),
]
_LESSON_BUCKET_OVERFLOW = "60课时以上"


def lesson_bucket(n):
    """Return the bucket label for a completed-lesson count.

    Args:
        n: number of completed lessons.

    Returns:
        One of the labels in BUCKET_ORDER. Values of zero or below map to
        "0课时" (a negative count previously landed in "1-3课时"); values
        above 60 map to "60课时以上".
    """
    for upper_bound, label in _LESSON_BUCKETS:
        if n <= upper_bound:
            return label
    return _LESSON_BUCKET_OVERFLOW


# Display order of the buckets, from fewest to most lessons.
BUCKET_ORDER = [label for _, label in _LESSON_BUCKETS] + [_LESSON_BUCKET_OVERFLOW]
# Values stored in the per-user refund flag columns. They must differ:
# with identical values every "== flag" comparison below is true and the
# refund rate is reported as 100% for every bucket.
_FLAG_YES = "是"
_FLAG_NO = "否"

# Purchase categories that get a bucket table on the console and an Excel sheet.
# NOTE(review): classify_user can also emit "联报+仅L2"; those rows only reach
# the detail sheet. Confirm that this is intended.
_REPORT_CATEGORIES = ["仅L1", "仅L2", "L1+L2联报"]


def _in_batches(items, size):
    """Yield consecutive slices of *items* holding at most *size* elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def _placeholders(count):
    """Return *count* comma-separated ``%s`` placeholders for an IN (...) list."""
    return ",".join(["%s"] * count)


def _fetch_level_chapters(cur):
    """Return ``{"L1": {chapter_id, ...}, "L2": {...}}`` from bi_level_unit_lesson."""
    cur.execute("""
        SELECT id AS chapter_id, course_level
        FROM bi_level_unit_lesson
        WHERE course_level IN ('L1', 'L2')
    """)
    level_chapters = defaultdict(set)
    for ch_id, lv in cur.fetchall():
        level_chapters[lv].add(ch_id)
    return level_chapters


def _fetch_paid_orders(cur):
    """Return paid order rows (order_status 3 or 4) for accounts with status = 1."""
    cur.execute("""
        SELECT o.id AS order_id, o.account_id, o.trade_no, o.pay_success_date,
               o.pay_amount_int, o.order_status, o.goods_id, o.goods_name, o.key_from
        FROM bi_vala_order o
        INNER JOIN bi_vala_app_account a ON o.account_id = a.id AND a.status = 1
        WHERE o.deleted_at IS NULL
          AND o.pay_success_date IS NOT NULL
          AND o.order_status IN (3, 4)
        ORDER BY o.account_id, o.pay_success_date
    """)
    return cur.fetchall()


def _fetch_refunded_trade_nos(cur, trade_nos, batch_size=500):
    """Return the subset of *trade_nos* that has a refund row with status = 3."""
    refunded = set()
    for batch in _in_batches(trade_nos, batch_size):
        cur.execute(f"""
            SELECT DISTINCT trade_no
            FROM bi_refund_order
            WHERE trade_no IN ({_placeholders(len(batch))}) AND status = 3 AND deleted_at IS NULL
        """, batch)
        refunded.update(tn for (tn,) in cur.fetchall())
    return refunded


def _group_orders_by_account(orders, refund_set):
    """Group order rows by account id and attach the per-order refund flag."""
    user_orders = defaultdict(list)
    for _, aid, tn, pay_date, amt, status, gid, goods_name, key_from in orders:
        user_orders[aid].append({
            "trade_no": tn,
            "pay_date": pay_date,
            # presumably an integer amount in cents (divided by 100 here);
            # a NULL amount is counted as 0 instead of raising TypeError.
            "amount": (amt or 0) / 100.0,
            "order_status": status,
            "goods_id": gid,
            "goods_name": goods_name,
            "key_from": key_from,
            "is_refunded": tn in refund_set,
        })
    return user_orders


def _fetch_characters(cur, account_ids, batch_size=1000):
    """Return ``(character_id, account_id)`` rows with a non-empty nickname."""
    all_chars = []
    for batch in _in_batches(account_ids, batch_size):
        cur.execute(f"""
            SELECT id AS character_id, account_id
            FROM bi_vala_app_character
            WHERE account_id IN ({_placeholders(len(batch))})
              AND nickname IS NOT NULL AND nickname != ''
              AND deleted_at IS NULL
        """, batch)
        all_chars.extend(cur.fetchall())
    return all_chars


def _count_completed_lessons(conn, cur, char_ids, level_chapters,
                             shard_count=8, batch_size=2000):
    """Count play records with play_status = 1 per character, split by level.

    Scans bi_user_chapter_play_record_0 .. _{shard_count-1}. Every matching
    row adds one to the character's L1 or L2 counter depending on which
    chapter set the row's chapter_id belongs to.
    NOTE(review): rows are not de-duplicated per chapter; if the tables can
    hold several completed rows for one chapter the count is inflated.

    Returns:
        ``(l1_counts, l2_counts)``: dicts keyed by character id; characters
        without matching rows are absent.
    """
    l1_chapters = level_chapters["L1"]
    l2_chapters = level_chapters["L2"]
    char_lesson_l1 = defaultdict(int)
    char_lesson_l2 = defaultdict(int)

    for tbl_idx in range(shard_count):
        table = f"bi_user_chapter_play_record_{tbl_idx}"
        for batch in _in_batches(char_ids, batch_size):
            try:
                cur.execute(f"""
                    SELECT user_id, chapter_id
                    FROM {table}
                    WHERE user_id IN ({_placeholders(len(batch))})
                      AND play_status = 1
                      AND deleted_at IS NULL
                """, batch)
                for user_id, ch_id in cur.fetchall():
                    if ch_id in l1_chapters:
                        char_lesson_l1[user_id] += 1
                    if ch_id in l2_chapters:
                        char_lesson_l2[user_id] += 1
            except Exception as e:
                # Best effort: warn and carry on. The rollback is required
                # because psycopg2 leaves the transaction aborted after a
                # failed statement, so every later query would fail too.
                conn.rollback()
                print(f" 警告: {table} 查询失败: {e}")
    return char_lesson_l1, char_lesson_l2


def _build_rows(user_orders, user_classifications, account_chars,
                char_lesson_l1, char_lesson_l2):
    """Build one detail row per (account, purchase category).

    Lesson counts are summed over all characters of the account for the
    level attached to the category.
    NOTE(review): order count, GMV and refund figures are account-wide
    totals over every fetched order and are repeated on each category row
    of the same account; confirm this is the intended attribution.
    """
    rows = []
    for aid, my_orders in user_orders.items():
        classifications = user_classifications.get(aid, [])
        if not my_orders or not classifications:
            continue

        my_chars = account_chars.get(aid, [])
        total_l1 = sum(char_lesson_l1.get(cid, 0) for cid in my_chars)
        total_l2 = sum(char_lesson_l2.get(cid, 0) for cid in my_chars)

        total_orders = len(my_orders)
        refunded_orders = sum(1 for o in my_orders if o["is_refunded"])
        total_gmv = sum(o["amount"] for o in my_orders)
        total_refund = sum(o["amount"] for o in my_orders if o["is_refunded"])
        all_refunded = refunded_orders == total_orders and total_orders > 0
        has_any_refund = refunded_orders > 0

        for cat_label, watch_level in classifications:
            rows.append({
                "用户ID": aid,
                "购买分类": cat_label,
                "行课等级": watch_level,
                "完成课时数": total_l1 if watch_level == "L1" else total_l2,
                "订单数": total_orders,
                "退款订单数": refunded_orders,
                "GMV": round(total_gmv, 2),
                "GSV": round(total_gmv - total_refund, 2),
                "退款金额": round(total_refund, 2),
                "是否退过款": _FLAG_YES if has_any_refund else _FLAG_NO,
                "是否全部退款": _FLAG_YES if all_refunded else _FLAG_NO,
            })
    return rows


def _bucket_stats(df_cat):
    """Aggregate one category's rows per lesson bucket, in BUCKET_ORDER.

    Buckets without users are dropped from the result.
    """
    stats = df_cat.groupby("课时桶").agg(
        用户数=("用户ID", "count"),
        退款用户数=("是否退过款", lambda x: (x == _FLAG_YES).sum()),
        全部退款用户数=("是否全部退款", lambda x: (x == _FLAG_YES).sum()),
        平均完成课时=("完成课时数", "mean"),
        GMV总额=("GMV", "sum"),
        GSV总额=("GSV", "sum"),
    ).reindex(BUCKET_ORDER).fillna(0)
    stats["退款率"] = (stats["退款用户数"] / stats["用户数"] * 100).round(1)
    stats["全部退款率"] = (stats["全部退款用户数"] / stats["用户数"] * 100).round(1)
    stats["平均完成课时"] = stats["平均完成课时"].round(1)
    stats["GMV总额"] = stats["GMV总额"].round(2)
    stats["GSV总额"] = stats["GSV总额"].round(2)
    # Empty buckets were only added by reindex(); remove them again.
    return stats[stats["用户数"] > 0]


def _print_report(df):
    """Print the per-category bucket tables and refund summaries."""
    for cat_label in _REPORT_CATEGORIES:
        df_cat = df[df["购买分类"] == cat_label]
        if len(df_cat) == 0:
            print(f"\n【{cat_label}】无数据")
            continue
        print(f"\n{'='*50}")
        print(f"【{cat_label}】({len(df_cat)} 用户)")
        print(f"{'='*50}")
        print(_bucket_stats(df_cat).to_string())

        refund_users = df_cat[df_cat["是否退过款"] == _FLAG_YES]
        no_refund_users = df_cat[df_cat["是否退过款"] == _FLAG_NO]
        print(f"\n 退款用户: {len(refund_users)} 人, 平均完成课时: {refund_users['完成课时数'].mean():.1f}")
        print(f" 未退款用户: {len(no_refund_users)} 人, 平均完成课时: {no_refund_users['完成课时数'].mean():.1f}")
        print(f" 整体退费率: {len(refund_users)/len(df_cat)*100:.1f}%")


def _write_excel(df, output_path):
    """Write the detail sheet plus one bucket sheet per non-empty category."""
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        df_out = df.drop(columns=["课时桶"], errors="ignore")
        df_out.to_excel(writer, sheet_name="用户明细", index=False)
        for cat_label in _REPORT_CATEGORIES:
            df_cat = df[df["购买分类"] == cat_label]
            if len(df_cat) == 0:
                continue
            # Excel limits sheet names to 31 characters.
            _bucket_stats(df_cat).to_excel(writer, sheet_name=cat_label[:31])


def main():
    """Run the lesson-progress vs. refund-rate analysis end to end.

    Fetches paid orders, refunds, characters and completed-lesson records,
    classifies each account by the goods it bought, prints per-category
    bucket statistics and writes them to ``../output`` as an Excel workbook.
    The database connection is closed even when a step fails.
    """
    conn = get_conn()
    try:
        cur = conn.cursor()
        print("=" * 60)
        print("行课进度与退款率关系分析(按购买等级匹配行课)")
        print("=" * 60)

        # ── 0. Chapter ids per course level ──
        print("\n[0/6] 获取课程结构...")
        level_chapters = _fetch_level_chapters(cur)
        print(f" L1 chapter数: {len(level_chapters['L1'])}")
        print(f" L2 chapter数: {len(level_chapters['L2'])}")

        # ── 1. Paid orders ──
        print("\n[1/6] 获取付费订单...")
        orders = _fetch_paid_orders(cur)
        print(f"{len(orders)} 条订单")

        # ── 2. Refunds ──
        print("\n[2/6] 获取退款信息...")
        trade_nos = list({o[2] for o in orders if o[2]})
        refund_set = _fetch_refunded_trade_nos(cur, trade_nos)
        print(f" 退款 trade_no: {len(refund_set)}")

        # ── 3. Group orders per account and classify ──
        print("\n[3/6] 用户分类...")
        user_orders = _group_orders_by_account(orders, refund_set)
        user_classifications = {
            aid: classify_user([o["goods_id"] for o in my_orders])
            for aid, my_orders in user_orders.items()
        }
        cat_count = defaultdict(int)
        for cats in user_classifications.values():
            for cat, _ in cats:
                cat_count[cat] += 1
        print(" 用户分类统计:")
        for k, v in sorted(cat_count.items()):
            print(f" {k}: {v}")

        # ── 4. Characters per account ──
        print("\n[4/6] 获取角色...")
        all_chars = _fetch_characters(cur, list(user_orders))
        print(f"{len(all_chars)} 个角色")
        account_chars = defaultdict(list)
        all_char_ids = []
        for cid, aid in all_chars:
            account_chars[aid].append(cid)
            all_char_ids.append(cid)

        # ── 5. Completed lessons per character and level ──
        print("\n[5/6] 获取课时完成记录...")
        char_lesson_l1, char_lesson_l2 = _count_completed_lessons(
            conn, cur, all_char_ids, level_chapters
        )
        print(f" 有L1行课的角色: {len(char_lesson_l1)}")
        print(f" 有L2行课的角色: {len(char_lesson_l2)}")
    finally:
        conn.close()

    # ── 6. Build the detail rows ──
    print("\n[6/6] 构建分析...")
    rows = _build_rows(user_orders, user_classifications, account_chars,
                       char_lesson_l1, char_lesson_l2)
    if not rows:
        # Without rows the DataFrame has no columns and the steps below
        # would fail with KeyError.
        print("\n没有可分析的用户数据")
        return
    df = pd.DataFrame(rows)

    # ── Console report ──
    print("\n" + "=" * 60)
    print("分析结果")
    print("=" * 60)
    df["课时桶"] = df["完成课时数"].apply(lesson_bucket)
    _print_report(df)

    # ── Excel output ──
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "行课进度与退款率分析_按等级.xlsx")
    _write_excel(df, output_path)
    print(f"\n✅ 报表已生成: {output_path}")
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()