353 lines
13 KiB
Python
353 lines
13 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
行课进度与退款率关系分析(按购买等级匹配行课)
|
||
|
||
规则:
|
||
- 用户购买 L1+L2 联报课包(goods_id=61) → 只看 L1 行课
|
||
- 用户购买 L2 课包(goods_id IN 31,32,33,54) → 只看 L2 行课
|
||
- 用户购买 L1 课包(goods_id IN 57,60,63) → 只看 L1 行课
|
||
- 同时买了不同类型的 → 分别归入对应分类(一个用户可出现在多个分类)
|
||
|
||
行课完成:通过 bi_level_unit_lesson 获取 L1/L2 各自的 chapter_id 列表
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import psycopg2
|
||
import pandas as pd
|
||
from collections import defaultdict
|
||
from datetime import datetime
|
||
|
||
# ── Database connection settings ──
# These values are hard-coded here; only the password is looked up at
# runtime (see get_password).
DB_HOST = "bj-postgres-16pob4sg.sql.tencentcdb.com"
DB_PORT = 28591
DB_USER = "ai_member"
DB_NAME = "vala_bi"
|
||
|
||
def get_password():
    """Return the PostgreSQL password used by this script.

    Lookup order:
      1. the ``PG_ONLINE_PASSWORD`` environment variable, when non-empty;
      2. a ``PG_ONLINE_PASSWORD=...`` line in ``secrets.env`` located one
         directory above this file (surrounding quotes are stripped).

    Raises:
        RuntimeError: if neither source yields a non-empty password.
    """
    key = "PG_ONLINE_PASSWORD"

    pw = os.environ.get(key, "")
    if pw:
        return pw

    secrets_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "..", "secrets.env"
    )
    if os.path.exists(secrets_path):
        # Explicit encoding so the result does not depend on the platform locale.
        with open(secrets_path, encoding="utf-8") as f:
            for line in f:
                name, sep, value = line.strip().partition("=")
                if not sep or name != key:
                    continue
                value = value.strip().strip("'\"")
                # An empty entry is treated like a missing one, mirroring the
                # handling of an empty environment variable above.
                if value:
                    return value

    raise RuntimeError("PG_ONLINE_PASSWORD not found")
|
||
|
||
def get_conn():
    """Open and return a new psycopg2 connection.

    Connection parameters come from the module-level ``DB_HOST``,
    ``DB_PORT``, ``DB_USER`` and ``DB_NAME`` constants; the password is
    obtained from ``get_password()``. ``connect_timeout=60`` is passed
    through to ``psycopg2.connect``.
    """
    params = {
        "host": DB_HOST,
        "port": DB_PORT,
        "user": DB_USER,
        "password": get_password(),
        "dbname": DB_NAME,
        "connect_timeout": 60,
    }
    return psycopg2.connect(**params)
|
||
|
||
# ── Goods classification ──
L1_GOODS = {57, 60, 63}
L2_GOODS = {31, 32, 33, 54}
L1L2_GOODS = {61}  # combined L1+L2 bundle → judged by L1 lesson progress


def classify_user(goods_ids):
    """Map the goods a user bought to purchase categories.

    Args:
        goods_ids: iterable of goods_id values from the user's orders.

    Returns:
        A list of ``(category_label, lesson_level)`` tuples, where
        ``lesson_level`` is ``"L1"`` or ``"L2"``. The list is empty when
        none of the ids belongs to a known goods set.

        * bundle bought            → ``("L1+L2联报", "L1")``
        * bundle and L2 bought     → additionally ``("联报+仅L2", "L2")``
        * L1 bought, no bundle     → ``("仅L1", "L1")``
        * L2 bought, no bundle     → ``("仅L2", "L2")``

        A standalone L1 purchase next to a bundle adds no extra category.
    """
    purchased = set(goods_ids)
    has_bundle = not purchased.isdisjoint(L1L2_GOODS)
    has_l1 = not purchased.isdisjoint(L1_GOODS)
    has_l2 = not purchased.isdisjoint(L2_GOODS)

    result = []
    if has_bundle:
        # Bundle buyers are judged on L1 progress; an extra L2 purchase is
        # tracked under its own label against L2 progress.
        result.append(("L1+L2联报", "L1"))
        if has_l2:
            result.append(("联报+仅L2", "L2"))
        return result

    if has_l1:
        result.append(("仅L1", "L1"))
    if has_l2:
        result.append(("仅L2", "L2"))
    return result
|
||
|
||
# Inclusive upper bound of completed lessons for each bucket, in ascending
# order. Both lesson_bucket() and BUCKET_ORDER are derived from this table so
# the labels cannot drift apart.
_BUCKET_UPPER_BOUNDS = [
    (0, "0课时"),
    (3, "1-3课时"),
    (7, "4-7课时"),
    (15, "8-15课时"),
    (30, "16-30课时"),
    (60, "31-60课时"),
]
_BUCKET_OVERFLOW = "60课时以上"


def lesson_bucket(n):
    """Return the bucket label for ``n`` completed lessons.

    Args:
        n: number of completed lessons.

    Returns:
        One of the labels listed in ``BUCKET_ORDER``. Values at or below
        zero map to ``"0课时"``; values above 60 map to ``"60课时以上"``.
    """
    for upper, label in _BUCKET_UPPER_BOUNDS:
        if n <= upper:
            return label
    return _BUCKET_OVERFLOW


# Display order of the buckets, smallest first.
BUCKET_ORDER = [label for _, label in _BUCKET_UPPER_BOUNDS] + [_BUCKET_OVERFLOW]
|
||
|
||
# Purchase categories that get their own bucket table (console + Excel sheet).
# NOTE(review): classify_user can also return "联报+仅L2"; those rows appear in
# the detail sheet only — confirm this omission is intended.
_REPORT_CATEGORIES = ["仅L1", "仅L2", "L1+L2联报"]

# Column order of the per-user detail rows. Passed to the DataFrame
# constructor so an empty result still has all columns.
_DETAIL_COLUMNS = [
    "用户ID", "购买分类", "行课等级", "完成课时数", "订单数", "退款订单数",
    "GMV", "GSV", "退款金额", "是否退过款", "是否全部退款",
]

# Number of bi_user_chapter_play_record_<n> tables that are queried.
_PLAY_RECORD_SHARDS = 8


def _batches(items, size):
    """Yield consecutive slices of ``items`` holding at most ``size`` elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def _fetch_level_chapters(cur):
    """Return a mapping course_level -> set of chapter ids for L1 and L2."""
    cur.execute("""
        SELECT id AS chapter_id, course_level
        FROM bi_level_unit_lesson
        WHERE course_level IN ('L1', 'L2')
    """)
    level_chapters = defaultdict(set)
    for ch_id, lv in cur.fetchall():
        level_chapters[lv].add(ch_id)
    return level_chapters


def _fetch_paid_orders(cur):
    """Return paid order rows (order_status 3 or 4) of accounts with status = 1."""
    cur.execute("""
        SELECT o.id AS order_id, o.account_id, o.trade_no, o.pay_success_date,
               o.pay_amount_int, o.order_status, o.goods_id, o.goods_name, o.key_from
        FROM bi_vala_order o
        INNER JOIN bi_vala_app_account a ON o.account_id = a.id AND a.status = 1
        WHERE o.deleted_at IS NULL
          AND o.pay_success_date IS NOT NULL
          AND o.order_status IN (3, 4)
        ORDER BY o.account_id, o.pay_success_date
    """)
    return cur.fetchall()


def _fetch_refunded_trade_nos(cur, trade_nos):
    """Return the subset of ``trade_nos`` that has a refund row with status = 3."""
    refunded = set()
    for batch in _batches(trade_nos, 500):
        ph = ",".join(["%s"] * len(batch))
        cur.execute(f"""
            SELECT DISTINCT trade_no
            FROM bi_refund_order
            WHERE trade_no IN ({ph}) AND status = 3 AND deleted_at IS NULL
        """, batch)
        refunded.update(tn for (tn,) in cur.fetchall())
    return refunded


def _group_orders_by_user(orders, refund_set):
    """Group order rows by account id and flag each order as refunded or not."""
    user_orders = defaultdict(list)
    for _order_id, aid, tn, pay_date, amt, status, gid, gname, key_from in orders:
        user_orders[aid].append({
            "trade_no": tn,
            "pay_date": pay_date,
            "amount": amt / 100.0,  # pay_amount_int is divided by 100 for the report
            "order_status": status,
            "goods_id": gid,
            "goods_name": gname,
            "key_from": key_from,
            "is_refunded": tn in refund_set,
        })
    return user_orders


def _fetch_characters(cur, account_ids):
    """Return (character_id, account_id) rows for characters with a nickname."""
    chars = []
    for batch in _batches(account_ids, 1000):
        ph = ",".join(["%s"] * len(batch))
        cur.execute(f"""
            SELECT id AS character_id, account_id
            FROM bi_vala_app_character
            WHERE account_id IN ({ph})
              AND nickname IS NOT NULL AND nickname != ''
              AND deleted_at IS NULL
        """, batch)
        chars.extend(cur.fetchall())
    return chars


def _fetch_lesson_counts(conn, cur, char_ids, level_chapters):
    """Count play records with play_status = 1 per character, split by level.

    Returns ``(l1_counts, l2_counts)``, each mapping character id -> count.
    A failing shard query is reported and skipped (best effort).
    """
    l1_chapters = level_chapters["L1"]
    l2_chapters = level_chapters["L2"]
    l1_counts = defaultdict(int)
    l2_counts = defaultdict(int)

    for tbl_idx in range(_PLAY_RECORD_SHARDS):
        table = f"bi_user_chapter_play_record_{tbl_idx}"
        for batch in _batches(char_ids, 2000):
            ph = ",".join(["%s"] * len(batch))
            try:
                cur.execute(f"""
                    SELECT user_id, chapter_id
                    FROM {table}
                    WHERE user_id IN ({ph})
                      AND play_status = 1
                      AND deleted_at IS NULL
                """, batch)
                records = cur.fetchall()
            except psycopg2.Error as e:
                # A failed statement aborts the current transaction; without a
                # rollback every later query on this connection fails as well.
                conn.rollback()
                print(f" 警告: {table} 查询失败: {e}")
                continue

            # NOTE(review): every matching record is counted; if a chapter can
            # have several play_status = 1 rows this over-counts — confirm.
            for user_id, ch_id in records:
                if ch_id in l1_chapters:
                    l1_counts[user_id] += 1
                if ch_id in l2_chapters:
                    l2_counts[user_id] += 1

    return l1_counts, l2_counts


def _build_rows(user_orders, user_classifications, account_chars, l1_counts, l2_counts):
    """Build one detail row per (account, purchase category)."""
    rows = []
    for aid, my_orders in user_orders.items():
        classifications = user_classifications.get(aid, [])
        if not my_orders or not classifications:
            continue

        my_chars = account_chars.get(aid, [])
        total_l1 = sum(l1_counts.get(cid, 0) for cid in my_chars)
        total_l2 = sum(l2_counts.get(cid, 0) for cid in my_chars)

        # Order statistics cover ALL orders of the user and are repeated on
        # every category row of that user.
        total_orders = len(my_orders)
        refunded_orders = sum(1 for o in my_orders if o["is_refunded"])
        total_gmv = sum(o["amount"] for o in my_orders)
        total_refund = sum(o["amount"] for o in my_orders if o["is_refunded"])
        all_refunded = refunded_orders == total_orders and total_orders > 0
        has_any_refund = refunded_orders > 0

        for cat_label, watch_level in classifications:
            rows.append({
                "用户ID": aid,
                "购买分类": cat_label,
                "行课等级": watch_level,
                "完成课时数": total_l1 if watch_level == "L1" else total_l2,
                "订单数": total_orders,
                "退款订单数": refunded_orders,
                "GMV": round(total_gmv, 2),
                "GSV": round(total_gmv - total_refund, 2),
                "退款金额": round(total_refund, 2),
                "是否退过款": "是" if has_any_refund else "否",
                "是否全部退款": "是" if all_refunded else "否",
            })
    return rows


def _bucket_stats(df_cat):
    """Aggregate one category's rows per lesson bucket, dropping empty buckets."""
    stats = df_cat.groupby("课时桶").agg(
        用户数=("用户ID", "count"),
        退款用户数=("是否退过款", lambda x: (x == "是").sum()),
        全部退款用户数=("是否全部退款", lambda x: (x == "是").sum()),
        平均完成课时=("完成课时数", "mean"),
        GMV总额=("GMV", "sum"),
        GSV总额=("GSV", "sum"),
    ).reindex(BUCKET_ORDER).fillna(0)

    stats["退款率"] = (stats["退款用户数"] / stats["用户数"] * 100).round(1)
    stats["全部退款率"] = (stats["全部退款用户数"] / stats["用户数"] * 100).round(1)
    stats["平均完成课时"] = stats["平均完成课时"].round(1)
    stats["GMV总额"] = stats["GMV总额"].round(2)
    stats["GSV总额"] = stats["GSV总额"].round(2)

    # Buckets without users only exist because of the reindex above.
    return stats[stats["用户数"] > 0]


def _print_category_report(cat_label, df_cat):
    """Print the bucket table and refund summary for one purchase category."""
    if len(df_cat) == 0:
        print(f"\n【{cat_label}】无数据")
        return

    print(f"\n{'='*50}")
    print(f"【{cat_label}】({len(df_cat)} 用户)")
    print(f"{'='*50}")
    print(_bucket_stats(df_cat).to_string())

    refund_users = df_cat[df_cat["是否退过款"] == "是"]
    no_refund_users = df_cat[df_cat["是否退过款"] == "否"]
    print(f"\n 退款用户: {len(refund_users)} 人, 平均完成课时: {refund_users['完成课时数'].mean():.1f}")
    print(f" 未退款用户: {len(no_refund_users)} 人, 平均完成课时: {no_refund_users['完成课时数'].mean():.1f}")
    print(f" 整体退费率: {len(refund_users)/len(df_cat)*100:.1f}%")


def _write_excel(df, output_path):
    """Write the detail sheet plus one bucket sheet per non-empty category."""
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        df_out = df.drop(columns=["课时桶"], errors="ignore")
        df_out.to_excel(writer, sheet_name="用户明细", index=False)

        for cat_label in _REPORT_CATEGORIES:
            df_cat = df[df["购买分类"] == cat_label]
            if len(df_cat) == 0:
                continue
            sheet_name = cat_label[:31]  # Excel sheet name max 31 chars
            _bucket_stats(df_cat).to_excel(writer, sheet_name=sheet_name)


def main():
    """Run the lesson-progress vs. refund-rate analysis end to end.

    Loads course structure, paid orders, refunds, characters and play
    records from the database, builds one row per (user, purchase
    category), prints a per-category bucket report and writes the result to
    ``../output/行课进度与退款率分析_按等级.xlsx`` relative to this file.
    """
    conn = get_conn()
    try:
        cur = conn.cursor()

        print("=" * 60)
        print("行课进度与退款率关系分析(按购买等级匹配行课)")
        print("=" * 60)

        # ── 0. Chapter ids per course level ──
        print("\n[0/6] 获取课程结构...")
        level_chapters = _fetch_level_chapters(cur)
        print(f" L1 chapter数: {len(level_chapters['L1'])}")
        print(f" L2 chapter数: {len(level_chapters['L2'])}")

        # ── 1. Paid orders ──
        print("\n[1/6] 获取付费订单...")
        orders = _fetch_paid_orders(cur)
        print(f" 共 {len(orders)} 条订单")

        # ── 2. Refunds ──
        print("\n[2/6] 获取退款信息...")
        trade_nos = list({o[2] for o in orders if o[2]})
        refund_set = _fetch_refunded_trade_nos(cur, trade_nos)
        print(f" 退款 trade_no: {len(refund_set)}")

        # ── 3. Group orders per user and classify ──
        print("\n[3/6] 用户分类...")
        user_orders = _group_orders_by_user(orders, refund_set)
        user_classifications = {
            aid: classify_user([o["goods_id"] for o in my_orders])
            for aid, my_orders in user_orders.items()
        }
        cat_count = defaultdict(int)
        for cats in user_classifications.values():
            for cat, _level in cats:
                cat_count[cat] += 1
        print(" 用户分类统计:")
        for k, v in sorted(cat_count.items()):
            print(f" {k}: {v} 人")

        # ── 4. Characters ──
        print("\n[4/6] 获取角色...")
        all_chars = _fetch_characters(cur, list(user_orders.keys()))
        print(f" 共 {len(all_chars)} 个角色")
        account_chars = defaultdict(list)
        all_char_ids = []
        for cid, aid in all_chars:
            account_chars[aid].append(cid)
            all_char_ids.append(cid)

        # ── 5. Completed lessons per character and level ──
        print("\n[5/6] 获取课时完成记录...")
        char_lesson_l1, char_lesson_l2 = _fetch_lesson_counts(
            conn, cur, all_char_ids, level_chapters
        )
        print(f" 有L1行课的角色: {len(char_lesson_l1)}")
        print(f" 有L2行课的角色: {len(char_lesson_l2)}")

        # ── 6. Detail rows ──
        print("\n[6/6] 构建分析...")
        rows = _build_rows(
            user_orders, user_classifications, account_chars,
            char_lesson_l1, char_lesson_l2,
        )
    finally:
        # Release the connection even when a query above raises.
        conn.close()

    # Explicit columns keep the frame usable when no rows were produced.
    df = pd.DataFrame(rows, columns=_DETAIL_COLUMNS)

    # ── Console report ──
    print("\n" + "=" * 60)
    print("分析结果")
    print("=" * 60)

    df["课时桶"] = df["完成课时数"].apply(lesson_bucket)

    for cat_label in _REPORT_CATEGORIES:
        _print_category_report(cat_label, df[df["购买分类"] == cat_label])

    # ── Excel output ──
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "行课进度与退款率分析_按等级.xlsx")
    _write_excel(df, output_path)

    print(f"\n✅ 报表已生成: {output_path}")
|
||
|
||
|
||
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|