#!/usr/bin/env python3
|
||
"""
|
||
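Laolang (老狼) refund users — cross analysis of lesson progress × completion time × refund rate.

Rules:
  - Purchased the L1+L2 bundle → look at L1 lesson progress
  - Purchased an L2 course pack → look at L2 lesson progress

Added: completion time = SUM(bi_user_component_play_record.interval_time) / 60000, in minutes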
|
||
"""
|
||
|
||
import os, sys, re
|
||
import psycopg2
|
||
import pandas as pd
|
||
from collections import defaultdict
|
||
|
||
# Connection settings passed to psycopg2.connect(); the password is not stored
# here and is resolved at runtime by get_password().
DB_HOST: str = "bj-postgres-16pob4sg.sql.tencentcdb.com"
DB_PORT: int = 28591
DB_USER: str = "ai_member"
DB_NAME: str = "vala_bi"
|
||
|
||
def get_password():
    """Return the database password.

    Lookup order:
      1. the ``PG_ONLINE_PASSWORD`` environment variable, if non-empty;
      2. the first non-empty ``PG_ONLINE_PASSWORD=...`` line in
         ``../secrets.env`` relative to this file (surrounding single or
         double quotes are stripped).

    Raises:
        RuntimeError: if neither source provides a non-empty value.
    """
    pw = os.environ.get("PG_ONLINE_PASSWORD", "")
    if pw:
        return pw

    secrets_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "secrets.env")
    if os.path.exists(secrets_path):
        with open(secrets_path) as f:
            for line in f:
                if not line.startswith("PG_ONLINE_PASSWORD="):
                    continue
                value = line.strip().split("=", 1)[1].strip("'\"")
                # An empty assignment is treated as "not set", exactly like an
                # empty environment variable above, instead of being returned
                # as a blank password.
                if value:
                    return value

    raise RuntimeError("PG_ONLINE_PASSWORD not found")
|
||
|
||
def get_conn():
    """Open and return a psycopg2 connection to the ``DB_NAME`` database."""
    connect_kwargs = dict(
        host=DB_HOST,
        port=DB_PORT,
        user=DB_USER,
        password=get_password(),
        dbname=DB_NAME,
        connect_timeout=120,
    )
    return psycopg2.connect(**connect_kwargs)
|
||
|
||
def lesson_bucket(n):
    """Map a completed-lesson count to its display bucket label.

    Exactly 0 gets its own bucket; otherwise the first inclusive upper bound
    that ``n`` does not exceed decides the label, and anything above 60 falls
    into the open-ended last bucket.
    """
    if n == 0:
        return "0课时"

    upper_bounds = (
        (3, "1-3课时"),
        (7, "4-7课时"),
        (15, "8-15课时"),
        (30, "16-30课时"),
        (60, "31-60课时"),
    )
    for limit, label in upper_bounds:
        if n <= limit:
            return label
    return "60课时以上"
|
||
|
||
def time_bucket(minutes):
    """Map a total completion time in minutes to its display bucket label.

    Exactly 0 gets its own bucket; otherwise the label belongs to the first
    inclusive upper bound that ``minutes`` does not exceed, with everything
    above 300 in the open-ended last bucket.
    """
    if minutes == 0:
        return "0分钟"

    ceilings = [10, 30, 60, 120, 300]
    labels = ["1-10分钟", "11-30分钟", "31-60分钟", "61-120分钟", "121-300分钟"]
    return next(
        (label for ceiling, label in zip(ceilings, labels) if minutes <= ceiling),
        "300分钟以上",
    )
|
||
|
||
# Display order of the labels produced by lesson_bucket().
BUCKET_ORDER = [
    "0课时",
    "1-3课时",
    "4-7课时",
    "8-15课时",
    "16-30课时",
    "31-60课时",
    "60课时以上",
]

# Display order of the labels produced by time_bucket().
TIME_BUCKET_ORDER = [
    "0分钟",
    "1-10分钟",
    "11-30分钟",
    "31-60分钟",
    "61-120分钟",
    "121-300分钟",
    "300分钟以上",
]
|
||
|
||
# goods_id groups used to classify what a user bought.
L1_GOODS = {57, 60, 63}
L2_GOODS = {31, 32, 33, 54}
L1L2_GOODS = {61}


def classify_orders(goods_ids):
    """Classify a user's purchased goods into (category label, level to inspect) pairs.

    - Any L1+L2 bundle purchase yields the bundle category, inspected at L1.
    - An L2 purchase without a bundle yields the L2-only category, inspected at L2.
    - A bundle together with an L2 purchase yields the bundle category plus an
      extra combined category inspected at L2.
    - Anything else yields an empty list.
    """
    purchased = set(goods_ids)
    bought_bundle = not purchased.isdisjoint(L1L2_GOODS)
    bought_l2 = not purchased.isdisjoint(L2_GOODS)

    bundle_entry = ("L1+L2联报→看L1", "L1")
    if bought_bundle and bought_l2:
        return [bundle_entry, ("联报+仅L2→看L2", "L2")]
    if bought_bundle:
        return [bundle_entry]
    if bought_l2:
        return [("仅L2→看L2", "L2")]
    return []
|
||
|
||
# Column layout of the per-user detail table. Passing it to the DataFrame
# constructor keeps every column present even when no user qualifies.
_DETAIL_COLUMNS = [
    "用户ID", "注册时间", "购买分类", "行课等级", "角色数", "角色信息(名称+年龄)",
    "完成课时数", "总完课时长(分钟)", "平均每课时长(分钟)", "订单数", "退款订单数",
    "GMV", "GSV", "是否退款", "是否全部退款", "是否加微", "购买课包",
]

# Purchase categories that get a console report and their own Excel sheets.
_REPORT_CATEGORIES = ["L1+L2联报→看L1", "仅L2→看L2"]


def _age_on(birthday, today):
    """Return the age in full years on ``today``, or None if ``birthday`` is empty or unparseable."""
    if not birthday:
        return None
    try:
        bd = pd.Timestamp(str(birthday)[:10]).date()
    except (ValueError, TypeError, OverflowError):
        # Dirty birthday values simply leave the age unknown.
        return None
    return today.year - bd.year - ((today.month, today.day) < (bd.month, bd.day))


def _iter_shard_rows(conn, cur, table_prefix, sql_template, ids, shard_count=8, batch_size=2000):
    """Yield the rows of ``sql_template`` run over every shard table, batching ``ids``.

    ``sql_template`` must contain ``{table}`` and ``{ph}`` fields. A failing
    batch is reported and skipped.
    """
    for shard in range(shard_count):
        table = f"{table_prefix}_{shard}"
        for start in range(0, len(ids), batch_size):
            batch = ids[start:start + batch_size]
            placeholders = ",".join(["%s"] * len(batch))
            try:
                cur.execute(sql_template.format(table=table, ph=placeholders), batch)
                fetched = cur.fetchall()
            except psycopg2.Error as e:
                print(f" warn {table}: {e}")
                # A failed statement leaves the transaction aborted; without a
                # rollback every later query on this connection would fail too.
                conn.rollback()
                continue
            yield from fetched


def _bucket_stats(df_cat, bucket_col, bucket_order, with_median=False):
    """Aggregate one purchase category per bucket, ordered by ``bucket_order``.

    Returns user count, refunded-user count, mean lessons, mean (and
    optionally median) completion minutes and the refund rate in percent,
    keeping only buckets that contain at least one user.
    """
    spec = {
        "用户数": ("用户ID", "count"),
        "退款用户数": ("是否退款", lambda x: (x == "是").sum()),
        "平均课时": ("完成课时数", "mean"),
        "平均完课时长": ("总完课时长(分钟)", "mean"),
    }
    rounded = ["平均课时", "平均完课时长"]
    if with_median:
        spec["中位完课时长"] = ("总完课时长(分钟)", "median")
        rounded.append("中位完课时长")

    stats = df_cat.groupby(bucket_col).agg(**spec).reindex(bucket_order).fillna(0)
    stats["退款率%"] = (stats["退款用户数"] / stats["用户数"] * 100).round(1)
    for col in rounded:
        stats[col] = stats[col].round(1)
    return stats[stats["用户数"] > 0]


def main():
    """Run the refund × lesson-progress × completion-time analysis end to end.

    Reads the lead spreadsheet (first CLI argument, default
    ``output/销售线索_用户分析.xlsx``), loads orders, refunds, characters,
    completed chapters and component play durations from the BI database,
    prints bucketed refund statistics per purchase category and writes
    ``老狼退款_行课分析.xlsx`` into ``../output`` relative to this file.
    """
    from datetime import date

    input_file = sys.argv[1] if len(sys.argv) > 1 else "output/销售线索_用户分析.xlsx"
    df_in = pd.read_excel(input_file, dtype=str)
    user_ids = [int(x) for x in df_in["用户ID"].dropna().unique()]
    print(f"老狼线索用户: {len(user_ids)} 人")
    if not user_ids:
        # With no IDs the "IN (...)" lists below would be empty, which is
        # invalid SQL; there is nothing to analyse anyway.
        return

    ph = ",".join(["%s"] * len(user_ids))
    today = date.today()

    conn = get_conn()
    try:
        cur = conn.cursor()

        # ── 0. chapter ids belonging to the L1 / L2 courses ──
        cur.execute("SELECT id, course_level FROM bi_level_unit_lesson WHERE course_level IN ('L1','L2')")
        l1_chapters = set()
        l2_chapters = set()
        for ch_id, lv in cur.fetchall():
            (l1_chapters if lv == "L1" else l2_chapters).add(ch_id)
        print(f" L1: {len(l1_chapters)}, L2: {len(l2_chapters)}")

        # ── 1. orders ──
        cur.execute(f"""
            SELECT account_id, goods_id, goods_name, trade_no, pay_success_date,
                   pay_amount_int, order_status, key_from
            FROM bi_vala_order
            WHERE account_id IN ({ph}) AND deleted_at IS NULL
              AND pay_success_date IS NOT NULL AND order_status IN (3,4)
            ORDER BY account_id, pay_success_date
        """, user_ids)
        orders = cur.fetchall()
        print(f" 订单: {len(orders)}")

        # ── 2. refunds ──
        trade_nos = [o[3] for o in orders if o[3]]
        refund_set = set()
        for i in range(0, len(trade_nos), 500):
            batch = trade_nos[i:i + 500]
            p2 = ",".join(["%s"] * len(batch))
            cur.execute(f"SELECT DISTINCT trade_no FROM bi_refund_order WHERE trade_no IN ({p2}) AND status=3 AND deleted_at IS NULL", batch)
            refund_set.update(tn for (tn,) in cur.fetchall())

        # ── 3. characters (with birthday and age) ──
        cur.execute(f"""
            SELECT id, account_id, nickname, birthday FROM bi_vala_app_character
            WHERE account_id IN ({ph}) AND nickname IS NOT NULL AND nickname!='' AND deleted_at IS NULL
        """, user_ids)
        chars = cur.fetchall()
        char_ids = [c[0] for c in chars]
        account_chars = defaultdict(list)
        char_info = {}  # char_id -> (nickname, birthday, age)
        for cid, aid, nick, bday in chars:
            account_chars[aid].append(cid)
            char_info[cid] = (nick, str(bday)[:10] if bday else '', _age_on(bday, today))
        print(f" 角色: {len(chars)}")

        # ── 4. completed chapters + their chapter_unique_id ──
        # character_id -> {chapter_id -> chapter_unique_id}; only the first
        # record seen per (character, chapter) is kept.
        # NOTE(review): the query has no ORDER BY, so which record counts as
        # "first" depends on row/shard order — confirm this is acceptable.
        char_chapter_cuid = defaultdict(dict)
        char_lessons_l1 = defaultdict(int)
        char_lessons_l2 = defaultdict(int)
        all_cuids = set()

        chapter_sql = """
            SELECT user_id, chapter_id, chapter_unique_id
            FROM {table}
            WHERE user_id IN ({ph}) AND play_status=1 AND deleted_at IS NULL
        """
        for uid, ch_id, cuid in _iter_shard_rows(conn, cur, "bi_user_chapter_play_record", chapter_sql, char_ids):
            if ch_id in char_chapter_cuid[uid]:
                continue
            char_chapter_cuid[uid][ch_id] = cuid
            if ch_id in l1_chapters:
                char_lessons_l1[uid] += 1
            elif ch_id in l2_chapters:
                char_lessons_l2[uid] += 1
            all_cuids.add(cuid)

        print(f" chapter_unique_id 数: {len(all_cuids)}")

        # ── 5. completion time from the component play records ──
        print(f" 正在查询完课时长({len(all_cuids)} 个 cuid)...")
        cuid_duration = defaultdict(int)  # chapter_unique_id -> total milliseconds

        duration_sql = """
            SELECT chapter_unique_id, SUM(COALESCE(interval_time, 0))
            FROM {table}
            WHERE chapter_unique_id IN ({ph}) AND deleted_at IS NULL
            GROUP BY chapter_unique_id
        """
        for cuid, total_ms in _iter_shard_rows(conn, cur, "bi_user_component_play_record", duration_sql, list(all_cuids)):
            if total_ms:
                # SUM() can come back as Decimal (numeric), which cannot be
                # divided by a float later on; normalise to int milliseconds.
                cuid_duration[cuid] += int(total_ms)

        print(f" 有耗时记录的 cuid: {len(cuid_duration)}")

        # ── 6. account info ──
        cur.execute(f"SELECT id, created_at FROM bi_vala_app_account WHERE id IN ({ph}) AND status=1", user_ids)
        account_info = dict(cur.fetchall())
    finally:
        # Release the connection even when a query above fails.
        conn.close()

    # ── 6.5 WeChat-added check (vala_class.student_info), best effort ──
    wechat_bound = set()
    try:
        conn_class = psycopg2.connect(host=DB_HOST, port=DB_PORT, user=DB_USER, password=get_password(), dbname='vala_class', connect_timeout=30)
        try:
            cur2 = conn_class.cursor()
            cur2.execute(f"SELECT DISTINCT vala_account_id FROM student_info WHERE vala_account_id IN ({ph})", user_ids)
            wechat_bound = {r[0] for r in cur2.fetchall()}
            cur2.close()
        finally:
            conn_class.close()
        print(f" 加微判断: {len(wechat_bound)}/{len(user_ids)} 人已加微")
    except psycopg2.Error as e:
        print(f" 加微判断跳过: {e}")

    # ── 7. total duration per character and level ──
    char_duration_l1 = defaultdict(int)
    char_duration_l2 = defaultdict(int)
    for cid, chapter_map in char_chapter_cuid.items():
        for ch_id, cuid in chapter_map.items():
            dur = cuid_duration.get(cuid, 0)
            if ch_id in l1_chapters:
                char_duration_l1[cid] += dur
            elif ch_id in l2_chapters:
                char_duration_l2[cid] += dur

    # ── 8. build the per-user analysis rows ──
    user_orders_map = defaultdict(list)
    for aid, gid, gn, tn, pd_, amt, os_, kf in orders:
        user_orders_map[aid].append({
            "goods_id": gid, "goods_name": gn, "trade_no": tn,
            # pay_amount_int is divided by 100; a NULL amount counts as 0.
            "pay_date": pd_, "amount": (amt or 0) / 100.0, "order_status": os_,
            "key_from": kf, "is_refunded": tn in refund_set,
        })

    rows = []
    for aid in user_ids:
        my_orders = user_orders_map.get(aid, [])
        my_chars = account_chars.get(aid, [])
        reg_time = account_info.get(aid)

        total_l1 = sum(char_lessons_l1.get(c, 0) for c in my_chars)
        total_l2 = sum(char_lessons_l2.get(c, 0) for c in my_chars)
        dur_l1 = sum(char_duration_l1.get(c, 0) for c in my_chars)
        dur_l2 = sum(char_duration_l2.get(c, 0) for c in my_chars)

        total_orders = len(my_orders)
        refunded_orders = sum(1 for o in my_orders if o["is_refunded"])
        total_gmv = sum(o["amount"] for o in my_orders)
        total_refund = sum(o["amount"] for o in my_orders if o["is_refunded"])
        all_refunded = (refunded_orders == total_orders and total_orders > 0)
        has_refund = refunded_orders > 0

        # Character summary "nickname(age)"; identical for every category row.
        char_details = []
        for cid in my_chars:
            nick, _bday_str, age = char_info.get(cid, ('?', '', None))
            age_str = f'{age}岁' if age is not None else '?'
            char_details.append(f'{nick}({age_str})')

        # NULL goods names are rendered as empty strings so the join cannot fail.
        goods_names = ";".join(o["goods_name"] or "" for o in my_orders)

        for cat_label, watch_level in classify_orders([o["goods_id"] for o in my_orders]):
            lessons = total_l1 if watch_level == "L1" else total_l2
            dur_ms = dur_l1 if watch_level == "L1" else dur_l2
            dur_min = round(dur_ms / 60000.0, 1) if dur_ms else 0.0
            avg_min_per_lesson = round(dur_min / lessons, 1) if lessons > 0 else 0.0

            rows.append({
                "用户ID": aid,
                "注册时间": reg_time,
                "购买分类": cat_label,
                "行课等级": watch_level,
                "角色数": len(my_chars),
                "角色信息(名称+年龄)": "; ".join(char_details),
                "完成课时数": lessons,
                "总完课时长(分钟)": dur_min,
                "平均每课时长(分钟)": avg_min_per_lesson,
                "订单数": total_orders,
                "退款订单数": refunded_orders,
                "GMV": round(total_gmv, 2),
                "GSV": round(total_gmv - total_refund, 2),
                "是否退款": "是" if has_refund else "否",
                "是否全部退款": "是" if all_refunded else "否",
                "是否加微": "是" if aid in wechat_bound else "否",
                "购买课包": goods_names,
            })

    # Explicit columns keep the frame usable when ``rows`` is empty.
    df = pd.DataFrame(rows, columns=_DETAIL_COLUMNS)
    df["课时桶"] = df["完成课时数"].apply(lesson_bucket)
    df["时长桶"] = df["总完课时长(分钟)"].apply(time_bucket)

    # ── console report ──
    print("\n" + "="*70)
    print("老狼退款用户 — 行课进度 × 完课时长 × 退款率交叉分析")
    print("="*70)

    for cat_label in _REPORT_CATEGORIES:
        df_cat = df[df["购买分类"] == cat_label]
        if len(df_cat) == 0:
            continue

        print(f"\n{'='*60}")
        print(f"【{cat_label}】 {len(df_cat)} 个用户-分类记录")
        print(f"{'='*60}")

        # A. bucketed by completed lessons
        print("\n ┌─ 按完成课时数分桶 ─────────────────────")
        print(_bucket_stats(df_cat, "课时桶", BUCKET_ORDER, with_median=True).to_string())

        # B. bucketed by completion time
        print("\n ┌─ 按完课时长分桶 ─────────────────────")
        print(_bucket_stats(df_cat, "时长桶", TIME_BUCKET_ORDER).to_string())

        # C. summary
        refund_users = df_cat[df_cat["是否退款"]=="是"]
        no_refund = df_cat[df_cat["是否退款"]=="否"]
        print(f"\n ┌─ 汇总 ──────────────────────────────")
        print(f" 退款用户: {len(refund_users)}, 平均课时: {refund_users['完成课时数'].mean():.1f}, 平均总时长: {refund_users['总完课时长(分钟)'].mean():.1f}min")
        print(f" 未退款用户: {len(no_refund)}, 平均课时: {no_refund['完成课时数'].mean():.1f}, 平均总时长: {no_refund['总完课时长(分钟)'].mean():.1f}min")
        print(f" 退费率: {len(refund_users)/len(df_cat)*100:.1f}%")

        # D. per-user detail, sorted by completion time
        print(f"\n ┌─ 明细(按时长排序)─────────────────────")
        for _, r in df_cat.sort_values(["总完课时长(分钟)"]).iterrows():
            print(f" UID={r['用户ID']} | {r['完成课时数']}课时 | {r['总完课时长(分钟)']}min | {'退款' if r['是否退款']=='是' else '未退款'} | {r['购买课包'][:35]}")

    # ── Excel output ──
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "老狼退款_行课分析.xlsx")
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        df_out = df.drop(columns=["课时桶", "时长桶"], errors="ignore").copy()
        for col in df_out.columns:
            # Drop timezone info from datetime columns before writing.
            if pd.api.types.is_datetime64_any_dtype(df_out[col]):
                df_out[col] = df_out[col].dt.tz_localize(None)
        df_out.to_excel(writer, sheet_name="明细", index=False)

        for cat_label in _REPORT_CATEGORIES:
            df_cat = df[df["购买分类"] == cat_label]
            if len(df_cat) == 0:
                continue
            _bucket_stats(df_cat, "课时桶", BUCKET_ORDER, with_median=True).to_excel(
                writer, sheet_name=f"{cat_label[:20]}_按课时")
            _bucket_stats(df_cat, "时长桶", TIME_BUCKET_ORDER).to_excel(
                writer, sheet_name=f"{cat_label[:20]}_按时长")

    print(f"\n✅ {output_path}")


if __name__ == "__main__":
    main()
|