ai_member_xiaoxi/scripts/laolang_refund_analysis.py
2026-06-02 08:00:01 +08:00

398 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
老狼退款用户 — 行课进度 × 完课时长 × 退款率交叉分析
规则:
- 购买 L1+L2 联报 → 看 L1 行课
- 购买 L2 课包 → 看 L2 行课
新增:完课时长 = SUM(bi_user_component_play_record.interval_time) / 60000 分钟
"""
import os, sys, re
import psycopg2
import pandas as pd
from collections import defaultdict
# Connection settings for the PostgreSQL server this script queries.
# get_conn() uses all four values; the second connection opened in main()
# reuses host/port/user but targets the 'vala_class' database instead.
DB_HOST = "bj-postgres-16pob4sg.sql.tencentcdb.com"
DB_PORT = 28591
DB_USER = "ai_member"
DB_NAME = "vala_bi"
def get_password(secrets_path=None):
    """Return the database password.

    Lookup order:
      1. the ``PG_ONLINE_PASSWORD`` environment variable (ignored when empty);
      2. a ``PG_ONLINE_PASSWORD=...`` line in a secrets file.

    Args:
        secrets_path: Optional path of the secrets file. Defaults to
            ``secrets.env`` in the parent directory of this script.

    Returns:
        The password with surrounding whitespace and quotes removed.

    Raises:
        RuntimeError: if neither source provides a non-empty password.
    """
    key = "PG_ONLINE_PASSWORD"
    pw = os.environ.get(key, "")
    if pw:
        return pw

    if secrets_path is None:
        secrets_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "..", "secrets.env"
        )

    if os.path.exists(secrets_path):
        with open(secrets_path, encoding="utf-8") as f:
            for line in f:
                name, sep, value = line.strip().partition("=")
                if not sep or name.strip() != key:
                    continue
                value = value.strip().strip("'\"")
                # An empty entry is treated like a missing one, mirroring how
                # an empty environment variable is handled above.
                if value:
                    return value

    raise RuntimeError("PG_ONLINE_PASSWORD not found")
def get_conn():
    """Open and return a new psycopg2 connection to the DB_NAME database."""
    connect_args = dict(
        host=DB_HOST,
        port=DB_PORT,
        user=DB_USER,
        password=get_password(),
        dbname=DB_NAME,
        connect_timeout=120,
    )
    return psycopg2.connect(**connect_args)
def lesson_bucket(n):
    """Return the display label of the bucket that lesson count ``n`` falls in."""
    if n == 0:
        return "0课时"
    # (inclusive upper bound, label), checked in ascending order.
    upper_bounds = (
        (3, "1-3课时"),
        (7, "4-7课时"),
        (15, "8-15课时"),
        (30, "16-30课时"),
        (60, "31-60课时"),
    )
    for limit, label in upper_bounds:
        if n <= limit:
            return label
    return "60课时以上"
def time_bucket(minutes):
    """Return the display label of the bucket that a duration in minutes falls in."""
    if minutes == 0:
        return "0分钟"
    # (inclusive upper bound in minutes, label), checked in ascending order.
    upper_bounds = (
        (10, "1-10分钟"),
        (30, "11-30分钟"),
        (60, "31-60分钟"),
        (120, "61-120分钟"),
        (300, "121-300分钟"),
    )
    for limit, label in upper_bounds:
        if minutes <= limit:
            return label
    return "300分钟以上"
# Display order of the labels returned by lesson_bucket() and time_bucket().
# main() reindexes its grouped statistics with these lists so the buckets are
# printed and exported in ascending order.
BUCKET_ORDER = ["0课时", "1-3课时", "4-7课时", "8-15课时", "16-30课时", "31-60课时", "60课时以上"]
TIME_BUCKET_ORDER = ["0分钟", "1-10分钟", "11-30分钟", "31-60分钟", "61-120分钟", "121-300分钟", "300分钟以上"]
# goods_id sets for each kind of course package.
# NOTE(review): L1_GOODS is not referenced by classify_orders() — confirm
# whether L1-only buyers are meant to be excluded from the analysis.
L1_GOODS = {57, 60, 63}
L2_GOODS = {31, 32, 33, 54}
L1L2_GOODS = {61}


def classify_orders(goods_ids):
    """Classify a user's purchased goods into (category label, level) pairs.

    Args:
        goods_ids: iterable of goods ids bought by one user.

    Returns:
        A list of ``(category_label, level_to_watch)`` tuples:
          * bundle only        -> the bundle entry (watch L1)
          * L2 only            -> the L2-only entry (watch L2)
          * bundle and L2      -> the bundle entry followed by a combined
                                  entry (watch L2)
          * neither            -> an empty list
    """
    purchased = set(goods_ids)
    bought_bundle = not purchased.isdisjoint(L1L2_GOODS)
    bought_l2 = not purchased.isdisjoint(L2_GOODS)

    bundle_entry = ("L1+L2联报→看L1", "L1")
    if bought_bundle and bought_l2:
        return [bundle_entry, ("联报+仅L2→看L2", "L2")]
    if bought_bundle:
        return [bundle_entry]
    if bought_l2:
        return [("仅L2→看L2", "L2")]
    return []
# Markers written to the yes/no columns of the result table. Both branches
# previously produced the same empty string, so refunded and non-refunded rows
# could not be told apart and every refund rate came out as 100%.
_FLAG_YES = "是"
_FLAG_NO = "否"

# Purchase categories covered by the console report and the summary sheets.
_REPORT_CATEGORIES = ("L1+L2联报→看L1", "仅L2→看L2")

# Number of numbered play-record tables (suffixes _0 .. _7) that are queried.
_SHARD_COUNT = 8


def _placeholders(count):
    """Return ``count`` comma-separated ``%s`` placeholders for an SQL IN list."""
    return ",".join(["%s"] * count)


def _batches(items, size):
    """Yield consecutive slices of ``items`` with at most ``size`` elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def _age_in_years(birthday, today):
    """Return the whole-year age for ``birthday`` on ``today``, or None if unknown."""
    if not birthday:
        return None
    try:
        born = pd.Timestamp(str(birthday)[:10]).date()
        return today.year - born.year - ((today.month, today.day) < (born.month, born.day))
    except Exception:
        # Best effort: an unparseable birthday just leaves the age unknown.
        return None


def _fetch_bi_data(cur, user_ids):
    """Run every vala_bi query and return the raw lookups as a dict.

    Keys of the returned dict: ``orders``, ``refund_set``, ``account_chars``,
    ``char_info``, ``char_lessons_l1``, ``char_lessons_l2``,
    ``char_duration_l1``, ``char_duration_l2`` and ``account_info``.
    """
    from datetime import date

    id_placeholders = _placeholders(len(user_ids))

    # -- 0. chapter ids of the L1 / L2 course levels --
    cur.execute("SELECT id, course_level FROM bi_level_unit_lesson WHERE course_level IN ('L1','L2')")
    l1_chapters = set()
    l2_chapters = set()
    for ch_id, level in cur.fetchall():
        if level == "L1":
            l1_chapters.add(ch_id)
        else:
            l2_chapters.add(ch_id)
    print(f" L1: {len(l1_chapters)}, L2: {len(l2_chapters)}")

    # -- 1. orders --
    cur.execute(f"""
        SELECT account_id, goods_id, goods_name, trade_no, pay_success_date,
               pay_amount_int, order_status, key_from
        FROM bi_vala_order
        WHERE account_id IN ({id_placeholders}) AND deleted_at IS NULL
          AND pay_success_date IS NOT NULL AND order_status IN (3,4)
        ORDER BY account_id, pay_success_date
    """, user_ids)
    orders = cur.fetchall()
    print(f" 订单: {len(orders)}")

    # -- 2. refunds: trade numbers that have a matching refund record --
    trade_nos = [o[3] for o in orders if o[3]]
    refund_set = set()
    for batch in _batches(trade_nos, 500):
        cur.execute(
            f"SELECT DISTINCT trade_no FROM bi_refund_order WHERE trade_no IN ({_placeholders(len(batch))}) AND status=3 AND deleted_at IS NULL",
            batch,
        )
        refund_set.update(tn for (tn,) in cur.fetchall())

    # -- 3. characters (with birthday and derived age) --
    cur.execute(f"""
        SELECT id, account_id, nickname, birthday FROM bi_vala_app_character
        WHERE account_id IN ({id_placeholders}) AND nickname IS NOT NULL AND nickname!='' AND deleted_at IS NULL
    """, user_ids)
    chars = cur.fetchall()
    char_ids = [c[0] for c in chars]
    account_chars = defaultdict(list)
    char_info = {}  # char_id -> (nickname, birthday string, age)
    today = date.today()
    for cid, aid, nick, bday in chars:
        account_chars[aid].append(cid)
        char_info[cid] = (nick, str(bday)[:10] if bday else '', _age_in_years(bday, today))
    print(f" 角色: {len(chars)}")

    # -- 4. completed chapters and their chapter_unique_id --
    # character_id -> {chapter_id -> chapter_unique_id}; only the first record
    # returned for a (character, chapter) pair is kept and counted.
    char_chapter_cuid = defaultdict(dict)
    char_lessons_l1 = defaultdict(int)
    char_lessons_l2 = defaultdict(int)
    all_cuids = set()
    for tbl_idx in range(_SHARD_COUNT):
        table = f"bi_user_chapter_play_record_{tbl_idx}"
        for batch in _batches(char_ids, 2000):
            try:
                cur.execute(f"""
                    SELECT user_id, chapter_id, chapter_unique_id
                    FROM {table}
                    WHERE user_id IN ({_placeholders(len(batch))}) AND play_status=1 AND deleted_at IS NULL
                """, batch)
                for uid, ch_id, cuid in cur.fetchall():
                    if ch_id in char_chapter_cuid[uid]:
                        continue
                    char_chapter_cuid[uid][ch_id] = cuid
                    if ch_id in l1_chapters:
                        char_lessons_l1[uid] += 1
                    elif ch_id in l2_chapters:
                        char_lessons_l2[uid] += 1
                    all_cuids.add(cuid)
            except Exception as e:
                # Best effort per table; autocommit (set by the caller) keeps a
                # failed statement from aborting the queries that follow.
                print(f" warn {table}: {e}")
    print(f" chapter_unique_id 数: {len(all_cuids)}")

    # -- 5. time spent per chapter_unique_id, from the component play records --
    print(f" 正在查询完课时长({len(all_cuids)} 个 cuid...")
    cuid_duration = defaultdict(int)  # chapter_unique_id -> total milliseconds
    cuid_list = list(all_cuids)
    for tbl_idx in range(_SHARD_COUNT):
        table = f"bi_user_component_play_record_{tbl_idx}"
        for batch in _batches(cuid_list, 2000):
            try:
                cur.execute(f"""
                    SELECT chapter_unique_id, SUM(COALESCE(interval_time, 0))
                    FROM {table}
                    WHERE chapter_unique_id IN ({_placeholders(len(batch))}) AND deleted_at IS NULL
                    GROUP BY chapter_unique_id
                """, batch)
                for cuid, total_ms in cur.fetchall():
                    if total_ms:
                        # SUM() may come back as Decimal, which cannot be
                        # divided by a float later on; normalise to int.
                        cuid_duration[cuid] += int(total_ms)
            except Exception as e:
                print(f" warn {table}: {e}")
    print(f" 有耗时记录的 cuid: {len(cuid_duration)}")

    # -- 6. account registration time --
    cur.execute(f"SELECT id, created_at FROM bi_vala_app_account WHERE id IN ({id_placeholders}) AND status=1", user_ids)
    account_info = {aid: reg for aid, reg in cur.fetchall()}

    # -- 7. total duration per character and course level --
    char_duration_l1 = defaultdict(int)
    char_duration_l2 = defaultdict(int)
    for cid, chapter_map in char_chapter_cuid.items():
        for ch_id, cuid in chapter_map.items():
            dur = cuid_duration.get(cuid, 0)
            if ch_id in l1_chapters:
                char_duration_l1[cid] += dur
            elif ch_id in l2_chapters:
                char_duration_l2[cid] += dur

    return {
        "orders": orders,
        "refund_set": refund_set,
        "account_chars": account_chars,
        "char_info": char_info,
        "char_lessons_l1": char_lessons_l1,
        "char_lessons_l2": char_lessons_l2,
        "char_duration_l1": char_duration_l1,
        "char_duration_l2": char_duration_l2,
        "account_info": account_info,
    }


def _fetch_wechat_bound(user_ids):
    """Return the account ids that have a row in vala_class.student_info.

    The script treats those accounts as WeChat-bound. Any failure is reported
    and yields an empty set so the rest of the analysis can still run.
    """
    try:
        conn_class = psycopg2.connect(host=DB_HOST, port=DB_PORT, user=DB_USER, password=get_password(), dbname='vala_class', connect_timeout=30)
        try:
            cur2 = conn_class.cursor()
            cur2.execute(f"SELECT DISTINCT vala_account_id FROM student_info WHERE vala_account_id IN ({_placeholders(len(user_ids))})", user_ids)
            bound = {r[0] for r in cur2.fetchall()}
            cur2.close()
        finally:
            conn_class.close()
        print(f" 加微判断: {len(bound)}/{len(user_ids)} 人已加微")
        return bound
    except Exception as e:
        print(f" 加微判断跳过: {e}")
        return set()


def _build_rows(user_ids, data, wechat_bound):
    """Build one result row per (user, purchase category) pair."""
    refund_set = data["refund_set"]
    user_orders_map = defaultdict(list)
    for aid, gid, gn, tn, pay_date, amt, status, kf in data["orders"]:
        user_orders_map[aid].append({
            "goods_id": gid, "goods_name": gn, "trade_no": tn,
            # A NULL amount is counted as 0 instead of raising TypeError.
            "pay_date": pay_date, "amount": (amt or 0) / 100.0, "order_status": status,
            "key_from": kf, "is_refunded": tn in refund_set,
        })

    rows = []
    for aid in user_ids:
        my_orders = user_orders_map.get(aid, [])
        my_chars = data["account_chars"].get(aid, [])
        reg_time = data["account_info"].get(aid)

        total_l1 = sum(data["char_lessons_l1"].get(c, 0) for c in my_chars)
        total_l2 = sum(data["char_lessons_l2"].get(c, 0) for c in my_chars)
        dur_l1 = sum(data["char_duration_l1"].get(c, 0) for c in my_chars)
        dur_l2 = sum(data["char_duration_l2"].get(c, 0) for c in my_chars)

        total_orders = len(my_orders)
        refunded_orders = sum(1 for o in my_orders if o["is_refunded"])
        total_gmv = sum(o["amount"] for o in my_orders)
        total_refund = sum(o["amount"] for o in my_orders if o["is_refunded"])
        all_refunded = (refunded_orders == total_orders and total_orders > 0)
        has_refund = refunded_orders > 0

        # "nickname(age)" for every character of the account.
        char_details = []
        for cid in my_chars:
            nick, _, age = data["char_info"].get(cid, ('?', '', None))
            age_str = f'{age}' if age is not None else '?'
            char_details.append(f'{nick}({age_str})')

        for cat_label, watch_level in classify_orders([o["goods_id"] for o in my_orders]):
            lessons = total_l1 if watch_level == "L1" else total_l2
            dur_ms = dur_l1 if watch_level == "L1" else dur_l2
            dur_min = round(dur_ms / 60000.0, 1) if dur_ms else 0.0
            avg_min_per_lesson = round(dur_min / lessons, 1) if lessons > 0 else 0.0
            rows.append({
                "用户ID": aid,
                "注册时间": reg_time,
                "购买分类": cat_label,
                "行课等级": watch_level,
                "角色数": len(my_chars),
                "角色信息(名称+年龄)": "; ".join(char_details),
                "完成课时数": lessons,
                "总完课时长(分钟)": dur_min,
                "平均每课时长(分钟)": avg_min_per_lesson,
                "订单数": total_orders,
                "退款订单数": refunded_orders,
                "GMV": round(total_gmv, 2),
                "GSV": round(total_gmv - total_refund, 2),
                "是否退款": _FLAG_YES if has_refund else _FLAG_NO,
                "是否全部退款": _FLAG_YES if all_refunded else _FLAG_NO,
                "是否加微": _FLAG_YES if aid in wechat_bound else _FLAG_NO,
                # A NULL goods name is rendered as an empty entry.
                "购买课包": ";".join(o["goods_name"] or "" for o in my_orders),
            })
    return rows


def _bucket_stats(df_cat, bucket_col, bucket_order, with_median):
    """Aggregate user count, refund count/rate and averages per bucket.

    Rows follow ``bucket_order``; buckets without users are dropped.
    """
    aggregations = {
        "用户数": ("用户ID", "count"),
        "退款用户数": ("是否退款", lambda flags: (flags == _FLAG_YES).sum()),
        "平均课时": ("完成课时数", "mean"),
        "平均完课时长": ("总完课时长(分钟)", "mean"),
    }
    if with_median:
        aggregations["中位完课时长"] = ("总完课时长(分钟)", "median")

    stats = df_cat.groupby(bucket_col).agg(**aggregations).reindex(bucket_order).fillna(0)
    stats["退款率%"] = (stats["退款用户数"] / stats["用户数"] * 100).round(1)
    for col in ("平均课时", "平均完课时长", "中位完课时长"):
        if col in stats.columns:
            stats[col] = stats[col].round(1)
    return stats[stats["用户数"] > 0]


def _print_report(df):
    """Print the per-category bucket tables, summary and detail lines."""
    print("\n" + "="*70)
    print("老狼退款用户 — 行课进度 × 完课时长 × 退款率交叉分析")
    print("="*70)
    for cat_label in _REPORT_CATEGORIES:
        df_cat = df[df["购买分类"] == cat_label]
        if len(df_cat) == 0:
            continue
        print(f"\n{'='*60}")
        # Separator added so the count cannot be read as part of the label.
        print(f"{cat_label}: {len(df_cat)} 个用户-分类记录")
        print(f"{'='*60}")

        # A. by completed-lesson bucket
        print("\n ┌─ 按完成课时数分桶 ─────────────────────")
        print(_bucket_stats(df_cat, "课时桶", BUCKET_ORDER, with_median=True).to_string())

        # B. by total-duration bucket
        print("\n ┌─ 按完课时长分桶 ─────────────────────")
        print(_bucket_stats(df_cat, "时长桶", TIME_BUCKET_ORDER, with_median=False).to_string())

        # C. summary
        refund_users = df_cat[df_cat["是否退款"] == _FLAG_YES]
        no_refund = df_cat[df_cat["是否退款"] == _FLAG_NO]
        print(f"\n ┌─ 汇总 ──────────────────────────────")
        print(f" 退款用户: {len(refund_users)}, 平均课时: {refund_users['完成课时数'].mean():.1f}, 平均总时长: {refund_users['总完课时长(分钟)'].mean():.1f}min")
        print(f" 未退款用户: {len(no_refund)}, 平均课时: {no_refund['完成课时数'].mean():.1f}, 平均总时长: {no_refund['总完课时长(分钟)'].mean():.1f}min")
        print(f" 退费率: {len(refund_users)/len(df_cat)*100:.1f}%")

        # D. detail lines, shortest total duration first
        print(f"\n ┌─ 明细(按时长排序)─────────────────────")
        for _, r in df_cat.sort_values(["总完课时长(分钟)"]).iterrows():
            print(f" UID={r['用户ID']} | {r['完成课时数']}课时 | {r['总完课时长(分钟)']}min | {'退款' if r['是否退款']==_FLAG_YES else '未退款'} | {r['购买课包'][:35]}")


def _write_excel(df):
    """Write the detail sheet plus per-category bucket sheets; return the path."""
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "老狼退款_行课分析.xlsx")
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        df_out = df.drop(columns=["课时桶", "时长桶"], errors="ignore").copy()
        # Drop timezone info from datetime columns before exporting.
        for col in df_out.columns:
            if pd.api.types.is_datetime64_any_dtype(df_out[col]):
                df_out[col] = df_out[col].dt.tz_localize(None)
        df_out.to_excel(writer, sheet_name="明细", index=False)

        for cat_label in _REPORT_CATEGORIES:
            df_cat = df[df["购买分类"] == cat_label]
            if len(df_cat) == 0:
                continue
            by_lessons = _bucket_stats(df_cat, "课时桶", BUCKET_ORDER, with_median=True)
            by_lessons.to_excel(writer, sheet_name=f"{cat_label[:20]}_按课时")
            by_time = _bucket_stats(df_cat, "时长桶", TIME_BUCKET_ORDER, with_median=False)
            by_time.to_excel(writer, sheet_name=f"{cat_label[:20]}_按时长")
    return output_path


def main():
    """Cross-analyse lesson progress, time spent and refunds for the lead users.

    Reads the user ids from the Excel file given as the first command-line
    argument (default ``output/销售线索_用户分析.xlsx``, column ``用户ID``),
    queries orders, refunds, characters and play records, prints bucketed
    refund statistics and writes them to ``../output/老狼退款_行课分析.xlsx``
    relative to this script.
    """
    input_file = sys.argv[1] if len(sys.argv) > 1 else "output/销售线索_用户分析.xlsx"
    df_in = pd.read_excel(input_file, dtype=str)
    user_ids = [int(x) for x in df_in["用户ID"].dropna().unique()]
    print(f"老狼线索用户: {len(user_ids)}")
    if not user_ids:
        # An empty id list would generate "IN ()", which is invalid SQL.
        print(" 没有可分析的用户ID, 退出")
        return

    conn = get_conn()
    try:
        # Read-only workload: with autocommit a failed shard query cannot leave
        # the session in an aborted transaction that breaks later queries.
        conn.autocommit = True
        cur = conn.cursor()
        data = _fetch_bi_data(cur, user_ids)
    finally:
        conn.close()

    wechat_bound = _fetch_wechat_bound(user_ids)

    rows = _build_rows(user_ids, data, wechat_bound)
    if not rows:
        # Without rows the DataFrame has no columns and the steps below fail.
        print(" 没有符合购买分类的用户, 无可分析数据")
        return

    df = pd.DataFrame(rows)
    df["课时桶"] = df["完成课时数"].apply(lesson_bucket)
    df["时长桶"] = df["总完课时长(分钟)"].apply(time_bucket)

    _print_report(df)
    output_path = _write_excel(df)
    print(f"\n{output_path}")
# Run the analysis only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()