253 lines
11 KiB
Python
253 lines
11 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
老狼退款用户 — 每节课完课时长 vs 退款率
|
||
|
||
粒度:每个 chapter_id 一行
|
||
- 完成该课的 user 数、退款 user 数、平均耗时、退款率
|
||
- 按购买分类分组(L1+L2联报→看L1,仅L2→看L2)
|
||
"""
|
||
|
||
import os, sys
|
||
import psycopg2
|
||
import pandas as pd
|
||
from collections import defaultdict
|
||
|
||
# Connection settings for the BI PostgreSQL instance; consumed by get_conn().
# The password is deliberately not stored here — see get_password().
DB_HOST = "bj-postgres-16pob4sg.sql.tencentcdb.com"  # server hostname
DB_PORT = 28591                                      # TCP port
DB_USER = "ai_member"                                # login role
DB_NAME = "vala_bi"                                  # database name
|
||
|
||
def get_password():
    """Return the database password.

    Lookup order:

    1. The ``PG_ONLINE_PASSWORD`` environment variable, when non-empty.
    2. A ``PG_ONLINE_PASSWORD=...`` line in ``secrets.env`` located one
       directory above this script. Surrounding whitespace and one layer of
       single/double quotes around the value are removed.

    Returns:
        The password as a non-empty string.

    Raises:
        RuntimeError: if neither source yields a non-empty password.
    """
    pw = os.environ.get("PG_ONLINE_PASSWORD", "")
    if pw:
        return pw

    secrets_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "..", "secrets.env"
    )
    try:
        # Explicit encoding: the platform default may not be UTF-8.
        with open(secrets_path, encoding="utf-8") as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line.startswith("PG_ONLINE_PASSWORD="):
                    continue
                value = line.split("=", 1)[1].strip().strip("'\"")
                # An empty assignment is treated as "not configured" so the
                # caller gets the RuntimeError below instead of an obscure
                # authentication failure later on.
                if value:
                    return value
    except FileNotFoundError:
        pass

    raise RuntimeError("PG_ONLINE_PASSWORD not found")
|
||
|
||
def get_conn():
    """Open and return a new psycopg2 connection to the BI database.

    Host, port, user and database name come from the module-level ``DB_*``
    constants, the password from ``get_password()``, and ``connect_timeout``
    is passed as 120.
    """
    connect_kwargs = {
        "host": DB_HOST,
        "port": DB_PORT,
        "user": DB_USER,
        "password": get_password(),
        "dbname": DB_NAME,
        "connect_timeout": 120,
    }
    return psycopg2.connect(**connect_kwargs)
|
||
|
||
# goods_id groups used to classify purchases.
# NOTE(review): presumably L1-only, L2-only and L1+L2 bundle products
# respectively — confirm against the goods catalogue.
L1_GOODS = {57, 60, 63}
L2_GOODS = {31, 32, 33, 54}
L1L2_GOODS = {61}


def classify_orders(goods_ids):
    """Map a user's purchased goods IDs to analysis categories.

    Args:
        goods_ids: iterable of goods IDs the user has bought.

    Returns:
        A list of ``(category_label, course_level)`` tuples:

        * any ID in ``L1L2_GOODS``            -> ``("L1+L2联报→看L1", "L1")``
        * an ID in ``L2_GOODS``, no bundle ID -> ``("仅L2→看L2", "L2")``
        * IDs in both groups                  -> additionally
          ``("联报+仅L2→看L2", "L2")`` after the bundle entry

        The list is empty when none of these apply; ``L1_GOODS`` is not
        consulted.
    """
    purchased = set(goods_ids)
    has_bundle = bool(purchased & L1L2_GOODS)
    has_l2 = bool(purchased & L2_GOODS)

    categories = []
    if has_bundle:
        categories.append(("L1+L2联报→看L1", "L1"))
    if has_l2:
        # The two L2 labels are mutually exclusive: which one applies
        # depends only on whether a bundle was bought as well.
        if has_bundle:
            categories.append(("联报+仅L2→看L2", "L2"))
        else:
            categories.append(("仅L2→看L2", "L2"))
    return categories
|
||
|
||
# Categories that get a console summary and a per-bucket Excel sheet.
_REPORT_CATEGORIES = ["L1+L2联报→看L1", "仅L2→看L2"]

# Display order of the duration buckets produced by _dur_bucket().
_DUR_ORDER = ["0分钟", "1-10分钟", "11-20分钟", "21-30分钟", "31-45分钟", "45分钟以上"]

# Column order of the per-chapter detail table; also used to build an empty
# frame when no user completed any chapter.
_ROW_COLUMNS = [
    "购买分类", "课程等级", "chapter_id", "Season", "Unit", "Lesson",
    "完成用户数", "退款用户数", "退款率%",
    "平均耗时(分钟)", "中位耗时(分钟)", "退款用户平均耗时",
]

# Number of sharded play-record tables (suffixes _0 .. _7).
_SHARD_COUNT = 8


def _placeholders(n):
    """Return ``n`` comma-separated ``%s`` markers for an SQL ``IN (...)`` list."""
    return ",".join(["%s"] * n)


def _chunks(items, size):
    """Yield consecutive slices of ``items`` with at most ``size`` elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def _dur_bucket(m):
    """Map an average duration in minutes to its bucket label."""
    if m == 0:
        return "0分钟"
    if m <= 10:
        return "1-10分钟"
    if m <= 20:
        return "11-20分钟"
    if m <= 30:
        return "21-30分钟"
    if m <= 45:
        return "31-45分钟"
    return "45分钟以上"


def _load_chapters(cur):
    """Load L1/L2 chapter metadata.

    Returns ``(chapter_info, l1_chs, l2_chs)`` where ``chapter_info`` maps
    chapter id -> ``(level, season, unit, lesson)`` and the two sets hold the
    chapter ids of each level.
    """
    cur.execute(
        "SELECT id, course_level, course_season, course_unit, course_lesson "
        "FROM bi_level_unit_lesson WHERE course_level IN ('L1','L2')"
    )
    chapter_info = {}
    l1_chs, l2_chs = set(), set()
    for cid, lv, sn, un, ln in cur.fetchall():
        chapter_info[cid] = (lv, sn, un, ln)
        (l1_chs if lv == "L1" else l2_chs).add(cid)
    return chapter_info, l1_chs, l2_chs


def _load_orders_and_refunds(cur, user_ids):
    """Load the users' orders and derive per-user goods and refund flags.

    Returns ``(user_goods, user_refund)``: account id -> set of goods ids,
    and user id -> bool (True when any of the user's orders has a matching
    row in ``bi_refund_order`` with ``status=3``).
    """
    cur.execute(f"""
        SELECT account_id, goods_id, trade_no FROM bi_vala_order
        WHERE account_id IN ({_placeholders(len(user_ids))}) AND deleted_at IS NULL
        AND pay_success_date IS NOT NULL AND order_status IN (3,4)
    """, user_ids)
    orders = cur.fetchall()

    trade_nos = [tn for _, _, tn in orders if tn]
    refund_set = set()
    for batch in _chunks(trade_nos, 500):
        cur.execute(
            f"SELECT DISTINCT trade_no FROM bi_refund_order WHERE trade_no IN ({_placeholders(len(batch))}) AND status=3 AND deleted_at IS NULL",
            batch,
        )
        refund_set.update(tn for (tn,) in cur.fetchall())

    user_goods = defaultdict(set)
    user_refund = dict.fromkeys(user_ids, False)
    for aid, gid, tn in orders:
        user_goods[aid].add(gid)
        if tn in refund_set:
            user_refund[aid] = True
    return user_goods, user_refund


def _load_characters(cur, user_ids):
    """Load the users' characters that have a non-empty nickname.

    Returns ``(chars, account_chars)``: the raw ``(character_id, account_id)``
    rows and a mapping account id -> list of character ids.
    """
    cur.execute(
        f"SELECT id, account_id FROM bi_vala_app_character WHERE account_id IN ({_placeholders(len(user_ids))}) AND nickname IS NOT NULL AND nickname!='' AND deleted_at IS NULL",
        user_ids,
    )
    chars = cur.fetchall()
    account_chars = defaultdict(list)
    for cid, aid in chars:
        account_chars[aid].append(cid)
    return chars, account_chars


def _load_completed_chapters(conn, cur, char_ids):
    """Collect chapter records with ``play_status=1`` for every character.

    Scans all sharded ``bi_user_chapter_play_record_N`` tables. A failing
    query is reported and skipped (best effort); the transaction is rolled
    back first because psycopg2 rejects every further statement on a
    connection whose transaction is in the failed state.

    Returns a mapping character id -> {chapter id -> chapter_unique_id},
    keeping the first chapter_unique_id seen per (character, chapter).
    """
    char_chapter_cuid = defaultdict(dict)
    for tbl_idx in range(_SHARD_COUNT):
        table = f"bi_user_chapter_play_record_{tbl_idx}"
        for batch in _chunks(char_ids, 2000):
            try:
                cur.execute(f"""
                    SELECT user_id, chapter_id, chapter_unique_id
                    FROM {table}
                    WHERE user_id IN ({_placeholders(len(batch))}) AND play_status=1 AND deleted_at IS NULL
                """, batch)
                records = cur.fetchall()
            except psycopg2.Error as e:
                conn.rollback()
                print(f" warn {table}: {e}")
                continue
            for char_id, ch_id, cuid in records:
                char_chapter_cuid[char_id].setdefault(ch_id, cuid)
    return char_chapter_cuid


def _load_durations(conn, cur, cuids):
    """Sum ``interval_time`` per chapter_unique_id across all component shards.

    Returns a mapping chapter_unique_id -> total as ``float`` (the caller
    treats the value as milliseconds). Totals are converted to ``float``
    because ``SUM()`` may come back as ``Decimal``, which cannot be divided
    by a float later on. Failing queries are reported and skipped after a
    rollback, as in ``_load_completed_chapters``.
    """
    cuid_duration = {}
    for tbl_idx in range(_SHARD_COUNT):
        table = f"bi_user_component_play_record_{tbl_idx}"
        for batch in _chunks(cuids, 2000):
            try:
                cur.execute(f"""
                    SELECT chapter_unique_id, SUM(COALESCE(interval_time,0))
                    FROM {table} WHERE chapter_unique_id IN ({_placeholders(len(batch))}) AND deleted_at IS NULL
                    GROUP BY chapter_unique_id
                """, batch)
                sums = cur.fetchall()
            except psycopg2.Error as e:
                conn.rollback()
                print(f" warn {table}: {e}")
                continue
            for cuid, ms in sums:
                cuid_duration[cuid] = cuid_duration.get(cuid, 0.0) + float(ms or 0)
    return cuid_duration


def _aggregate(user_ids, user_cats, account_chars, char_chapter_cuid,
               cuid_duration, l1_chs, l2_chs):
    """Group completion durations by category, chapter and user.

    Returns ``(category, level) -> chapter id -> user id -> [minutes, ...]``.
    A user contributes one duration per character that completed the
    chapter. Keeping the user id next to each duration is what allows the
    refund-only average to be computed correctly afterwards.
    """
    stats = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for uid in user_ids:
        for cat_label, watch_level in user_cats.get(uid, []):
            target_chs = l1_chs if watch_level == "L1" else l2_chs
            for char_id in account_chars.get(uid, []):
                for ch_id, cuid in char_chapter_cuid.get(char_id, {}).items():
                    if ch_id not in target_chs:
                        continue
                    dur_ms = cuid_duration.get(cuid, 0) if cuid else 0
                    stats[(cat_label, watch_level)][ch_id][uid].append(dur_ms / 60000.0)
    return stats


def _build_rows(stats, chapter_info, user_refund):
    """Turn the aggregated stats into one output row per (category, chapter).

    Within a category, chapters are ordered by number of completing users,
    descending. Averages and the median are taken over completion records;
    the refund average uses only records belonging to refunded users.
    """
    import statistics

    rows = []
    for (cat_label, watch_level), ch_map in sorted(stats.items()):
        ordered = sorted(ch_map.items(), key=lambda kv: len(kv[1]), reverse=True)
        for ch_id, per_user in ordered:
            info = chapter_info.get(ch_id, ("?", "?", "?", "?"))
            durations = [d for ds in per_user.values() for d in ds]
            refund_durations = [
                d
                for uid, ds in per_user.items()
                if user_refund.get(uid, False)
                for d in ds
            ]
            total_users = len(per_user)
            refund_users = sum(1 for uid in per_user if user_refund.get(uid, False))
            rows.append({
                "购买分类": cat_label,
                "课程等级": watch_level,
                "chapter_id": ch_id,
                "Season": info[1],
                "Unit": info[2],
                "Lesson": info[3],
                "完成用户数": total_users,
                "退款用户数": refund_users,
                "退款率%": round(refund_users / total_users * 100, 1) if total_users else 0,
                "平均耗时(分钟)": round(sum(durations) / len(durations), 1) if durations else 0,
                "中位耗时(分钟)": round(statistics.median(durations), 1) if durations else 0,
                "退款用户平均耗时": (
                    round(sum(refund_durations) / len(refund_durations), 1)
                    if refund_durations else 0
                ),
            })
    return rows


def _bucket_summary(df_cat):
    """Aggregate one category's chapters by average-duration bucket.

    Returns a frame indexed by every label in ``_DUR_ORDER`` (empty buckets
    are zero-filled, so their refund rate is NaN).
    """
    # Work on a copy: df_cat is a filtered slice of the detail frame.
    bucketed = df_cat.copy()
    bucketed["时长桶"] = bucketed["平均耗时(分钟)"].apply(_dur_bucket)
    ds = bucketed.groupby("时长桶").agg(
        课节数=("chapter_id", "count"),
        完成用户数=("完成用户数", "sum"),
        退款用户数=("退款用户数", "sum"),
    ).reindex(_DUR_ORDER).fillna(0)
    ds["退款率%"] = (ds["退款用户数"] / ds["完成用户数"] * 100).round(1)
    return ds


def _print_report(df):
    """Print the per-chapter table and the bucket summary for each category."""
    for cat_label in _REPORT_CATEGORIES:
        df_cat = df[df["购买分类"] == cat_label]
        if df_cat.empty:
            continue
        print(f"\n{'='*80}")
        print(f"【{cat_label}】 共 {len(df_cat)} 节课有完成记录")
        print(f"{'='*80}")
        print(f"{'Unit':>6} {'Lesson':>6} {'完成人数':>6} {'退款人数':>6} {'退款率':>7} {'平均耗时':>9} {'中位耗时':>9} {'退款均耗时':>9}")
        print("-"*70)
        for _, r in df_cat.iterrows():
            print(f"{r['Unit']:>6} {r['Lesson']:>6} {int(r['完成用户数']):>6} {int(r['退款用户数']):>6} {r['退款率%']:>6.1f}% {r['平均耗时(分钟)']:>8.1f}分 {r['中位耗时(分钟)']:>8.1f}分 {r['退款用户平均耗时']:>8.1f}分")

    for cat_label in _REPORT_CATEGORIES:
        df_cat = df[df["购买分类"] == cat_label]
        if df_cat.empty:
            continue
        print(f"\n{'='*80}")
        print(f"【{cat_label}】按每节课平均耗时聚合")
        print(f"{'='*80}")
        ds = _bucket_summary(df_cat)
        print(ds[ds["课节数"] > 0].to_string())


def _write_excel(df):
    """Write the detail sheet plus one bucket sheet per category; return the path."""
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "老狼退款_每课分析.xlsx")
    with pd.ExcelWriter(output_path, engine="openpyxl") as w:
        df.to_excel(w, sheet_name="每课明细", index=False)
        for cat_label in _REPORT_CATEGORIES:
            dc = df[df["购买分类"] == cat_label]
            if dc.empty:
                continue
            _bucket_summary(dc).to_excel(w, sheet_name=f"{cat_label[:25]}_按时长聚合")
    return output_path


def main():
    """Build the per-chapter completion-time vs. refund-rate report.

    Reads user ids from the ``用户ID`` column of the Excel file given as the
    first command-line argument (default ``output/销售线索_用户分析.xlsx``),
    loads orders, refunds, characters, completed chapters and play durations
    from the database, prints a console summary and writes
    ``../output/老狼退款_每课分析.xlsx`` relative to this script.
    """
    input_file = sys.argv[1] if len(sys.argv) > 1 else "output/销售线索_用户分析.xlsx"
    df_in = pd.read_excel(input_file, dtype=str)
    user_ids = sorted({int(x) for x in df_in["用户ID"].dropna().unique()})
    print(f"用户: {len(user_ids)} 人")
    if not user_ids:
        # An empty id list would render as "IN ()", which is invalid SQL.
        print("未找到用户ID,退出")
        return

    conn = get_conn()
    try:
        cur = conn.cursor()
        chapter_info, l1_chs, l2_chs = _load_chapters(cur)
        user_goods, user_refund = _load_orders_and_refunds(cur, user_ids)
        chars, account_chars = _load_characters(cur, user_ids)
        print(f" 角色: {len(chars)}")
        char_chapter_cuid = _load_completed_chapters(conn, cur, [c[0] for c in chars])
        # Only the chapter_unique_ids actually kept per (character, chapter)
        # are ever looked up, so durations are fetched for those alone.
        cuids = list({
            cuid
            for per_chapter in char_chapter_cuid.values()
            for cuid in per_chapter.values()
            if cuid is not None
        })
        cuid_duration = _load_durations(conn, cur, cuids)
    finally:
        conn.close()

    user_cats = {uid: classify_orders(user_goods.get(uid, set())) for uid in user_ids}
    stats = _aggregate(user_ids, user_cats, account_chars, char_chapter_cuid,
                       cuid_duration, l1_chs, l2_chs)
    # Explicit columns keep the frame usable when there are no rows at all.
    df = pd.DataFrame(_build_rows(stats, chapter_info, user_refund), columns=_ROW_COLUMNS)

    _print_report(df)
    output_path = _write_excel(df)
    print(f"\n✅ {output_path}")


if __name__ == "__main__":
    main()
|