ai_member_xiaoxi/scripts/laolang_per_lesson.py
2026-06-02 08:00:01 +08:00

253 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
老狼退款用户 — 每节课完课时长 vs 退款率
粒度:每个 chapter_id 一行
- 完成该课的 user 数、退款 user 数、平均耗时、退款率
- 按购买分类分组: L1+L2联报→看L1, 仅L2→看L2
"""
import os, sys
import psycopg2
import pandas as pd
from collections import defaultdict
# Connection settings passed to psycopg2.connect() by get_conn().
# The password is deliberately not stored here; get_password() resolves it
# from the environment or from secrets.env.
DB_HOST = "bj-postgres-16pob4sg.sql.tencentcdb.com"
DB_PORT = 28591
DB_USER = "ai_member"
DB_NAME = "vala_bi"
def get_password():
    """Resolve the database password.

    Lookup order:
      1. the ``PG_ONLINE_PASSWORD`` environment variable (if non-empty);
      2. the first ``PG_ONLINE_PASSWORD=...`` line of ``secrets.env`` located
         one directory above this script (surrounding quotes are stripped).

    Raises:
        RuntimeError: if neither source yields the password.
    """
    from_env = os.environ.get("PG_ONLINE_PASSWORD", "")
    if from_env:
        return from_env

    script_dir = os.path.dirname(os.path.abspath(__file__))
    secrets_path = os.path.join(script_dir, "..", "secrets.env")
    prefix = "PG_ONLINE_PASSWORD="
    if os.path.exists(secrets_path):
        with open(secrets_path) as handle:
            for raw_line in handle:
                if not raw_line.startswith(prefix):
                    continue
                # Everything after the first "=" is the value.
                _, _, value = raw_line.strip().partition("=")
                return value.strip("'\"")

    raise RuntimeError("PG_ONLINE_PASSWORD not found")
def get_conn():
    """Open and return a new psycopg2 connection.

    Uses the module-level ``DB_*`` settings, the password returned by
    ``get_password()``, and a 120-second connect timeout.
    """
    connect_kwargs = {
        "host": DB_HOST,
        "port": DB_PORT,
        "user": DB_USER,
        "password": get_password(),
        "dbname": DB_NAME,
        "connect_timeout": 120,
    }
    return psycopg2.connect(**connect_kwargs)
# goods_id sets used to classify purchases.
# NOTE(review): names suggest L1-only, L2-only and L1+L2 bundle products -
# confirm against the goods catalogue.
L1_GOODS = {57, 60, 63}
L2_GOODS = {31, 32, 33, 54}
L1L2_GOODS = {61}


def classify_orders(goods_ids):
    """Map the goods ids a user bought to report categories.

    Args:
        goods_ids: iterable of goods ids.

    Returns:
        A list of ``(category_label, course_level)`` tuples, in this order:
          * ``("L1+L2联报→看L1", "L1")`` when any id is in ``L1L2_GOODS``;
          * then exactly one of
            ``("仅L2→看L2", "L2")`` (L2 goods without the bundle) or
            ``("联报+仅L2→看L2", "L2")`` (L2 goods together with the bundle).
        The list is empty when neither set matches; ``L1_GOODS`` is not
        consulted, so L1-only purchases produce no category.
    """
    purchased = set(goods_ids)
    has_bundle = bool(purchased & L1L2_GOODS)
    has_l2 = bool(purchased & L2_GOODS)

    categories = []
    if has_bundle:
        categories.append(("L1+L2联报→看L1", "L1"))
    if has_l2:
        l2_label = "联报+仅L2→看L2" if has_bundle else "仅L2→看L2"
        categories.append((l2_label, "L2"))
    return categories
# Play-record tables are split into numbered tables ..._0 through ..._7.
_SHARD_COUNT = 8
# Maximum number of ids placed in a single SQL IN (...) list.
_ID_BATCH_SIZE = 2000
_TRADE_NO_BATCH_SIZE = 500
# Categories that get their own printed table and Excel summary sheet.
_REPORT_CATEGORIES = ["L1+L2联报→看L1", "仅L2→看L2"]
# Display order of the duration buckets produced by _dur_bucket().
_DUR_ORDER = ["0分钟", "1-10分钟", "11-20分钟", "21-30分钟", "31-45分钟", "45分钟以上"]
# Column layout of the per-lesson detail frame; fixed up front so an empty
# result still yields a frame with the expected columns.
_DETAIL_COLUMNS = [
    "购买分类", "课程等级", "chapter_id", "Season", "Unit", "Lesson",
    "完成用户数", "退款用户数", "退款率%",
    "平均耗时(分钟)", "中位耗时(分钟)", "退款用户平均耗时",
]


def _chunks(items, size):
    """Yield consecutive slices of *items* with at most *size* elements each."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def _placeholders(count):
    """Return *count* comma-separated ``%s`` markers for a SQL ``IN (...)`` list."""
    return ",".join(["%s"] * count)


def _dur_bucket(minutes):
    """Return the duration-bucket label for an average lesson time in minutes."""
    if minutes == 0:
        return "0分钟"
    if minutes <= 10:
        return "1-10分钟"
    if minutes <= 20:
        return "11-20分钟"
    if minutes <= 30:
        return "21-30分钟"
    if minutes <= 45:
        return "31-45分钟"
    return "45分钟以上"


def _bucket_summary(df_cat):
    """Aggregate one category's per-lesson rows by average-duration bucket."""
    # assign() returns a new frame, so the filtered slice passed in is never
    # mutated (avoids pandas' SettingWithCopyWarning).
    bucketed = df_cat.assign(时长桶=df_cat["平均耗时(分钟)"].apply(_dur_bucket))
    summary = (
        bucketed.groupby("时长桶")
        .agg(
            课节数=("chapter_id", "count"),
            完成用户数=("完成用户数", "sum"),
            退款用户数=("退款用户数", "sum"),
        )
        .reindex(_DUR_ORDER)
        .fillna(0)
    )
    summary["退款率%"] = (summary["退款用户数"] / summary["完成用户数"] * 100).round(1)
    return summary


def _load_course_structure(cur):
    """Load L1/L2 chapter metadata; return (chapter_info, l1_ids, l2_ids)."""
    cur.execute("SELECT id, course_level, course_season, course_unit, course_lesson FROM bi_level_unit_lesson WHERE course_level IN ('L1','L2')")
    chapter_info = {}
    l1_chs, l2_chs = set(), set()
    for cid, lv, sn, un, ln in cur.fetchall():
        chapter_info[cid] = (lv, sn, un, ln)
        (l1_chs if lv == "L1" else l2_chs).add(cid)
    return chapter_info, l1_chs, l2_chs


def _load_orders(cur, user_ids):
    """Return (account_id, goods_id, trade_no) rows of paid orders for the users."""
    orders = []
    for batch in _chunks(user_ids, _ID_BATCH_SIZE):
        cur.execute(f"""
            SELECT account_id, goods_id, trade_no FROM bi_vala_order
            WHERE account_id IN ({_placeholders(len(batch))}) AND deleted_at IS NULL
              AND pay_success_date IS NOT NULL AND order_status IN (3,4)
        """, batch)
        orders.extend(cur.fetchall())
    return orders


def _load_refunded_trade_nos(cur, trade_nos):
    """Return the subset of *trade_nos* that has a refund row with status=3."""
    refunded = set()
    for batch in _chunks(trade_nos, _TRADE_NO_BATCH_SIZE):
        cur.execute(f"SELECT DISTINCT trade_no FROM bi_refund_order WHERE trade_no IN ({_placeholders(len(batch))}) AND status=3 AND deleted_at IS NULL", batch)
        refunded.update(tn for (tn,) in cur.fetchall())
    return refunded


def _load_characters(cur, user_ids):
    """Return (character_id, account_id) rows for named characters of the users."""
    chars = []
    for batch in _chunks(user_ids, _ID_BATCH_SIZE):
        cur.execute(f"SELECT id, account_id FROM bi_vala_app_character WHERE account_id IN ({_placeholders(len(batch))}) AND nickname IS NOT NULL AND nickname!='' AND deleted_at IS NULL", batch)
        chars.extend(cur.fetchall())
    return chars


def _load_completions(conn, cur, char_ids):
    """Return {character_id: {chapter_id: chapter_unique_id}} for play_status=1 rows."""
    char_chapter_cuid = defaultdict(dict)
    for tbl_idx in range(_SHARD_COUNT):
        table = f"bi_user_chapter_play_record_{tbl_idx}"
        for batch in _chunks(char_ids, _ID_BATCH_SIZE):
            try:
                cur.execute(f"""
                    SELECT user_id, chapter_id, chapter_unique_id
                    FROM {table}
                    WHERE user_id IN ({_placeholders(len(batch))}) AND play_status=1 AND deleted_at IS NULL
                """, batch)
                fetched = cur.fetchall()
            except psycopg2.Error as exc:
                # A failed statement leaves the transaction aborted; without a
                # rollback every later query on this connection fails as well.
                conn.rollback()
                print(f" warn {table}: {exc}")
                continue
            for char_id, ch_id, cuid in fetched:
                # Keep the first chapter_unique_id seen per (character, chapter).
                char_chapter_cuid[char_id].setdefault(ch_id, cuid)
    return char_chapter_cuid


def _load_durations(conn, cur, cuids):
    """Return {chapter_unique_id: summed interval_time} across all component tables."""
    cuid_duration = defaultdict(float)
    for tbl_idx in range(_SHARD_COUNT):
        table = f"bi_user_component_play_record_{tbl_idx}"
        for batch in _chunks(cuids, _ID_BATCH_SIZE):
            try:
                cur.execute(f"""
                    SELECT chapter_unique_id, SUM(COALESCE(interval_time,0))
                    FROM {table} WHERE chapter_unique_id IN ({_placeholders(len(batch))}) AND deleted_at IS NULL
                    GROUP BY chapter_unique_id
                """, batch)
                fetched = cur.fetchall()
            except psycopg2.Error as exc:
                # Same recovery as for the chapter tables: report, roll back,
                # carry on with the next batch instead of failing silently.
                conn.rollback()
                print(f" warn {table}: {exc}")
                continue
            for cuid, total in fetched:
                # SUM() can come back as Decimal, which cannot be divided by a
                # float later on; normalise to float here.
                cuid_duration[cuid] += float(total or 0)
    return cuid_duration


def main():
    """Build the per-lesson completion-time vs. refund-rate report.

    Reads user ids from the ``用户ID`` column of the Excel file named by the
    first command-line argument (default ``output/销售线索_用户分析.xlsx``),
    loads orders, refunds, characters, chapter completions and component play
    time from the database, prints per-category tables to stdout and writes
    ``老狼退款_每课分析.xlsx`` to ``../output`` relative to this script.

    Durations are divided by 60000 to obtain minutes, i.e. ``interval_time``
    is treated as milliseconds.
    """
    import statistics  # function-scope import keeps the file's import header untouched

    input_file = sys.argv[1] if len(sys.argv) > 1 else "output/销售线索_用户分析.xlsx"
    df_in = pd.read_excel(input_file, dtype=str)
    user_ids = sorted({int(x) for x in df_in["用户ID"].dropna().unique()})
    print(f"用户: {len(user_ids)}")
    if not user_ids:
        # Nothing to analyse, and an empty IN () list would be invalid SQL.
        return

    # -- 0-4. Load everything from the database --
    conn = get_conn()
    try:
        cur = conn.cursor()
        chapter_info, l1_chs, l2_chs = _load_course_structure(cur)
        orders = _load_orders(cur, user_ids)
        refund_set = _load_refunded_trade_nos(cur, [tn for _, _, tn in orders if tn])
        chars = _load_characters(cur, user_ids)
        print(f" 角色: {len(chars)}")
        char_chapter_cuid = _load_completions(conn, cur, [char_id for char_id, _ in chars])
        all_cuids = list({
            cuid
            for chapter_map in char_chapter_cuid.values()
            for cuid in chapter_map.values()
            if cuid
        })
        cuid_duration = _load_durations(conn, cur, all_cuids)
    finally:
        # Always release the connection, including when a query raises.
        conn.close()

    # -- Per-user purchase and refund flags --
    user_goods = defaultdict(set)
    refunded_users = set()
    for aid, gid, tn in orders:
        user_goods[aid].add(gid)
        if tn in refund_set:
            refunded_users.add(aid)

    account_chars = defaultdict(list)
    for char_id, aid in chars:
        account_chars[aid].append(char_id)

    # -- 5. Aggregate by (category, level) and chapter --
    # Each entry keeps the user id next to its duration so refunded users'
    # durations can be selected reliably later.
    # (cat_label, level) -> chapter_id -> [(uid, duration_minutes), ...]
    completions = defaultdict(lambda: defaultdict(list))
    for uid in user_ids:
        for cat_label, watch_level in classify_orders(user_goods.get(uid, set())):
            target_chs = l1_chs if watch_level == "L1" else l2_chs
            for char_id in account_chars.get(uid, []):
                for ch_id, cuid in char_chapter_cuid.get(char_id, {}).items():
                    if ch_id not in target_chs:
                        continue
                    dur_ms = cuid_duration.get(cuid, 0) if cuid else 0
                    completions[(cat_label, watch_level)][ch_id].append((uid, dur_ms / 60000.0))

    # -- 6. Build the per-lesson rows --
    rows = []
    for (cat_label, watch_level), ch_map in sorted(completions.items()):
        per_chapter = [
            (ch_id, entries, {uid for uid, _ in entries})
            for ch_id, entries in ch_map.items()
        ]
        # Most-completed lessons first.
        per_chapter.sort(key=lambda item: len(item[2]), reverse=True)
        for ch_id, entries, users in per_chapter:
            info = chapter_info.get(ch_id, ("?", "?", "?", "?"))
            durations = [minutes for _, minutes in entries]
            refund_durations = [minutes for uid, minutes in entries if uid in refunded_users]
            total_users = len(users)
            refund_users = len(users & refunded_users)
            rows.append({
                "购买分类": cat_label,
                "课程等级": watch_level,
                "chapter_id": ch_id,
                "Season": info[1],
                "Unit": info[2],
                "Lesson": info[3],
                "完成用户数": total_users,
                "退款用户数": refund_users,
                "退款率%": round(refund_users / total_users * 100, 1) if total_users else 0,
                "平均耗时(分钟)": round(sum(durations) / len(durations), 1) if durations else 0,
                "中位耗时(分钟)": round(statistics.median(durations), 1) if durations else 0,
                "退款用户平均耗时": (
                    round(sum(refund_durations) / len(refund_durations), 1)
                    if refund_durations else 0
                ),
            })
    df = pd.DataFrame(rows, columns=_DETAIL_COLUMNS)

    # -- 7. Console output: per-lesson tables --
    for cat_label in _REPORT_CATEGORIES:
        df_cat = df[df["购买分类"] == cat_label]
        if df_cat.empty:
            continue
        print(f"\n{'='*80}")
        # Opening bracket added: the header previously printed an unmatched "】".
        print(f"【{cat_label}】 共 {len(df_cat)} 节课有完成记录")
        print(f"{'='*80}")
        print(f"{'Unit':>6} {'Lesson':>6} {'完成人数':>6} {'退款人数':>6} {'退款率':>7} {'平均耗时':>9} {'中位耗时':>9} {'退款均耗时':>9}")
        print("-"*70)
        for _, r in df_cat.iterrows():
            # str() keeps a NULL Unit/Lesson printable; the spaces between the
            # last three columns stop wide values from running together.
            print(
                f"{str(r['Unit']):>6} {str(r['Lesson']):>6} "
                f"{int(r['完成用户数']):>6} {int(r['退款用户数']):>6} "
                f"{r['退款率%']:>6.1f}% {r['平均耗时(分钟)']:>8.1f} "
                f"{r['中位耗时(分钟)']:>8.1f} {r['退款用户平均耗时']:>8.1f}"
            )

    # -- Console output: aggregation by duration bucket --
    for cat_label in _REPORT_CATEGORIES:
        df_cat = df[df["购买分类"] == cat_label]
        if df_cat.empty:
            continue
        print(f"\n{'='*80}")
        print(f"【{cat_label}】按每节课平均耗时聚合")
        print(f"{'='*80}")
        ds = _bucket_summary(df_cat)
        print(ds[ds["课节数"] > 0].to_string())

    # -- Excel output --
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "老狼退款_每课分析.xlsx")
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        df.to_excel(writer, sheet_name="每课明细", index=False)
        for cat_label in _REPORT_CATEGORIES:
            df_cat = df[df["购买分类"] == cat_label]
            if df_cat.empty:
                continue
            _bucket_summary(df_cat).to_excel(writer, sheet_name=f"{cat_label[:25]}_按时长聚合")
    print(f"\n{output_path}")
if __name__ == "__main__":
    # Run the report only when the file is executed as a script.
    main()