ai_member_xiaoxi/scripts/laolang_lvxing_analysis.py
2026-06-02 08:00:01 +08:00

348 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
老狼履约明细 — 行课分析(用订单号匹配)
数据源李丹发的5.23日老狼履约明细.xlsx含订单号列
规则:
- 用订单号直接查 account_id避免手机号脱敏碰撞
- 购买 L1+L2 联报 → 看 L1 行课
- 购买 L2 课包 → 看 L2 行课
- 含角色年龄、完课时长、加微状态
"""
import os, sys
import psycopg2
import pandas as pd
from collections import defaultdict
from datetime import date
# Connection settings for the PostgreSQL server; consumed by get_conn().
DB_HOST = "bj-postgres-16pob4sg.sql.tencentcdb.com"
DB_PORT = 28591
DB_USER = "ai_member"
# Default database name used by get_conn() when no other database is requested.
DB_NAME = "vala_bi"
def get_password(secrets_path=None):
    """Return the database password.

    Lookup order:

    1. the ``PG_ONLINE_PASSWORD`` environment variable, if it is non-empty;
    2. the first non-empty ``PG_ONLINE_PASSWORD=...`` line of the secrets file.

    Args:
        secrets_path: Optional path of the secrets file. When omitted, the
            file ``secrets.env`` in the parent directory of this script is
            used.

    Returns:
        The password, with any surrounding single/double quotes removed.

    Raises:
        RuntimeError: if neither source provides a non-empty password.
    """
    pw = os.environ.get("PG_ONLINE_PASSWORD", "")
    if pw:
        return pw

    if secrets_path is None:
        secrets_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "..", "secrets.env"
        )

    if os.path.exists(secrets_path):
        # Read as UTF-8 explicitly so the result does not depend on the locale.
        with open(secrets_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line.startswith("PG_ONLINE_PASSWORD="):
                    continue
                value = line.split("=", 1)[1].strip().strip("'\"")
                # An empty assignment is treated like a missing one, the same
                # way an empty environment variable is ignored above.
                if value:
                    return value

    raise RuntimeError("PG_ONLINE_PASSWORD not found")
def get_conn(db=DB_NAME):
    """Open a new psycopg2 connection to database ``db`` on the configured server.

    The password is resolved through get_password() on every call; the
    connect timeout is 120 seconds.
    """
    connect_kwargs = {
        "host": DB_HOST,
        "port": DB_PORT,
        "user": DB_USER,
        "password": get_password(),
        "dbname": db,
        "connect_timeout": 120,
    }
    return psycopg2.connect(**connect_kwargs)
def lesson_bucket(n):
    """Return the display bucket label for a completed-lesson count ``n``.

    Exactly 0 gets its own bucket; otherwise the first bucket whose upper
    bound is >= ``n`` is chosen, and anything above 60 falls into the last
    bucket.
    """
    if n == 0:
        return "0课时"
    upper_bounds = (
        (3, "1-3课时"),
        (7, "4-7课时"),
        (15, "8-15课时"),
        (30, "16-30课时"),
        (60, "31-60课时"),
    )
    for upper, label in upper_bounds:
        if n <= upper:
            return label
    return "60课时以上"
def time_bucket(m):
    """Return the display bucket label for a duration of ``m`` minutes.

    Exactly 0 gets its own bucket; otherwise the first bucket whose upper
    bound is >= ``m`` is chosen, and anything above 300 falls into the last
    bucket.
    """
    if m == 0:
        return "0分钟"
    upper_bounds = (
        (10, "1-10分钟"),
        (30, "11-30分钟"),
        (60, "31-60分钟"),
        (120, "61-120分钟"),
        (300, "121-300分钟"),
    )
    for upper, label in upper_bounds:
        if m <= upper:
            return label
    return "300分钟以上"
# Display order of the labels produced by lesson_bucket(); main() reindexes
# its per-bucket summaries with this list.
BUCKET_ORDER = ["0课时","1-3课时","4-7课时","8-15课时","16-30课时","31-60课时","60课时以上"]
# Display order of the labels produced by time_bucket(); used the same way.
TIME_BUCKET_ORDER = ["0分钟","1-10分钟","11-30分钟","31-60分钟","61-120分钟","121-300分钟","300分钟以上"]
# goods_id groups. Judging by the names these are the L1-only packages, the
# L2-only packages and the L1+L2 bundle — TODO confirm against the goods table.
L1_GOODS = {57, 60, 63}
L2_GOODS = {31, 32, 33, 54}
L1L2_GOODS = {61}


def classify_orders(goods_ids):
    """Classify one user's purchased goods into report categories.

    Args:
        goods_ids: iterable of goods_id values bought by the user.

    Returns:
        A list of ``(category_label, level_to_watch)`` tuples:

        - bundle bought           -> ``("L1+L2联报→看L1", "L1")``
        - L2 bought, no bundle    -> ``("仅L2→看L2", "L2")``
        - L2 bought and bundle    -> additionally ``("联报+仅L2→看L2", "L2")``

        The list is empty when neither a bundle nor an L2 package was bought.
    """
    bought = set(goods_ids)
    has_bundle = bool(bought & L1L2_GOODS)
    has_l2 = bool(bought & L2_GOODS)

    categories = []
    if has_bundle:
        categories.append(("L1+L2联报→看L1", "L1"))
    if has_l2:
        if has_bundle:
            categories.append(("联报+仅L2→看L2", "L2"))
        else:
            categories.append(("仅L2→看L2", "L2"))
    return categories
# Markers written to the yes/no columns. They MUST be two different strings:
# the refund counters and filters below compare against _YES, and with
# identical markers every user would be counted as refunded.
_YES = "是"
_NO = "否"

# Purchase categories that get their own console report and Excel sheets.
_REPORT_CATEGORIES = ["L1+L2联报→看L1", "仅L2→看L2"]

# The play-record tables are queried with suffixes _0 ... _7.
_SHARD_COUNT = 8


def _fetch_batched(cur, sql, values, batch_size, warn_label=None):
    """Run ``sql`` once per batch of ``values`` and return all fetched rows.

    ``sql`` must contain a ``{ph}`` marker where the ``%s`` placeholder list
    of its ``IN (...)`` clause goes. With empty ``values`` no query is run,
    which avoids generating the invalid SQL ``IN ()``.

    When ``warn_label`` is given, a failing batch is reported as a warning
    and skipped (best effort); otherwise the exception propagates.
    """
    rows = []
    for start in range(0, len(values), batch_size):
        batch = values[start:start + batch_size]
        query = sql.format(ph=",".join(["%s"] * len(batch)))
        if warn_label is None:
            cur.execute(query, batch)
            rows.extend(cur.fetchall())
            continue
        try:
            cur.execute(query, batch)
            rows.extend(cur.fetchall())
        except Exception as e:
            print(f" warn {warn_label}: {e}")
    return rows


def _age_on(bday, today):
    """Return the age in full years on ``today``; None if ``bday`` is empty or unparseable."""
    if not bday:
        return None
    try:
        born = pd.Timestamp(str(bday)[:10]).date()
    except (ValueError, TypeError, OverflowError):
        return None
    # Subtract one year when this year's birthday has not happened yet.
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))


def _load_learning_data(cur, trade_nos):
    """Query orders, refunds, characters, completed chapters and durations for ``trade_nos``."""
    orders = _fetch_batched(cur, """
        SELECT o.account_id, o.trade_no, o.goods_id, o.goods_name, o.pay_success_date,
               o.pay_amount_int, o.order_status, o.key_from
        FROM bi_vala_order o
        INNER JOIN bi_vala_app_account a ON o.account_id = a.id AND a.status = 1
        WHERE o.trade_no IN ({ph}) AND o.deleted_at IS NULL
    """, trade_nos, 500)
    print(f" 订单: {len(orders)}")

    # Trade numbers that have a refund row with status=3.
    refund_rows = _fetch_batched(
        cur,
        "SELECT DISTINCT trade_no FROM bi_refund_order "
        "WHERE trade_no IN ({ph}) AND status=3 AND deleted_at IS NULL",
        trade_nos, 500,
    )
    refund_set = {tn for (tn,) in refund_rows}

    user_ids = sorted({o[0] for o in orders})
    print(f" 用户: {len(user_ids)}")

    # Course structure: which lesson ids belong to L1 and which to L2.
    cur.execute("SELECT id, course_level FROM bi_level_unit_lesson WHERE course_level IN ('L1','L2')")
    l1_chs, l2_chs = set(), set()
    for lesson_id, level in cur.fetchall():
        (l1_chs if level == "L1" else l2_chs).add(lesson_id)

    # Characters (with birthday) of the matched accounts.
    chars = _fetch_batched(cur, """
        SELECT id, account_id, nickname, birthday FROM bi_vala_app_character
        WHERE account_id IN ({ph}) AND nickname IS NOT NULL AND nickname!='' AND deleted_at IS NULL
    """, user_ids, 2000)
    char_ids = [c[0] for c in chars]
    account_chars = defaultdict(list)
    char_info = {}
    today = date.today()
    for cid, aid, nick, bday in chars:
        account_chars[aid].append(cid)
        char_info[cid] = (nick, str(bday)[:10] if bday else '', _age_on(bday, today))
    print(f" 角色: {len(chars)}")

    # Completed chapters. The record tables' user_id column is matched against
    # character ids. Each (character, chapter) pair is counted once; the first
    # chapter_unique_id seen for it is the one whose duration is summed later.
    char_chapter_cuid = defaultdict(dict)
    char_lessons_l1 = defaultdict(int)
    char_lessons_l2 = defaultdict(int)
    all_cuids = set()
    for shard in range(_SHARD_COUNT):
        table = f"bi_user_chapter_play_record_{shard}"
        sql = (
            f"SELECT user_id, chapter_id, chapter_unique_id FROM {table} "
            "WHERE user_id IN ({ph}) AND play_status=1 AND deleted_at IS NULL"
        )
        for uid, ch_id, cuid in _fetch_batched(cur, sql, char_ids, 2000, warn_label=table):
            seen = char_chapter_cuid[uid]
            if ch_id in seen:
                continue
            seen[ch_id] = cuid
            all_cuids.add(cuid)
            if ch_id in l1_chs:
                char_lessons_l1[uid] += 1
            elif ch_id in l2_chs:
                char_lessons_l2[uid] += 1

    # Total interval_time per chapter_unique_id, summed over all shards.
    cuid_duration = defaultdict(int)
    cuid_list = list(all_cuids)
    for shard in range(_SHARD_COUNT):
        table = f"bi_user_component_play_record_{shard}"
        sql = (
            f"SELECT chapter_unique_id, SUM(COALESCE(interval_time,0)) FROM {table} "
            "WHERE chapter_unique_id IN ({ph}) AND deleted_at IS NULL "
            "GROUP BY chapter_unique_id"
        )
        for cuid, ms in _fetch_batched(cur, sql, cuid_list, 2000, warn_label=table):
            if ms:
                # SUM() may come back as Decimal, which cannot be divided by
                # a float later on; normalise to int here.
                cuid_duration[cuid] += int(ms)

    account_info = dict(_fetch_batched(
        cur,
        "SELECT id, created_at FROM bi_vala_app_account WHERE id IN ({ph}) AND status=1",
        user_ids, 2000,
    ))

    return {
        "orders": orders,
        "refund_set": refund_set,
        "user_ids": user_ids,
        "l1_chs": l1_chs,
        "l2_chs": l2_chs,
        "account_chars": account_chars,
        "char_info": char_info,
        "char_chapter_cuid": char_chapter_cuid,
        "char_lessons_l1": char_lessons_l1,
        "char_lessons_l2": char_lessons_l2,
        "cuid_duration": cuid_duration,
        "account_info": account_info,
    }


def _load_wechat_bound(user_ids):
    """Return the account ids found in vala_class.student_info (best effort; empty set on failure)."""
    if not user_ids:
        return set()
    try:
        conn_class = get_conn("vala_class")
        try:
            rows = _fetch_batched(
                conn_class.cursor(),
                "SELECT DISTINCT vala_account_id FROM student_info WHERE vala_account_id IN ({ph})",
                user_ids, 2000,
            )
        finally:
            conn_class.close()
        wechat_bound = {r[0] for r in rows}
        print(f" 加微: {len(wechat_bound)}/{len(user_ids)}")
        return wechat_bound
    except Exception as e:
        print(f" 加微跳过: {e}")
        return set()


def _build_rows(data, wechat_bound):
    """Build one output row per (user, purchase category) from the loaded data."""
    l1_chs, l2_chs = data["l1_chs"], data["l2_chs"]
    refund_set = data["refund_set"]
    cuid_duration = data["cuid_duration"]

    user_orders_map = defaultdict(list)
    for aid, tn, gid, gn, pay_date, amt, status, key_from in data["orders"]:
        user_orders_map[aid].append({
            "goods_id": gid, "goods_name": gn, "trade_no": tn,
            # pay_amount_int is divided by 100 to get the amount; a NULL
            # amount counts as 0 instead of crashing.
            "pay_date": pay_date, "amount": float(amt or 0) / 100.0, "order_status": status,
            "key_from": key_from, "is_refunded": tn in refund_set,
        })

    # Duration (ms) per character, split by course level.
    char_dur_l1 = defaultdict(int)
    char_dur_l2 = defaultdict(int)
    for cid, chapter_map in data["char_chapter_cuid"].items():
        for ch_id, cuid in chapter_map.items():
            dur = cuid_duration.get(cuid, 0)
            if ch_id in l1_chs:
                char_dur_l1[cid] += dur
            elif ch_id in l2_chs:
                char_dur_l2[cid] += dur

    rows = []
    for aid in data["user_ids"]:
        my_orders = user_orders_map.get(aid, [])
        my_chars = data["account_chars"].get(aid, [])
        totals = {
            "L1": (sum(data["char_lessons_l1"].get(c, 0) for c in my_chars),
                   sum(char_dur_l1.get(c, 0) for c in my_chars)),
            "L2": (sum(data["char_lessons_l2"].get(c, 0) for c in my_chars),
                   sum(char_dur_l2.get(c, 0) for c in my_chars)),
        }
        total_orders = len(my_orders)
        refunded_orders = sum(1 for o in my_orders if o["is_refunded"])
        total_gmv = sum(o["amount"] for o in my_orders)
        total_refund = sum(o["amount"] for o in my_orders if o["is_refunded"])
        all_refunded = total_orders > 0 and refunded_orders == total_orders
        has_refund = refunded_orders > 0

        char_details = []
        for cid in my_chars:
            nick, _bday_str, age = data["char_info"].get(cid, ('?', '', None))
            age_str = f'{age}' if age is not None else '?'
            char_details.append(f'{nick}({age_str})')

        for cat_label, watch_level in classify_orders([o["goods_id"] for o in my_orders]):
            lessons, dur_ms = totals["L1" if watch_level == "L1" else "L2"]
            dur_min = round(dur_ms / 60000.0, 1) if dur_ms else 0.0
            avg_min = round(dur_min / lessons, 1) if lessons > 0 else 0.0
            rows.append({
                "用户ID": aid,
                "注册时间": data["account_info"].get(aid),
                "购买分类": cat_label,
                "行课等级": watch_level,
                "角色数": len(my_chars),
                "角色信息(名称+年龄)": "; ".join(char_details),
                "完成课时数": lessons,
                "总完课时长(分钟)": dur_min,
                "平均每课时长(分钟)": avg_min,
                "订单数": total_orders,
                "退款订单数": refunded_orders,
                "GMV": round(total_gmv, 2),
                "GSV": round(total_gmv - total_refund, 2),
                "是否退款": _YES if has_refund else _NO,
                "是否全部退款": _YES if all_refunded else _NO,
                "是否加微": _YES if aid in wechat_bound else _NO,
                "购买课包": ";".join(o["goods_name"] for o in my_orders),
            })
    return rows


def _bucket_summary(df_cat, bucket_col, bucket_order, with_median):
    """Aggregate users, refunds and averages per bucket, in ``bucket_order``, dropping empty buckets."""
    aggs = {
        "用户数": ("用户ID", "count"),
        "退款用户数": ("是否退款", lambda x: (x == _YES).sum()),
        "平均课时": ("完成课时数", "mean"),
        "平均完课时长": ("总完课时长(分钟)", "mean"),
    }
    if with_median:
        aggs["中位完课时长"] = ("总完课时长(分钟)", "median")
    summary = df_cat.groupby(bucket_col).agg(**aggs).reindex(bucket_order).fillna(0)
    summary["退款率%"] = (summary["退款用户数"] / summary["用户数"] * 100).round(1)
    for col in ("平均课时", "平均完课时长", "中位完课时长"):
        if col in summary.columns:
            summary[col] = summary[col].round(1)
    return summary[summary["用户数"] > 0]


def _print_report(df, n_users, n_orders):
    """Print the per-category bucket tables, refund summary and per-user detail lines."""
    print(f"\n{'='*70}")
    print(f"老狼履约明细 — 行课分析({n_users} 用户, {n_orders} 订单)")
    print(f"{'='*70}")
    for cat_label in _REPORT_CATEGORIES:
        df_cat = df[df["购买分类"] == cat_label]
        if len(df_cat) == 0:
            continue
        print(f"\n{'='*60}")
        # Separator added: without it "…看L1" + "12" printed as "…看L112".
        print(f"{cat_label}: {len(df_cat)} 个用户-分类记录")
        print(f"{'='*60}")
        print("\n ┌─ 按完成课时数分桶 ─────────────────────")
        print(_bucket_summary(df_cat, "课时桶", BUCKET_ORDER, with_median=True).to_string())
        print("\n ┌─ 按完课时长分桶 ─────────────────────")
        print(_bucket_summary(df_cat, "时长桶", TIME_BUCKET_ORDER, with_median=False).to_string())

        is_refund = df_cat["是否退款"] == _YES
        refund_users = df_cat[is_refund]
        no_refund = df_cat[~is_refund]
        print(f"\n ┌─ 汇总 ──────────────────────────────")
        print(f" 退款用户: {len(refund_users)}, 平均课时: {refund_users['完成课时数'].mean():.1f}, 平均总时长: {refund_users['总完课时长(分钟)'].mean():.1f}min")
        print(f" 未退款用户: {len(no_refund)}, 平均课时: {no_refund['完成课时数'].mean():.1f}, 平均总时长: {no_refund['总完课时长(分钟)'].mean():.1f}min")
        print(f" 退费率: {len(refund_users)/len(df_cat)*100:.1f}%")
        print(f"\n ┌─ 明细 ─────────────────────────────")
        for _, r in df_cat.sort_values(["总完课时长(分钟)"]).iterrows():
            print(f" UID={r['用户ID']} | {r['角色信息(名称+年龄)'][:30]} | {r['完成课时数']}课时 | {r['总完课时长(分钟)']}min | 加微={r['是否加微']} | {'退款' if r['是否退款']==_YES else '未退款'}")


def _export_excel(df):
    """Write the detail sheet and per-category bucket sheets; return the workbook path."""
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "老狼履约_行课分析.xlsx")
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        df_out = df.drop(columns=["课时桶", "时长桶"], errors="ignore").copy()
        # Drop timezone info from datetime columns before writing them to Excel.
        for col in df_out.columns:
            if pd.api.types.is_datetime64_any_dtype(df_out[col]):
                df_out[col] = df_out[col].dt.tz_localize(None)
        df_out.to_excel(writer, sheet_name="明细", index=False)
        for cat_label in _REPORT_CATEGORIES:
            dc = df[df["购买分类"] == cat_label]
            if len(dc) == 0:
                continue
            _bucket_summary(dc, "课时桶", BUCKET_ORDER, with_median=True).to_excel(
                writer, sheet_name=f"{cat_label[:20]}_按课时")
            _bucket_summary(dc, "时长桶", TIME_BUCKET_ORDER, with_median=False).to_excel(
                writer, sheet_name=f"{cat_label[:20]}_按时长")
    return output_path


def main():
    """Run the analysis: read the order workbook, query the databases, print and export the report.

    The workbook path is taken from ``sys.argv[1]`` (a built-in default path is
    used otherwise) and must contain a ``订单号`` column. Results are printed
    to stdout and written to ``../output/老狼履约_行课分析.xlsx`` relative to
    this script. Nothing is exported when no order number or no classified
    user is found.
    """
    input_file = sys.argv[1] if len(sys.argv) > 1 else "/root/.openclaw/media/inbound/5.23æ_è_ç_¼å_çº_æ_ç---c2361b75-b1c0-48c4-8302-d68ebb5e05d2.xlsx"
    df_raw = pd.read_excel(input_file, dtype=str)
    print(f"原始文件: {len(df_raw)}")

    # Account ids are resolved directly from the order numbers.
    trade_nos = df_raw["订单号"].dropna().unique().tolist()
    print(f"订单号: {len(trade_nos)}")
    if not trade_nos:
        print(" 没有订单号,无法分析")
        return

    conn = get_conn()
    try:
        # Autocommit keeps one failing (best-effort) shard query from aborting
        # the transaction and thereby breaking every later statement.
        conn.autocommit = True
        data = _load_learning_data(conn.cursor(), trade_nos)
    finally:
        conn.close()

    user_ids = data["user_ids"]
    wechat_bound = _load_wechat_bound(user_ids)

    rows = _build_rows(data, wechat_bound)
    if not rows:
        print(" 没有可分析的记录")
        return

    df = pd.DataFrame(rows)
    df["课时桶"] = df["完成课时数"].apply(lesson_bucket)
    df["时长桶"] = df["总完课时长(分钟)"].apply(time_bucket)

    _print_report(df, len(user_ids), len(data["orders"]))
    output_path = _export_excel(df)
    print(f"\n{output_path}")
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()