# ai_member_xiaoxi/scripts/last_completion_unit_dist.py

#!/usr/bin/env python3
"""
对于最近14天无完课行为的付费用户，统计他们「最后一次完课」所在的 Unit 分布（按 L1/L2 拆分）。
时间范围：2026-05-09 ~ 2026-05-22
"""

import os
from collections import defaultdict

import psycopg2
import psycopg2.extras

# Connection to the BI PostgreSQL instance. Every setting can be overridden
# through a VALA_BI_* environment variable; the literals are only the
# fallbacks this script has always used, so existing invocations keep working.
# NOTE(review): the fallback password is committed in source — rotate it and
# drop the default once VALA_BI_PASSWORD is provisioned where this runs.
conn = psycopg2.connect(
    host=os.environ.get("VALA_BI_HOST", "bj-postgres-16pob4sg.sql.tencentcdb.com"),
    port=int(os.environ.get("VALA_BI_PORT", "28591")),
    user=os.environ.get("VALA_BI_USER", "ai_member"),
    password=os.environ.get("VALA_BI_PASSWORD", "LdfjdjL83h3h3^$&**YGG*"),
    dbname=os.environ.get("VALA_BI_DBNAME", "vala_bi"),
)
# The script only issues SELECTs; autocommit keeps psycopg2 from leaving a
# transaction open for the whole run.
conn.set_session(autocommit=True)
# DictCursor rows can be read by position (as done below) or by column name.
cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

# Analysis window. END_DATE is inclusive: the Step 3 query compares against
# END_DATE::date + 1 day with a strict "<".
START_DATE, END_DATE = "2026-05-09", "2026-05-22"

_banner = "=" * 70
print(_banner)
print("📊 无完课用户最后一次完课 Unit 分布分析")
print(_banner)

# ---------------------------------------------------------------
# Step 1: collect every paying account
# ---------------------------------------------------------------
# An account qualifies when it has an order in status 3 or 4 with a
# pay_success_date and no deleted_at, and the account row has status = 1.
print("\n[1/6] 获取付费用户...")
_paid_sql = """
    SELECT DISTINCT o.account_id
    FROM bi_vala_order o
    INNER JOIN bi_vala_app_account a ON o.account_id = a.id AND a.status = 1
    WHERE o.order_status IN (3, 4)
      AND o.pay_success_date IS NOT NULL
      AND o.deleted_at IS NULL
"""
cur.execute(_paid_sql)
paid_accounts = set()
for account_row in cur.fetchall():
    paid_accounts.add(account_row[0])
paid_total = len(paid_accounts)
print(f"   付费用户总数: {paid_total}")

# ---------------------------------------------------------------
# Step 2: split paying accounts into L1-only / L2-only / L1+L2 / other
# ---------------------------------------------------------------
print("\n[2/6] 分类用户等级...")
# Goods ids that count as an L1 purchase, an L2 purchase, or a combined
# L1+L2 purchase respectively.
L1_GOODS = {57, 60, 63}
L2_GOODS = {31, 32, 33, 54}
L1L2_GOODS = {61}

# NOTE(review): unlike the Step 1 query, this one does not require
# pay_success_date IS NOT NULL — confirm that the difference is intended.
_goods_sql = """
    SELECT o.account_id, o.goods_id
    FROM bi_vala_order o
    WHERE o.account_id = ANY(%s)
      AND o.order_status IN (3, 4)
      AND o.deleted_at IS NULL
"""
cur.execute(_goods_sql, (list(paid_accounts),))

# account_id -> set of goods ids found on that account's orders
user_goods = defaultdict(set)
for account_id, goods_id in cur.fetchall():
    user_goods[account_id].add(goods_id)

only_l1, only_l2, both_l1l2, other = set(), set(), set(), set()
for account_id in paid_accounts:
    # .get() so that accounts with no matching order are not inserted
    # into the defaultdict as a side effect.
    owned = user_goods.get(account_id, set())
    has_l1 = not owned.isdisjoint(L1_GOODS)
    has_l2 = not owned.isdisjoint(L2_GOODS)
    has_combo = not owned.isdisjoint(L1L2_GOODS)

    # The combined goods id, or holding one of each, counts as "both".
    if has_combo or (has_l1 and has_l2):
        bucket = both_l1l2
    elif has_l1:
        bucket = only_l1
    elif has_l2:
        bucket = only_l2
    else:
        bucket = other
    bucket.add(account_id)

# ---------------------------------------------------------------
# Step 3: find paying accounts with a completion inside the window
# ---------------------------------------------------------------
print("\n[3/6] 查询最近14天完课行为...")
cur.execute(
    "SELECT id, account_id FROM bi_vala_app_character WHERE account_id = ANY(%s) AND deleted_at IS NULL",
    (list(paid_accounts),),
)
# Two-way index between characters and their owning account.
char_to_account = {}
account_chars = defaultdict(list)
for char_id, owner_id in cur.fetchall():
    char_to_account[char_id] = owner_id
    account_chars[owner_id].append(char_id)
all_char_ids = set(char_to_account)

# The play-record data is spread over 8 numbered tables; their user_id
# column is matched against character ids here.
active_char_ids = set()
for shard in range(8):
    cur.execute(f"""
        SELECT DISTINCT user_id FROM bi_user_chapter_play_record_{shard}
        WHERE play_status = 1 AND updated_at >= %s AND updated_at < %s::date + interval '1 day'
    """, (START_DATE, END_DATE))
    active_char_ids.update(
        played[0] for played in cur.fetchall() if played[0] in all_char_ids
    )

# An account is active as soon as any one of its characters is.
active_accounts = set()
for char_id in active_char_ids:
    active_accounts.add(char_to_account[char_id])
inactive_accounts = paid_accounts.difference(active_accounts)
inactive_total = len(inactive_accounts)
print(f"   无完课付费用户: {inactive_total}")

# ---------------------------------------------------------------
# Step 4: list every character that belongs to an inactive account
# ---------------------------------------------------------------
print("\n[4/6] 获取无完课用户的角色...")
# char_id -> account_id, restricted to the inactive accounts. Accounts
# with no character rows simply contribute nothing.
inactive_chars = {
    char_id: account_id
    for account_id in inactive_accounts
    for char_id in account_chars.get(account_id, [])
}
print(f"   无完课用户角色数: {len(inactive_chars)}")

# ---------------------------------------------------------------
# Step 5: find each character's most recent completion (all 8 shard tables)
# ---------------------------------------------------------------
print("\n[5/6] 查询各角色最后一次完课记录...")

# char_id -> (chapter_id, updated_at)
latest_completion = {}

for shard in range(8):
    table = f"bi_user_chapter_play_record_{shard}"
    # PostgreSQL sorts NULLs first under DESC, so without NULLS LAST a row
    # with a NULL updated_at would win DISTINCT ON and be reported as the
    # "latest" completion.
    cur.execute(f"""
        SELECT DISTINCT ON (user_id) user_id, chapter_id, updated_at
        FROM {table}
        WHERE play_status = 1
        ORDER BY user_id, updated_at DESC NULLS LAST
    """)
    rows = cur.fetchall()
    print(f"   {table}: {len(rows)} 条记录")
    for cid, ch_id, ts in rows:
        # The query is unfiltered; keep only characters of inactive accounts.
        if cid not in inactive_chars:
            continue
        prev = latest_completion.get(cid)
        # Keep the newest timestamp across shards. A NULL timestamp never
        # replaces an existing entry and is never compared with ">".
        if prev is None or (ts is not None and (prev[1] is None or ts > prev[1])):
            latest_completion[cid] = (ch_id, ts)

print(f"   有历史完课记录的角色数: {len(latest_completion)}")
# Characters of inactive accounts that have no completion record at all.
no_history = len(inactive_chars) - len(latest_completion)
print(f"   无任何完课记录的角色数: {no_history}")

# ---------------------------------------------------------------
# Step 6: map chapter_id -> unit and aggregate per user tier
# ---------------------------------------------------------------
print("\n[6/6] 映射 chapter_id → Unit 并聚合...")

# Course structure lookup: chapter_id -> (course_level, course_unit)
cur.execute("SELECT id, course_level, course_unit FROM bi_level_unit_lesson")
chapter_map = {
    lesson_id: (course_level, course_unit)
    for lesson_id, course_level, course_unit in cur.fetchall()
}

# unit_counts: user tier -> {unit -> number of characters}
# no_chapter:  user tier -> number of characters whose chapter_id has no
#              entry in chapter_map
unit_counts = defaultdict(lambda: defaultdict(int))
no_chapter = defaultdict(int)

# Tier label paired with the account set that defines it; anything that
# matches none of them is labelled "其他".
_tier_sets = (("仅L1", only_l1), ("仅L2", only_l2), ("L1+L2", both_l1l2))

for cid, (ch_id, _completed_at) in latest_completion.items():
    aid = inactive_chars[cid]
    user_type = next(
        (label for label, members in _tier_sets if aid in members),
        "其他",
    )

    mapped = chapter_map.get(ch_id)
    if mapped is None:
        no_chapter[user_type] += 1
        continue
    # Only the unit is used for bucketing; the course level is ignored.
    unit_counts[user_type][mapped[1]] += 1

# ---------------------------------------------------------------
# Report output
# ---------------------------------------------------------------
print("\n" + "=" * 70, "📈 统计结果", "=" * 70, sep="\n")

# Ordering used when listing units in the report.
def unit_sort_key(u):
    """Return a sortable ``(s_number, u_number)`` tuple for a unit label.

    Labels look like ``"U00"``, ``"U01"`` or ``"S1U00"``. The optional
    ``S<n>`` prefix supplies the first element (0 when absent) and the
    ``U<n>`` part supplies the second. ``None`` and labels that do not
    start with this pattern map to ``(99, 99)`` so they sort last.
    """
    import re

    unknown = (99, 99)
    if u is None:
        return unknown

    parsed = re.match(r'(?:S(\d+))?U(\d+)', u)
    if parsed is None:
        return unknown

    s_digits, u_digits = parsed.groups()
    return (int(s_digits) if s_digits else 0, int(u_digits))

# Per-tier histogram of the unit in which each character last completed a
# chapter. Counts are per character, matching how unit_counts was built.
for user_type in ["仅L1", "仅L2", "L1+L2"]:
    data = unit_counts[user_type]
    total_with_history = sum(data.values())
    print(f"\n{'─' * 50}")
    print(f"  {user_type} 用户")
    print(f"  最后一次完课 Unit 分布（共 {total_with_history} 人有完课记录）:")
    print(f"{'─' * 50}")

    # Rows are listed in unit order; the bar length is relative to the
    # largest bucket of this tier, computed once instead of once per row.
    sorted_units = sorted(data.items(), key=lambda x: unit_sort_key(x[0]))
    peak = max(data.values(), default=0)
    for unit, cnt in sorted_units:
        bar = "█" * max(1, int(cnt / max(1, peak) * 30))
        # str() because unit may be None (unit_sort_key allows for it) and
        # None does not accept the ">10s" format spec.
        print(f"  {str(unit):>10s}  {cnt:>5d}  {bar}")

    # Characters whose last completed chapter_id had no chapter_map entry.
    if no_chapter.get(user_type, 0) > 0:
        print(f"  {'(未知)':>10s}  {no_chapter[user_type]:>5d}  (chapter_id 映射失败)")

# --- Overall summary ---
print(f"\n{'═' * 50}")
print("📋 汇总")
print(f"{'═' * 50}")
total_inactive = len(inactive_accounts)
# Tier label -> the account set that defines it.
type_members = {"仅L1": only_l1, "仅L2": only_l2, "L1+L2": both_l1l2}
for user_type in ["仅L1", "仅L2", "L1+L2"]:
    # Inactive accounts that belong to this tier.
    type_accounts = inactive_accounts & type_members[user_type]
    type_users = len(type_accounts)

    # An account counts as "with history" as soon as any one of its
    # characters has a last-completion record.
    accounts_with_history = {
        a for c, a in inactive_chars.items()
        if a in type_accounts and c in latest_completion
    }
    # Everyone else in the tier has no completion at all. Subtracting from
    # the full tier set also covers accounts with zero characters, so the
    # two sub-counts always add up to type_users.
    no_history_accounts = type_accounts - accounts_with_history

    print(f"  {user_type}: 共 {type_users} 人")
    print(f"       有完课记录: {len(accounts_with_history)} 人")
    print(f"       完全无完课: {len(no_history_accounts)} 人")
    # Five most frequent last-completion units (character counts).
    data = unit_counts[user_type]
    if data:
        top = sorted(data.items(), key=lambda x: -x[1])[:5]
        top_str = ", ".join(f"{u}({c})" for u, c in top)
        print(f"       Top5 Unit: {top_str}")

print(f"\n  合计无完课付费用户: {total_inactive}")
print(f"  注：用户数 {total_inactive} 为上轮口径，本次统计以角色维度查最后完课记录")

# Release the database resources.
cur.close()
conn.close()
print("\n✅ 完成")