# ai_member_xiaoxi/scripts/last_study_lesson_distribution.py

#!/usr/bin/env python3
"""
无完课激活课程中，最后停在 U00/U01 的，按 Lesson(L01~L05) 分布
"""

import psycopg2
from collections import defaultdict, Counter

# Connection settings for the BI PostgreSQL instance, passed as keyword
# arguments to psycopg2.connect().
#
# SECURITY: the password is deliberately not stored in this file. With no
# 'password' key, libpq (which psycopg2 wraps) takes the password from the
# PGPASSWORD environment variable or from the ~/.pgpass password file.
# The password that was previously committed here should be treated as leaked
# and rotated.
PG_CONFIG = {
    'host': 'bj-postgres-16pob4sg.sql.tencentcdb.com',
    'port': 28591,
    'user': 'ai_member',
    'dbname': 'vala_bi',
}

# Observation window for the "completed a chapter recently" query. The query
# uses it as a half-open interval: created_at >= WINDOW_START and
# created_at < WINDOW_END.
WINDOW_START = '2026-05-09'
WINDOW_END = '2026-05-23'

# One connection and one cursor are opened here and reused by every query
# that follows in this script.
conn = psycopg2.connect(**PG_CONFIG)
cur = conn.cursor()

# ===== 1. Activated courses =====
# An "activated course" is a distinct (character_id, season_package_level)
# pair from tickets with status = 1 and deleted_at IS NULL, restricted to the
# A1 / A2 packages.
print("1. 查询激活课程...")
cur.execute("""
    SELECT DISTINCT character_id, season_package_level
    FROM bi_vala_seasonal_ticket
    WHERE status = 1 AND deleted_at IS NULL
      AND season_package_level IN ('A1', 'A2')
      AND character_id IS NOT NULL
""")
activated = {
    (character_id, package_level)
    for character_id, package_level in cur.fetchall()
}
print(f"   总激活课程: {len(activated)}")

# ===== 2. chapter -> (level, unit, lesson) =====
print("2. 查询课程结构...")
cur.execute("""
    SELECT id, course_level, course_unit, course_lesson
    FROM bi_level_unit_lesson
    WHERE course_level IN ('L1', 'L2')
""")
# Lookup table: bi_level_unit_lesson.id -> (course_level, course_unit,
# course_lesson). Only L1 / L2 rows are loaded, so an id missing from this
# dict is either unknown or belongs to another level.
ch_info = {
    chapter_id: (level, unit, lesson)
    for chapter_id, level, unit, lesson in cur.fetchall()
}

# ===== 3. Find studied in window =====
print("3. 查询近14天完课...")

# Ticket package level -> course level label used by bi_level_unit_lesson.
level_map = {'A1': 'L1', 'A2': 'L2'}

# (character_id, level_label) pairs that have at least one play record with
# play_status = 1 inside [WINDOW_START, WINDOW_END).
studied = set()

# Play records are read from bi_user_chapter_play_record_0 .. _7; each id is
# looked up in the table whose suffix equals id % 8.
all_chars = {cid for cid, _ in activated}
mod_buckets = defaultdict(set)
for cid in all_chars:
    mod_buckets[cid % 8].add(cid)

for mod_val in range(8):
    if mod_val not in mod_buckets:
        continue
    table = f"bi_user_chapter_play_record_{mod_val}"
    uid_list = list(mod_buckets[mod_val])
    # Send the ids in slices of 500 so the ANY(...) array stays bounded.
    for start in range(0, len(uid_list), 500):
        batch = uid_list[start:start + 500]
        cur.execute(f"""
            SELECT DISTINCT pr.user_id, cl.course_level
            FROM {table} pr
            JOIN bi_level_unit_lesson cl ON pr.chapter_id = cl.id
            WHERE pr.user_id = ANY(%s) AND pr.play_status = 1
              AND pr.created_at >= %s AND pr.created_at < %s
              AND cl.course_level IN ('L1','L2')
        """, (batch, WINDOW_START, WINDOW_END))
        studied.update(
            (user_id, course_level)
            for user_id, course_level in cur.fetchall()
        )

# ===== 4. No-study courses =====
# Keep every activated course whose (character, level label) pair did not show
# up in `studied`. Each entry is (character_id, package_level, level_label).
no_study = []
for cid, db_level in activated:
    label = level_map[db_level]
    if (cid, label) in studied:
        continue
    no_study.append((cid, db_level, label))
print(f"   无完课课程: {len(no_study)}")

# ===== 5. Last chapter per (character, level) =====
print("4. 查询最后一次完课 chapter...")

# Distinct character ids among the no-study courses, grouped by id % 8 so each
# group is queried against the play-record table with the matching suffix.
no_study_chars = list({cid for cid, _, _ in no_study})
mod_buckets2 = defaultdict(list)
for cid in no_study_chars:
    mod_buckets2[cid % 8].append(cid)

# (character_id, level_label) -> chapter_id of the most recent play record
# with play_status = 1, over all time (no window filter here).
last_chapter = {}

for mod_val in range(8):
    uids = mod_buckets2.get(mod_val)
    if not uids:
        continue
    table = f"bi_user_chapter_play_record_{mod_val}"
    for start in range(0, len(uids), 500):
        batch = uids[start:start + 500]
        # DISTINCT ON + ORDER BY ... created_at DESC keeps the newest row per
        # (user_id, course_level).
        # NOTE(review): rows sharing the same created_at are not tie-broken,
        # so which chapter wins in that case is up to the database.
        cur.execute(f"""
            SELECT DISTINCT ON (pr.user_id, cl.course_level)
                pr.user_id, cl.course_level, pr.chapter_id
            FROM {table} pr
            JOIN bi_level_unit_lesson cl ON pr.chapter_id = cl.id
            WHERE pr.user_id = ANY(%s) AND pr.play_status = 1
              AND cl.course_level IN ('L1','L2')
            ORDER BY pr.user_id, cl.course_level, pr.created_at DESC
        """, (batch,))
        last_chapter.update(
            ((user_id, course_level), chapter_id)
            for user_id, course_level, chapter_id in cur.fetchall()
        )

print(f"   有历史记录的组合数: {len(last_chapter)}")

# ===== 6. Stats: U00/U01 by lesson =====
# Every query has finished by this point, so release the cursor and the
# connection first; they are then closed even if the reporting below raises.
cur.close()
conn.close()

print("\n===== 结果 =====\n")

REPORT_LEVELS = ('L1', 'L2')
REPORT_UNITS = ('U00', 'U01')

# stats[level][unit] is a Counter: lesson -> number of no-study courses whose
# last completed chapter is that lesson.
stats = {
    level: {unit: Counter() for unit in REPORT_UNITS}
    for level in REPORT_LEVELS
}
# no_record[level][unit]: courses included in the expected total that could
# not be attributed to a lesson by this run.
no_record = {
    level: {unit: 0 for unit in REPORT_UNITS}
    for level in REPORT_LEVELS
}

# Expected totals per (level, unit), copied from an earlier, separate query.
# They are a fixed snapshot, so the live counts computed below may drift away
# from them.
targets = {
    ('L1', 'U00'): 149,
    ('L1', 'U01'): 94,
    ('L2', 'U00'): 351,
    ('L2', 'U01'): 242,
}

# Tally each no-study course under the lesson of its last completed chapter.
# Courses with no play record, or whose chapter is not in ch_info, have no
# known unit/lesson and are skipped.
for cid, db_level, label in no_study:
    ch_id = last_chapter.get((cid, label))
    if ch_id is None or ch_id not in ch_info:
        continue
    cl, unit, lesson = ch_info[ch_id]
    if unit in REPORT_UNITS:
        stats[cl][unit][lesson] += 1

# Compute the remainder once per (level, unit) and print the table from it.
for label in REPORT_LEVELS:
    for unit in REPORT_UNITS:
        target = targets[(label, unit)]
        counted = sum(stats[label][unit].values())
        if counted > target:
            # The snapshot is stale: without this guard the report would show
            # a negative "no record" count and percentages over a denominator
            # smaller than the data actually counted.
            print(f"[警告] {label} {unit}: 实际统计 {counted} 门超过预期 {target} 门，"
                  f"预期值可能已过期，改用实际统计作为总数")
        total = max(target, counted)
        no_rec = total - counted
        no_record[label][unit] = no_rec

        print(f"【{label} {unit}】 共 {total} 门（有lesson数据 {counted} + 无记录 {no_rec}）")
        print(f"  {'Lesson':<8} {'课程数':<8} {'占比':<10}")
        # total can only be 0 when there is nothing to print, so the divisions
        # below never see a zero denominator.
        for les in sorted(stats[label][unit]):
            cnt = stats[label][unit][les]
            pct = cnt / total * 100
            print(f"  {les:<8} {cnt:<8} {pct:>6.1f}%")
        if no_rec > 0:
            print(f"  {'无记录':<8} {no_rec:<8} {no_rec/total*100:>6.1f}%")
        print()