# ai_member_xiaoxi/scripts/last_study_unit_distribution.py

#!/usr/bin/env python3
"""
1880个近14天无完课的激活课程，最后一次完课记录分布在哪个Unit
- 激活课程：(character_id, level) 唯一组合
- 最后一次完课：该角色在对应level课程中 play_status=1 的最晚 created_at
- 分布按 bi_level_unit_lesson.course_unit
"""

import os
import re
from collections import defaultdict, Counter

import psycopg2

# Connection settings for the PostgreSQL database queried by this script.
# Each value can be overridden with an environment variable; the literals are
# fallbacks that keep the script running exactly as it did before.
# NOTE(review): the fallback password is a credential committed to source
# control. Rotate it, supply the new one through VALA_PG_PASSWORD, and then
# delete the fallback literal.
PG_CONFIG = {
    'host': os.environ.get('VALA_PG_HOST', 'bj-postgres-16pob4sg.sql.tencentcdb.com'),
    # Environment values are strings, so normalise the port to an int.
    'port': int(os.environ.get('VALA_PG_PORT', '28591')),
    'user': os.environ.get('VALA_PG_USER', 'ai_member'),
    'password': os.environ.get('VALA_PG_PASSWORD', "LdfjdjL83h3h3^$&**YGG*"),
    'dbname': os.environ.get('VALA_PG_DBNAME', 'vala_bi'),
}

# Study window used by the queries below as created_at >= WINDOW_START and
# created_at < WINDOW_END, i.e. a half-open 14-day range.
WINDOW_START = '2026-05-09'
WINDOW_END = '2026-05-23'

# NOTE(review): the connection and cursor are closed only by the last two
# statements of the script, so an exception in between leaves them to be
# cleaned up at interpreter exit.
conn = psycopg2.connect(**PG_CONFIG)
cur = conn.cursor()

# ===== 1. Get activated courses =====
print("1. 查询被激活的课程...")
cur.execute("""
    SELECT DISTINCT character_id, season_package_level
    FROM bi_vala_seasonal_ticket
    WHERE status = 1 AND deleted_at IS NULL
      AND season_package_level IN ('A1', 'A2')
      AND character_id IS NOT NULL
""")
# Keyed by (character_id, season_package_level); the value is only a
# placeholder, the keys are what later steps read.
activated = {
    (character_id, package_level): True
    for character_id, package_level in cur.fetchall()
}

print(f"   总激活课程数: {len(activated)}")

# ===== 2. Get chapter -> (level, unit, season) mapping =====
print("2. 查询课程结构...")
cur.execute("""
    SELECT id, course_level, course_unit, course_season
    FROM bi_level_unit_lesson
    WHERE course_level IN ('L1', 'L2')
""")
# Lesson id -> (course_level, course_unit, course_season) for L1/L2 lessons.
chapter_info = {
    lesson_id: (course_level, course_unit, course_season)
    for lesson_id, course_level, course_unit, course_season in cur.fetchall()
}

print(f"   L1/L2 课时数: {len(chapter_info)}")

# ===== 3. Find which courses had study in last 14 days =====
print("3. 查询最近14天有完课的...")

# Ticket package level -> course level label used by bi_level_unit_lesson.
level_map = {'A1': 'L1', 'A2': 'L2'}

all_chars = {character_id for character_id, _ in activated}

# Play records are read from tables suffixed _0 .. _7. The script picks the
# table by id % 8 and matches the character ids against pr.user_id.
# NOTE(review): assumes table N only holds users with id % 8 == N -- confirm.
mod_buckets = defaultdict(set)
for character_id in all_chars:
    mod_buckets[character_id % 8].add(character_id)

# (user_id, course_level) pairs with at least one completed play
# (play_status = 1) inside [WINDOW_START, WINDOW_END).
studied_courses = set()

for mod_val in range(8):
    shard_ids = list(mod_buckets.get(mod_val, ()))
    # An empty shard yields no batches, so no query is issued for it.
    for start in range(0, len(shard_ids), 500):
        id_batch = shard_ids[start:start + 500]
        cur.execute(f"""
            SELECT DISTINCT pr.user_id, cl.course_level
            FROM bi_user_chapter_play_record_{mod_val} pr
            JOIN bi_level_unit_lesson cl ON pr.chapter_id = cl.id
            WHERE pr.user_id = ANY(%s)
              AND pr.play_status = 1
              AND pr.created_at >= %s
              AND pr.created_at < %s
              AND cl.course_level IN ('L1', 'L2')
        """, (id_batch, WINDOW_START, WINDOW_END))
        studied_courses.update(
            (record[0], record[1]) for record in cur.fetchall()
        )

# ===== 4. Identify courses with NO study in last 14 days =====
print("4. 找出无完课的激活课程...")

# (character_id, package level) -> course level label, restricted to the
# activated courses that have no completed play inside the window.
no_study_courses = {}
for course_key in activated:
    character_id, package_level = course_key
    course_level = level_map[package_level]
    if (character_id, course_level) in studied_courses:
        continue
    no_study_courses[course_key] = course_level

print(f"   无完课激活课程数: {len(no_study_courses)}")

# ===== 5. For each no-study course, find the LAST study record =====
print("5. 查询最后一次完课记录...")

# Bucket the affected character ids by id % 8, which selects the
# play-record table to query.
no_study_chars = {character_id for character_id, _ in no_study_courses}
mod_buckets_no = defaultdict(set)
for character_id in no_study_chars:
    mod_buckets_no[character_id % 8].add(character_id)

# (user_id, course_level) -> chapter_id of the most recent completed play.
# Every L1/L2 level the user has completions in is recorded, not only the
# levels present in no_study_courses.
last_chapter = {}

for mod_val in range(8):
    shard_ids = list(mod_buckets_no.get(mod_val, ()))
    # An empty shard yields no batches, so no query is issued for it.
    for start in range(0, len(shard_ids), 500):
        id_batch = shard_ids[start:start + 500]
        # DISTINCT ON plus ORDER BY created_at DESC keeps, per
        # (user, level), the row with the latest created_at.
        cur.execute(f"""
            SELECT DISTINCT ON (pr.user_id, cl.course_level)
                pr.user_id, cl.course_level, pr.chapter_id
            FROM bi_user_chapter_play_record_{mod_val} pr
            JOIN bi_level_unit_lesson cl ON pr.chapter_id = cl.id
            WHERE pr.user_id = ANY(%s)
              AND pr.play_status = 1
              AND cl.course_level IN ('L1', 'L2')
            ORDER BY pr.user_id, cl.course_level, pr.created_at DESC
        """, (id_batch,))

        last_chapter.update(
            ((user_id, course_level), chapter_id)
            for user_id, course_level, chapter_id in cur.fetchall()
        )

print(f"   有历史完课记录的 (角色,level) 组合数: {len(last_chapter)}")

# ===== 6. Build distribution =====
print("\n===== 最后一次完课 Unit 分布 =====\n")

# no_history counts courses with no last-chapter entry, plus those whose
# chapter id is missing from chapter_info.
no_history = 0
l1_unit_counter = Counter()
l2_unit_counter = Counter()

for (character_id, _package_level), course_level in no_study_courses.items():
    chapter_id = last_chapter.get((character_id, course_level))
    lesson = None if chapter_id is None else chapter_info.get(chapter_id)
    if lesson is None:
        no_history += 1
        continue

    # The counter is chosen by the level stored on the lesson itself.
    lesson_level, lesson_unit, _lesson_season = lesson
    bucket = l1_unit_counter if lesson_level == 'L1' else l2_unit_counter
    bucket[lesson_unit] += 1

def _natural_unit_key(unit):
    """Return a sort key that orders unit labels the way a person reads them.

    Numbers sort first by value, then text labels with their digit runs
    compared numerically (so 'U2' comes before 'U10'), and None sorts last.
    """
    if unit is None:
        return (2, 0, ())
    if isinstance(unit, (int, float)):
        return (0, unit, ())
    # re.split with a capturing group alternates text, digits, text, ...
    # so odd positions are always digit runs. Converting those to int keeps
    # every position type-consistent when two keys are compared.
    pieces = re.split(r'(\d+)', str(unit))
    tokens = tuple(
        int(piece) if index % 2 else piece
        for index, piece in enumerate(pieces)
    )
    return (1, 0, tokens)


def sort_units(counter):
    """Return ``(unit, count)`` pairs from *counter* in natural unit order.

    Plain ``sorted()`` compares strings lexicographically, which puts 'U10'
    before 'U2' and raises TypeError when keys mix types or include None.
    Zero-padded labels and purely numeric keys sort exactly as before.

    Args:
        counter: mapping of unit label -> count (for example a Counter).

    Returns:
        A list of ``(unit, count)`` tuples ordered by unit.
    """
    ordered_units = sorted(counter.keys(), key=_natural_unit_key)
    return [(unit, counter[unit]) for unit in ordered_units]

# Overall totals: all no-study courses, split by whether a last completed
# lesson could be resolved for them.
total_no_study = len(no_study_courses)
with_history = total_no_study - no_history
print(f"总无完课激活课程: {total_no_study}")
print(f"  从未有过任何完课记录: {no_history}")
print(f"  有历史完课记录: {with_history}")
print()

# L1 table: one row per unit with count, share of all no-study A1 courses,
# and running cumulative share; then a row for courses with no resolved unit.
print("【L1 激活课程 - 最后一次完课 Unit 分布】")
print(f"{'Unit':<8} {'课程数':<8} {'占比':<10} {'累计占比':<10}")
total_l1 = sum(1 for _cid, package_level in no_study_courses if package_level == 'A1')
cum = 0
for unit_label, course_count in sort_units(l1_unit_counter):
    if total_l1:
        share = course_count / total_l1 * 100
    else:
        share = 0
    cum += share
    print(f"{unit_label:<8} {course_count:<8} {share:>6.1f}%    {cum:>6.1f}%")
# Courses counted in the A1 total but absent from the unit counter.
no_hist_l1 = total_l1 - sum(l1_unit_counter.values())
if no_hist_l1 > 0:
    print(f"{'无记录':<8} {no_hist_l1:<8} {no_hist_l1/total_l1*100:>6.1f}%")
print()

# L2 table: one row per unit with count, share of all no-study A2 courses,
# and running cumulative share; then a row for courses with no resolved unit.
print("【L2 激活课程 - 最后一次完课 Unit 分布】")
print(f"{'Unit':<8} {'课程数':<8} {'占比':<10} {'累计占比':<10}")
total_l2 = sum(1 for _cid, package_level in no_study_courses if package_level == 'A2')
cum = 0
for unit_label, course_count in sort_units(l2_unit_counter):
    if total_l2:
        share = course_count / total_l2 * 100
    else:
        share = 0
    cum += share
    print(f"{unit_label:<8} {course_count:<8} {share:>6.1f}%    {cum:>6.1f}%")
# Courses counted in the A2 total but absent from the unit counter.
no_hist_l2 = total_l2 - sum(l2_unit_counter.values())
if no_hist_l2 > 0:
    print(f"{'无记录':<8} {no_hist_l2:<8} {no_hist_l2/total_l2*100:>6.1f}%")

# Close the cursor first, then the connection that created it.
for db_handle in (cur, conn):
    db_handle.close()