ai_member_xiaoxi/scripts/last_study_unit_distribution.py
2026-05-23 08:00:01 +08:00

198 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
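Which Unit holds the last lesson-completion record of each of the 1880
activated courses that had no lesson completion in the last 14 days.

- Activated course: a unique (character_id, level) combination.
- Last completion: the latest created_at among that character's records with
  play_status=1 in lessons of the corresponding level.
- Distribution: grouped by bi_level_unit_lesson.course_unit.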
"""
import os
import re
from collections import defaultdict, Counter

import psycopg2
# Connection settings for the vala_bi PostgreSQL database.
# SECURITY: the password used to be hard-coded in this file. It is now read
# from the PGPASSWORD environment variable so the secret is not committed with
# the script. The previously committed password should be rotated.
PG_CONFIG = {
    'host': 'bj-postgres-16pob4sg.sql.tencentcdb.com',
    'port': 28591,
    'user': 'ai_member',
    'dbname': 'vala_bi',
}
_pg_password = os.environ.get('PGPASSWORD')
if _pg_password:
    # Only pass a password when one is configured; otherwise the client
    # library is left to its own fallbacks (e.g. a ~/.pgpass entry).
    PG_CONFIG['password'] = _pg_password

# Half-open analysis window [WINDOW_START, WINDOW_END): the queries below use
# created_at >= WINDOW_START and created_at < WINDOW_END, i.e. the 14 days
# from 2026-05-09 through 2026-05-22.
WINDOW_START = '2026-05-09'
WINDOW_END = '2026-05-23'
# One connection and one cursor are opened here and reused by every query
# in the rest of the script.
conn = psycopg2.connect(**PG_CONFIG)
cur = conn.cursor()

# ===== 1. Get activated courses =====
print("1. 查询被激活的课程...")
cur.execute("""
SELECT DISTINCT character_id, season_package_level
FROM bi_vala_seasonal_ticket
WHERE status = 1 AND deleted_at IS NULL
AND season_package_level IN ('A1', 'A2')
AND character_id IS NOT NULL
""")
# (character_id, db_level) -> True. Only the keys matter; db_level is the
# ticket-side level code ('A1' / 'A2').
activated = {
    (char_id, db_level): True
    for char_id, db_level in cur.fetchall()
}
print(f" 总激活课程数: {len(activated)}")
# ===== 2. Get chapter -> (level, unit, season) mapping =====
print("2. 查询课程结构...")
cur.execute("""
SELECT id, course_level, course_unit, course_season
FROM bi_level_unit_lesson
WHERE course_level IN ('L1', 'L2')
""")
# chapter_id -> (level, unit, season), restricted to L1/L2 lessons by the
# WHERE clause above.
chapter_info = {
    chapter_id: (course_level, course_unit, course_season)
    for chapter_id, course_level, course_unit, course_season in cur.fetchall()
}
print(f" L1/L2 课时数: {len(chapter_info)}")
# ===== 3. Find which courses had study in last 14 days =====
print("3. 查询最近14天有完课的...")
# Ticket-side level code -> lesson-side level label.
level_map = {'A1': 'L1', 'A2': 'L2'}

# Play records live in eight tables, bi_user_chapter_play_record_0..7; the
# table for an id is chosen as id % 8, so ids are bucketed by that remainder.
# NOTE(review): character_id is matched against pr.user_id below - presumably
# the same identifier; confirm against the schema.
all_chars = {char_id for char_id, _ in activated}
mod_buckets = defaultdict(set)
for char_id in all_chars:
    mod_buckets[char_id % 8].add(char_id)

# (character_id, level_label) pairs with at least one completed lesson
# (play_status = 1) inside [WINDOW_START, WINDOW_END).
studied_courses = set()
for mod_val in range(8):
    # An empty bucket yields an empty list, so no query is issued for it.
    uid_list = list(mod_buckets.get(mod_val, ()))
    # Query in slices of 500 ids to keep the ANY(...) array bounded.
    for offset in range(0, len(uid_list), 500):
        batch = uid_list[offset:offset + 500]
        cur.execute(f"""
SELECT DISTINCT pr.user_id, cl.course_level
FROM bi_user_chapter_play_record_{mod_val} pr
JOIN bi_level_unit_lesson cl ON pr.chapter_id = cl.id
WHERE pr.user_id = ANY(%s)
AND pr.play_status = 1
AND pr.created_at >= %s
AND pr.created_at < %s
AND cl.course_level IN ('L1', 'L2')
""", (batch, WINDOW_START, WINDOW_END))
        studied_courses.update((rec[0], rec[1]) for rec in cur.fetchall())
# ===== 4. Identify courses with NO study in last 14 days =====
print("4. 找出无完课的激活课程...")
# (character_id, db_level) -> label_level, for every activated course whose
# (character_id, label_level) pair is absent from studied_courses.
no_study_courses = {
    (char_id, db_level): level_map[db_level]
    for char_id, db_level in activated
    if (char_id, level_map[db_level]) not in studied_courses
}
print(f" 无完课激活课程数: {len(no_study_courses)}")
# ===== 5. For each no-study course, find the LAST study record =====
print("5. 查询最后一次完课记录...")
# Bucket the characters of the no-study courses by id % 8, matching the
# numeric suffix of the play-record table that is queried for them.
no_study_chars = {char_id for char_id, _ in no_study_courses}
mod_buckets_no = defaultdict(set)
for char_id in no_study_chars:
    mod_buckets_no[char_id % 8].add(char_id)

# (character_id, level_label) -> chapter_id of the most recent completed lesson.
last_chapter = {}
for mod_val in range(8):
    # An empty bucket yields an empty list, so no query is issued for it.
    uid_list = list(mod_buckets_no.get(mod_val, ()))
    for offset in range(0, len(uid_list), 500):
        batch = uid_list[offset:offset + 500]
        # DISTINCT ON + ORDER BY created_at DESC keeps, per (user, level),
        # the single row with the latest created_at.
        cur.execute(f"""
SELECT DISTINCT ON (pr.user_id, cl.course_level)
pr.user_id, cl.course_level, pr.chapter_id
FROM bi_user_chapter_play_record_{mod_val} pr
JOIN bi_level_unit_lesson cl ON pr.chapter_id = cl.id
WHERE pr.user_id = ANY(%s)
AND pr.play_status = 1
AND cl.course_level IN ('L1', 'L2')
ORDER BY pr.user_id, cl.course_level, pr.created_at DESC
""", (batch,))
        last_chapter.update(
            ((uid, level), ch_id) for uid, level, ch_id in cur.fetchall()
        )
print(f" 有历史完课记录的 (角色,level) 组合数: {len(last_chapter)}")
# ===== 6. Build distribution =====
print("\n===== 最后一次完课 Unit 分布 =====\n")
# no_history counts courses with no last-completion chapter at all, plus
# those whose chapter id is missing from chapter_info.
no_history = 0
l1_unit_counter = Counter()
l2_unit_counter = Counter()
for (char_id, _db_level), level_label in no_study_courses.items():
    chapter_id = last_chapter.get((char_id, level_label))
    info = chapter_info.get(chapter_id) if chapter_id is not None else None
    if info is None:
        no_history += 1
        continue
    course_level, unit, _season = info
    # Anything that is not 'L1' is tallied under L2, as only L1/L2 are loaded.
    bucket = l1_unit_counter if course_level == 'L1' else l2_unit_counter
    bucket[unit] += 1
# Sort units naturally
def _unit_sort_key(unit):
    """Return a key that orders unit labels in natural (numeric-aware) order.

    Digit runs inside a label compare as integers, so 'U2' sorts before
    'U10'. Plain numbers compare by value, and None sorts after everything
    else. Every key component is a tuple of the same shape, so mixed
    str/int labels never raise TypeError during sorting.
    """
    if unit is None:
        return (1, ())
    if isinstance(unit, (int, float)):
        return (0, ((0, unit, ''),))
    # re.split with one capture group alternates text, digits, text, ...;
    # odd positions are therefore always the digit runs.
    pieces = re.split(r'([0-9]+)', str(unit))
    return (0, tuple(
        (0, int(piece), '') if idx % 2 else (1, 0, piece)
        for idx, piece in enumerate(pieces)
        if piece
    ))


def sort_units(counter):
    """Return the counter's (unit, count) pairs sorted by unit in natural order.

    Args:
        counter: Mapping of unit label -> count (e.g. a collections.Counter).

    Returns:
        A list of (unit, count) tuples. Unlike a plain sorted(), 'U10' comes
        after 'U2', and None or mixed-type labels do not raise.
    """
    return [(unit, counter[unit]) for unit in sorted(counter, key=_unit_sort_key)]
def _print_unit_distribution(title, db_level, unit_counter):
    """Print the last-completion Unit table for one activated level.

    Args:
        title: Heading line printed above the table.
        db_level: Ticket-side level code ('A1' or 'A2'); the no-study
            courses with this level form the percentage denominator.
        unit_counter: Mapping of course_unit -> number of courses whose
            last completed lesson belongs to that unit.
    """
    print(title)
    print(f"{'Unit':<8} {'课程数':<8} {'占比':<10} {'累计占比':<10}")
    total = sum(1 for (_cid, lvl) in no_study_courses if lvl == db_level)
    cum = 0
    for unit, cnt in sort_units(unit_counter):
        pct = cnt / total * 100 if total else 0
        cum += pct
        # str() keeps the alignment spec valid for non-string units
        # (e.g. None), which would otherwise raise TypeError.
        print(f"{str(unit):<8} {cnt:<8} {pct:>6.1f}% {cum:>6.1f}%")
    # Courses of this level that did not land in any unit bucket.
    no_hist = total - sum(unit_counter.values())
    if no_hist > 0:  # implies total > 0, so the division below is safe
        print(f"{'无记录':<8} {no_hist:<8} {no_hist / total * 100:>6.1f}%")


# Overall summary, then one table per level.
print(f"总无完课激活课程: {len(no_study_courses)}")
print(f" 从未有过任何完课记录: {no_history}")
print(f" 有历史完课记录: {len(no_study_courses) - no_history}")
print()
_print_unit_distribution("【L1 激活课程 - 最后一次完课 Unit 分布】", 'A1', l1_unit_counter)
print()
_print_unit_distribution("【L2 激活课程 - 最后一次完课 Unit 分布】", 'A2', l2_unit_counter)
# All queries are done: release the cursor first, then the connection.
cur.close()
conn.close()