ai_member_xiaoxi/scripts/last_completion_unit_dist.py
2026-05-23 08:00:01 +08:00

261 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
对于最近14天无完课行为的付费用户统计他们「最后一次完课」所在的 Unit 分布(按 L1/L2 拆分)。
时间范围2026-05-09 ~ 2026-05-22
"""
import psycopg2
import psycopg2.extras
from collections import defaultdict
# Database connection settings.
# Every value can be overridden through an environment variable so that
# credentials do not have to live in the source tree; the literals are kept
# only as backward-compatible fallbacks for existing deployments.
# NOTE(review): the fallback password is committed to the repository — rotate
# it and remove the fallback once the environment variables are in place.
import os

conn = psycopg2.connect(
    host=os.environ.get("VALA_BI_DB_HOST", "bj-postgres-16pob4sg.sql.tencentcdb.com"),
    port=int(os.environ.get("VALA_BI_DB_PORT", "28591")),
    user=os.environ.get("VALA_BI_DB_USER", "ai_member"),
    password=os.environ.get("VALA_BI_DB_PASSWORD", "LdfjdjL83h3h3^$&**YGG*"),
    dbname=os.environ.get("VALA_BI_DB_NAME", "vala_bi"),
)
# Autocommit: each statement runs on its own instead of inside one
# long-lived transaction for the whole report.
conn.set_session(autocommit=True)
# DictCursor rows can be read both by position and by column name.
cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

# Inclusive date window used to decide whether a user is "recently active".
START_DATE = "2026-05-09"
END_DATE = "2026-05-22"

print("=" * 70)
print("📊 无完课用户最后一次完课 Unit 分布分析")
print("=" * 70)
# ═══════════════════════════════════════════════════════════
# Step 1: collect every paying account
# ═══════════════════════════════════════════════════════════
# An account counts as paying when it has at least one non-deleted order with
# order_status 3 or 4 and a pay_success_date, and the account row has
# status = 1 (presumably "active" — confirm against the account schema).
print("\n[1/6] 获取付费用户...")
cur.execute(
    """
SELECT DISTINCT o.account_id
FROM bi_vala_order o
INNER JOIN bi_vala_app_account a ON o.account_id = a.id AND a.status = 1
WHERE o.order_status IN (3, 4)
AND o.pay_success_date IS NOT NULL
AND o.deleted_at IS NULL
"""
)
paid_accounts = set()
for record in cur.fetchall():
    paid_accounts.add(record[0])
print(f" 付费用户总数: {len(paid_accounts)}")
# ═══════════════════════════════════════════════════════════
# Step 2: classify accounts as L1-only / L2-only / L1+L2 / other
# ═══════════════════════════════════════════════════════════
print("\n[2/6] 分类用户等级...")
# goods_id values that identify each product family.
L1_GOODS = {57, 60, 63}
L2_GOODS = {31, 32, 33, 54}
L1L2_GOODS = {61}

cur.execute("""
SELECT o.account_id, o.goods_id
FROM bi_vala_order o
WHERE o.account_id = ANY(%s)
AND o.order_status IN (3, 4)
AND o.deleted_at IS NULL
""", (list(paid_accounts),))

# account_id → set of goods_id the account has ordered.
user_goods = defaultdict(set)
for account_id, goods_id in cur.fetchall():
    user_goods[account_id].add(goods_id)

only_l1 = set()
only_l2 = set()
both_l1l2 = set()
other = set()
for account in paid_accounts:
    goods = user_goods.get(account, set())
    has_l1 = not goods.isdisjoint(L1_GOODS)
    has_l2 = not goods.isdisjoint(L2_GOODS)
    has_bundle = not goods.isdisjoint(L1L2_GOODS)
    # The bundle product, or owning one product of each family, both count
    # as "L1+L2"; each account lands in exactly one bucket.
    if has_bundle or (has_l1 and has_l2):
        bucket = both_l1l2
    elif has_l1:
        bucket = only_l1
    elif has_l2:
        bucket = only_l2
    else:
        bucket = other
    bucket.add(account)
# ═══════════════════════════════════════════════════════════
# Step 3: find paying accounts with a completion in the last 14 days
# ═══════════════════════════════════════════════════════════
print("\n[3/6] 查询最近14天完课行为...")
cur.execute("SELECT id, account_id FROM bi_vala_app_character WHERE account_id = ANY(%s) AND deleted_at IS NULL",
            (list(paid_accounts),))

# Two-way index between characters and the accounts that own them.
char_to_account = {}
account_chars = defaultdict(list)
for char_id, owner in cur.fetchall():
    char_to_account[char_id] = owner
    account_chars[owner].append(char_id)
all_char_ids = set(char_to_account)

# The play records are spread over 8 tables (suffix 0..7).  Rows with
# play_status = 1 are treated here as completions; user_id in those tables is
# matched against character ids.  The window covers START_DATE through the
# end of END_DATE.
active_char_ids = set()
for shard in range(8):
    cur.execute(f"""
SELECT DISTINCT user_id FROM bi_user_chapter_play_record_{shard}
WHERE play_status = 1 AND updated_at >= %s AND updated_at < %s::date + interval '1 day'
""", (START_DATE, END_DATE))
    # Keep only ids that belong to characters of paying accounts.
    active_char_ids |= all_char_ids.intersection(rec[0] for rec in cur.fetchall())

active_accounts = {char_to_account[c] for c in active_char_ids}
inactive_accounts = paid_accounts - active_accounts
print(f" 无完课付费用户: {len(inactive_accounts)}")
# ═══════════════════════════════════════════════════════════
# Step 4: collect every character owned by an inactive account
# ═══════════════════════════════════════════════════════════
print("\n[4/6] 获取无完课用户的角色...")
# character id → owning account id, restricted to inactive accounts.
inactive_chars = {}
for account in inactive_accounts:
    # .get() avoids inserting empty entries into the defaultdict for
    # accounts that have no characters.
    inactive_chars.update(dict.fromkeys(account_chars.get(account, []), account))
print(f" 无完课用户角色数: {len(inactive_chars)}")
# ═══════════════════════════════════════════════════════════
# Step 5: find each character's most recent completion (scan all 8 shards)
# ═══════════════════════════════════════════════════════════
print("\n[5/6] 查询各角色最后一次完课记录...")
# character id → (chapter_id, updated_at) of the newest completion seen so far.
latest_completion = {}
for shard in range(8):
    table = f"bi_user_chapter_play_record_{shard}"
    # DISTINCT ON keeps the first row per user_id under the ORDER BY, i.e. the
    # row with the greatest updated_at.
    # NOTE(review): PostgreSQL sorts NULLs first under DESC, so a completed row
    # with a NULL updated_at would be picked as the "latest" — confirm the
    # column is NOT NULL.
    cur.execute(f"""
SELECT DISTINCT ON (user_id) user_id, chapter_id, updated_at
FROM {table}
WHERE play_status = 1
ORDER BY user_id, updated_at DESC
""")
    rows = cur.fetchall()
    print(f" {table}: {len(rows)} 条记录")
    for char_id, chapter_id, finished_at in rows:
        # The query is unfiltered; discard characters we are not reporting on.
        if char_id in inactive_chars:
            known = latest_completion.get(char_id)
            # Keep the newest record in case a character appears in more
            # than one shard.
            if known is None or finished_at > known[1]:
                latest_completion[char_id] = (chapter_id, finished_at)

print(f" 有历史完课记录的角色数: {len(latest_completion)}")
no_history = len(inactive_chars) - len(latest_completion)
print(f" 无任何完课记录的角色数: {no_history}")
# ═══════════════════════════════════════════════════════════
# Step 6: map chapter_id → Unit and aggregate per user level type
# ═══════════════════════════════════════════════════════════
print("\n[6/6] 映射 chapter_id → Unit 并聚合...")
# Course structure lookup: chapter_id → (course_level, course_unit).
cur.execute("SELECT id, course_level, course_unit FROM bi_level_unit_lesson")
chapter_map = {rec[0]: (rec[1], rec[2]) for rec in cur.fetchall()}

# user level type → {unit → number of characters}.
# Level types: "仅L1", "仅L2", "L1+L2", and "其他" for everything else.
unit_counts = defaultdict(lambda: defaultdict(int))
# Characters that have a completion whose chapter_id is missing from chapter_map.
no_chapter = defaultdict(int)

# Checked in order; the first set containing the account decides the label.
level_buckets = (("仅L1", only_l1), ("仅L2", only_l2), ("L1+L2", both_l1l2))
for char_id, (chapter_id, _finished_at) in latest_completion.items():
    owner = inactive_chars[char_id]
    user_type = next(
        (label for label, members in level_buckets if owner in members),
        "其他",
    )
    mapped = chapter_map.get(chapter_id)
    if mapped is None:
        no_chapter[user_type] += 1
    else:
        # Only the unit is aggregated; the course level is not used here.
        unit_counts[user_type][mapped[1]] += 1
# ═══════════════════════════════════════════════════════════
# Print results
# ═══════════════════════════════════════════════════════════
# Blank line, rule, title, rule — emitted with a single print call.
print("\n" + "=" * 70, "📈 统计结果", "=" * 70, sep="\n")
# Sort order for Unit labels
def unit_sort_key(u):
    """Return a sortable ``(s, u)`` tuple for a Unit label.

    Labels look like ``"U00"``, ``"U01"`` or ``"S1U00"``: an optional
    ``S<digits>`` prefix followed by ``U<digits>``.  The first tuple element
    is the ``S`` number (0 when the prefix is absent) and the second is the
    ``U`` number.

    Anything that cannot be parsed — ``None``, a non-string value, or a
    string that does not start with the pattern — maps to ``(99, 99)`` so it
    sorts after every regular label instead of raising.
    """
    import re

    unparsable = (99, 99)
    # Guard non-strings (None, ints, ...): re.match would raise TypeError.
    if not isinstance(u, str):
        return unparsable
    found = re.match(r'(?:S(\d+))?U(\d+)', u)
    if found is None:
        return unparsable
    s_part, u_part = found.groups()
    return (int(s_part) if s_part else 0, int(u_part))
# Per level type: print the distribution of last-completion Units as a
# text bar chart, ordered by Unit.
for user_type in ["仅L1", "仅L2", "L1+L2"]:
    data = unit_counts[user_type]
    total_with_history = sum(data.values())
    # Separator glyph restored: multiplying an empty string printed nothing.
    print(f"\n{'─' * 50}")
    print(f" {user_type} 用户")
    print(f" 最后一次完课 Unit 分布(共 {total_with_history} 人有完课记录):")
    print(f"{'─' * 50}")
    # Order rows by Unit, unparsable labels last.
    sorted_units = sorted(data.items(), key=lambda x: unit_sort_key(x[0]))
    # Largest bucket, computed once; bars are scaled so it spans 30 cells.
    peak = max(data.values(), default=0)
    for unit, cnt in sorted_units:
        # Every non-empty bucket gets at least one cell so it stays visible.
        bar = "█" * max(1, int(cnt / max(1, peak) * 30))
        # str() keeps a None unit from crashing the ">10s" format spec.
        print(f" {str(unit):>10s} {cnt:>5d} {bar}")
    if no_chapter.get(user_type, 0) > 0:
        print(f" {'(未知)':>10s} {no_chapter[user_type]:>5d} (chapter_id 映射失败)")
# ─── Overall summary ───
# Separator glyph restored: multiplying an empty string printed nothing.
print(f"\n{'─' * 50}")
print("📋 汇总")
print(f"{'─' * 50}")
total_inactive = len(inactive_accounts)

# Level type label → set of accounts classified under it.
type_members = {"仅L1": only_l1, "仅L2": only_l2, "L1+L2": both_l1l2}
for user_type, members in type_members.items():
    # Inactive accounts of this level type.
    type_accounts = inactive_accounts & members
    type_users = len(type_accounts)

    # Split the accounts by whether any of their characters has a completion.
    accounts_with_history = set()
    accounts_no_history = set()
    for c, a in inactive_chars.items():
        if a not in type_accounts:
            continue
        if c in latest_completion:
            accounts_with_history.add(a)
        else:
            accounts_no_history.add(a)
    # An account may own characters with and without history; it counts as
    # "no history" only when none of its characters has a completion.
    # NOTE(review): accounts that own no characters at all fall into neither
    # bucket, so the two figures below may not add up to type_users.
    no_history_accounts = accounts_no_history - accounts_with_history

    print(f" {user_type}: 共 {type_users}")
    print(f" 有完课记录: {len(accounts_with_history)}")
    print(f" 完全无完课: {len(no_history_accounts)}")

    # Five most common last-completion Units for this level type.
    data = unit_counts[user_type]
    if data:
        top = sorted(data.items(), key=lambda x: -x[1])[:5]
        top_str = ", ".join(f"{u}({c})" for u, c in top)
        print(f" Top5 Unit: {top_str}")

print(f"\n 合计无完课付费用户: {total_inactive}")
print(f" 注:用户数 {total_inactive} 为上轮口径,本次统计以角色维度查最后完课记录")

# Release database resources.
cur.close()
conn.close()
print("\n✅ 完成")