diff --git a/scripts/challenge_click_analysis.py b/scripts/challenge_click_analysis.py
new file mode 100644
index 0000000..61e32e3
--- /dev/null
+++ b/scripts/challenge_click_analysis.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+"""Count distinct users who clicked the homepage_challenge_click entry, split by A1/A2."""
+
+import json
+import os
+import urllib.request
+import base64
+import ssl
+import psycopg2
+
+ES_HOST = "es-7vd7jcu9.public.tencentelasticsearch.com"
+ES_PORT = 9200
+ES_USER = "elastic"
+# Secrets are read from the environment so they are never committed to the repo.
+ES_PASS = os.environ["ES_PASS"]
+
+PG_HOST = "bj-postgres-16pob4sg.sql.tencentcdb.com"
+PG_PORT = 28591
+PG_USER = "ai_member"
+PG_PASS = os.environ["PG_PASS"]
+PG_DB = "vala_bi"
+
+# Reporting window in epoch seconds; identical to the sibling daily/funnel scripts.
+START_TS = 1779465600  # 2026-05-23 00:00:00 CST
+END_TS = 1782057599    # 2026-06-21 23:59:59 CST
+
+# NOTE(review): TLS certificate and hostname checks are disabled here; confirm
+# whether the cluster certificate can be verified instead.
+ctx = ssl.create_default_context()
+ctx.check_hostname = False
+ctx.verify_mode = ssl.CERT_NONE
+
+def es_query(body):
+    """POST ``body`` as JSON to the index's _search endpoint and return the decoded reply."""
+    url = f"https://{ES_HOST}:{ES_PORT}/user_behavior_buried_points/_search"
+    auth = base64.b64encode(f"{ES_USER}:{ES_PASS}".encode()).decode()
+    req = urllib.request.Request(url, data=json.dumps(body).encode(), headers={
+        "Content-Type": "application/json",
+        "Authorization": f"Basic {auth}"
+    })
+    # The context manager closes the HTTP response even if decoding fails.
+    with urllib.request.urlopen(req, context=ctx) as resp:
+        return json.loads(resp.read())
+
+# Step 1: collect every accountId that clicked the unit-challenge entry.
+print("Step 1: 从 ES 获取所有 accountId...")
+all_account_ids = set()
+after_key = None
+page = 0
+while True:
+    page += 1
+    body = {
+        "size": 0,
+        "query": {
+            "bool": {
+                "must": [
+                    {"term": {"buryingPointId": 500}},
+                    {"term": {"buryingPointSubId": 14}},
+                    {"range": {"activeTime": {"gte": START_TS, "lte": END_TS}}}
+                ]
+            }
+        },
+        "aggs": {
+            "users": {
+                "composite": {
+                    "size": 10000,
+                    "sources": [{"accountId": {"terms": {"field": "accountId"}}}]
+                }
+            }
+        }
+    }
+    # Composite aggregations page via the previous response's after_key.
+    if after_key:
+        body["aggs"]["users"]["composite"]["after"] = after_key
+
+    result = es_query(body)
+    users_agg = result.get("aggregations", {}).get("users", {})
+    buckets = users_agg.get("buckets", [])
+    # Non-positive accountIds are skipped.
+    all_account_ids.update(
+        b["key"]["accountId"] for b in buckets if b["key"]["accountId"] > 0
+    )
+
+    after_key = users_agg.get("after_key")
+    if not after_key or not buckets:
+        break
+    if page % 10 == 0:
+        print(f" page {page}, collected {len(all_account_ids)} users so far...")
+
+print(f" 共 {len(all_account_ids)} 个唯一 accountId(已排除 accountId=0)")
+
+if not all_account_ids:
+    print(" 没有数据,退出")
+    # ``exit`` is a site-module convenience; SystemExit works in every interpreter mode.
+    raise SystemExit(0)
+
+# Step 2: look up course_level for those users in PostgreSQL.
+print("\nStep 2: 从 PostgreSQL 获取用户课程等级...")
+batch_size = 500  # keeps each IN (...) list bounded
+account_list = list(all_account_ids)
+user_levels = {}  # accountId -> set of course_levels
+
+conn = psycopg2.connect(host=PG_HOST, port=PG_PORT, user=PG_USER, password=PG_PASS, dbname=PG_DB)
+try:
+    cur = conn.cursor()
+    for i in range(0, len(account_list), batch_size):
+        batch = account_list[i:i+batch_size]
+        # Only the "%s" placeholders are interpolated; values are bound by the driver.
+        placeholders = ",".join(["%s"] * len(batch))
+        cur.execute(f"""
+            SELECT DISTINCT account_id, course_level
+            FROM bi_user_course_detail
+            WHERE account_id IN ({placeholders})
+            AND deleted_at IS NULL
+        """, batch)
+        for aid, level in cur.fetchall():
+            user_levels.setdefault(aid, set()).add(level)
+    cur.close()
+finally:
+    # Release the connection even when a query raises.
+    conn.close()
+
+# Step 3: classify each user by the course levels found.
+print("\nStep 3: 统计结果...")
+a1_users = set()
+a2_users = set()
+both_users = set()
+unknown_users = set()
+
+for aid in all_account_ids:
+    levels = user_levels.get(aid, set())
+    is_a1 = "L1" in levels or "A1" in levels
+    is_a2 = "L2" in levels or "A2" in levels
+
+    if is_a1 and is_a2:
+        both_users.add(aid)
+    elif is_a1:
+        a1_users.add(aid)
+    elif is_a2:
+        a2_users.add(aid)
+    else:
+        unknown_users.add(aid)
+
+print(f"\n{'='*50}")
+print(f"单元挑战入口点击用户数统计")
+print(f"统计周期: 2026-05-23 ~ 2026-06-21")
+print(f"{'='*50}")
+print(f"A1 (L1) 用户数: {len(a1_users)}")
+print(f"A2 (L2) 用户数: {len(a2_users)}")
+print(f"同时有 A1+A2 课程: {len(both_users)}")
+print(f"未匹配到课程等级: {len(unknown_users)}")
+print(f"{'='*50}")
+print(f"总用户数: {len(all_account_ids)}")
+print(f"A1 合计 (含双课程): {len(a1_users) + len(both_users)}")
+print(f"A2 合计 (含双课程): {len(a2_users) + len(both_users)}")
+
+if unknown_users:
+    print(f"\n未匹配用户 accountId: {sorted(unknown_users)[:20]}...")
diff --git a/scripts/challenge_click_daily.py b/scripts/challenge_click_daily.py
new file mode 100644
index 0000000..6d55ab3
--- /dev/null
+++ b/scripts/challenge_click_daily.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""Daily active users for the unit-challenge entry click (2026-05-23 ~ 2026-06-21), split by A1/A2."""
+
+import json, os, urllib.request, base64, ssl
+from collections import defaultdict
+from datetime import datetime, timezone, timedelta
+
+ES_HOST = "es-7vd7jcu9.public.tencentelasticsearch.com"
+ES_PORT = 9200
+ES_USER = "elastic"
+# The secret is read from the environment so it is never committed to the repo.
+ES_PASS = os.environ["ES_PASS"]
+
+START_TS = 1779465600 # 2026-05-23 00:00:00 CST
+END_TS = 1782057599 # 2026-06-21 23:59:59 CST
+CST = timezone(timedelta(hours=8))
+
+# NOTE(review): TLS certificate and hostname checks are disabled here; confirm
+# whether the cluster certificate can be verified instead.
+ctx = ssl.create_default_context()
+ctx.check_hostname = False
+ctx.verify_mode = ssl.CERT_NONE
+
+def es_query(body):
+    """POST ``body`` as JSON to the index's _search endpoint and return the decoded reply."""
+    url = f"https://{ES_HOST}:{ES_PORT}/user_behavior_buried_points/_search"
+    auth = base64.b64encode(f"{ES_USER}:{ES_PASS}".encode()).decode()
+    req = urllib.request.Request(url, data=json.dumps(body).encode(), headers={
+        "Content-Type": "application/json",
+        "Authorization": f"Basic {auth}"
+    })
+    # The context manager closes the HTTP response even if decoding fails.
+    with urllib.request.urlopen(req, context=ctx) as resp:
+        return json.loads(resp.read())
+
+# Step 1: fetch the matching events in a single request.
+print("Step 1: 从 ES 获取数据...")
+body = {
+    "size": 10000,
+    # Ask for the exact hit count so truncation below can be detected.
+    "track_total_hits": True,
+    "query": {
+        "bool": {
+            "must": [
+                {"term": {"buryingPointId": 1300}},
+                {"term": {"buryingPointSubId": 19}},
+                {"range": {"activeTime": {"gte": START_TS, "lte": END_TS}}}
+            ]
+        }
+    },
+    "_source": ["accountId", "activeTime", "courseLevel"]
+}
+
+result = es_query(body)
+hits = result.get("hits", {}).get("hits", [])
+total = result.get("hits", {}).get("total", {}).get("value", 0)
+print(f" 共 {total} 条记录, 返回 {len(hits)} 条")
+if total > len(hits):
+    # Only one page of "size" hits is read; anything beyond it is not counted.
+    print(f" 警告: 结果被截断,仅返回 {len(hits)}/{total} 条,以下统计不完整")
+
+# Aggregate distinct users per day and per course level.
+daily_a1 = defaultdict(set)
+daily_a2 = defaultdict(set)
+all_a1 = set()
+all_a2 = set()
+
+for hit in hits:
+    src = hit["_source"]
+    aid = src.get("accountId", 0)
+    ts = src.get("activeTime", 0)
+    level = src.get("courseLevel", "")
+    if aid <= 0 or ts <= 0:
+        continue
+    # Bucket by calendar day in CST (UTC+8).
+    date_str = datetime.fromtimestamp(ts, tz=CST).strftime("%Y-%m-%d")
+    if level == "A1":
+        daily_a1[date_str].add(aid)
+        all_a1.add(aid)
+    elif level == "A2":
+        daily_a2[date_str].add(aid)
+        all_a2.add(aid)
+
+# Union of every date seen for either level.
+all_dates = sorted(set(daily_a1) | set(daily_a2))
+
+print(f"\n{'日期':<12} {'A1日活':>6} {'A2日活':>6} {'总计':>6}")
+print("-" * 36)
+total_a1_daily = 0
+total_a2_daily = 0
+for d in all_dates:
+    a1 = len(daily_a1.get(d, set()))
+    a2 = len(daily_a2.get(d, set()))
+    total_a1_daily += a1
+    total_a2_daily += a2
+    print(f"{d:<12} {a1:>6} {a2:>6} {a1+a2:>6}")
+
+print("-" * 36)
+print(f"{'合计':<12} {total_a1_daily:>6} {total_a2_daily:>6} {total_a1_daily+total_a2_daily:>6}")
+
+print(f"\n{'='*36}")
+print(f"总去重用户数: {len(all_a1 | all_a2)}")
+print(f" A1: {len(all_a1)}")
+print(f" A2: {len(all_a2)}")
+print(f"注:日活为每日累加值(非去重),同一用户多天活跃会重复计数")
diff --git a/scripts/challenge_funnel.py b/scripts/challenge_funnel.py
new file mode 100644
index 0000000..abb44f8
--- /dev/null
+++ b/scripts/challenge_funnel.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+"""Unit-challenge funnel: cross-match events by user instead of relying on ex3 to tell paths apart."""
+
+import json, os, urllib.request, base64, ssl
+
+ES_HOST = "es-7vd7jcu9.public.tencentelasticsearch.com"
+ES_PORT = 9200
+ES_USER = "elastic"
+# The secret is read from the environment so it is never committed to the repo.
+ES_PASS = os.environ["ES_PASS"]
+START_TS = 1779465600  # 2026-05-23 00:00:00 CST
+END_TS = 1782057599  # 2026-06-21 23:59:59 CST
+
+# NOTE(review): TLS certificate and hostname checks are disabled here; confirm
+# whether the cluster certificate can be verified instead.
+ctx = ssl.create_default_context()
+ctx.check_hostname = False
+ctx.verify_mode = ssl.CERT_NONE
+
+def es_query(body):
+    """POST ``body`` as JSON to the index's _search endpoint and return the decoded reply."""
+    url = f"https://{ES_HOST}:{ES_PORT}/user_behavior_buried_points/_search"
+    auth = base64.b64encode(f"{ES_USER}:{ES_PASS}".encode()).decode()
+    req = urllib.request.Request(url, data=json.dumps(body).encode(), headers={
+        "Content-Type": "application/json",
+        "Authorization": f"Basic {auth}"
+    })
+    # The context manager closes the HTTP response even if decoding fails.
+    with urllib.request.urlopen(req, context=ctx) as resp:
+        return json.loads(resp.read())
+
+def fetch_users(sub_id):
+    """Return ``(a1, a2)``: accountId sets for event 1300/``sub_id``, split by courseLevel.
+
+    Only one page of up to 10000 hits is read; a warning is printed when more exist.
+    """
+    body = {
+        "size": 10000,
+        # Ask for the exact hit count so truncation can be detected.
+        "track_total_hits": True,
+        "query": {
+            "bool": {
+                "must": [
+                    {"term": {"buryingPointId": 1300}},
+                    {"term": {"buryingPointSubId": sub_id}},
+                    {"range": {"activeTime": {"gte": START_TS, "lte": END_TS}}}
+                ]
+            }
+        },
+        "_source": ["accountId", "courseLevel"]
+    }
+    result = es_query(body)
+    hits = result.get("hits", {}).get("hits", [])
+    total = result.get("hits", {}).get("total", {}).get("value", 0)
+    if total > len(hits):
+        print(f"警告: subId={sub_id} 结果被截断,仅返回 {len(hits)}/{total} 条,漏斗数据不完整")
+    a1 = set()
+    a2 = set()
+    for hit in hits:
+        src = hit["_source"]
+        aid = src.get("accountId", 0)
+        level = src.get("courseLevel", "")
+        if aid <= 0:
+            continue
+        if level == "A1":
+            a1.add(aid)
+        elif level == "A2":
+            a2.add(aid)
+    return a1, a2
+
+# 1. Start challenge (subId=25)
+start_a1, start_a2 = fetch_users(25)
+# 2. Retry challenge (subId=26)
+restart_a1, restart_a2 = fetch_users(26)
+# 3. First-question exposure (subId=32) - all users
+first_a1, first_a2 = fetch_users(32)
+# 4. Settlement-page exposure (subId=27) - all users
+settle_a1, settle_a2 = fetch_users(27)
+
+# Funnel by per-user cross-matching. Each later step is intersected with the
+# entry step only (not chained), so the diagnostics at the end can spot users
+# who reached settlement without a first-question event.
+start_first_a1 = start_a1 & first_a1
+start_settle_a1 = start_a1 & settle_a1
+start_first_a2 = start_a2 & first_a2
+start_settle_a2 = start_a2 & settle_a2
+
+# Same construction for the retry-challenge funnel.
+restart_first_a1 = restart_a1 & first_a1
+restart_settle_a1 = restart_a1 & settle_a1
+restart_first_a2 = restart_a2 & first_a2
+restart_settle_a2 = restart_a2 & settle_a2
+
+def pct(part, base):
+    """Format part/base as a percentage with one decimal, or "N/A" when base is 0."""
+    if base == 0:
+        return "N/A"
+    return f"{part/base*100:.1f}%"
+
+print("单元挑战漏斗 | 2026-05-23 ~ 2026-06-21")
+print("方法:按用户交叉匹配,不依赖ex3区分路径")
+print("=" * 60)
+
+print(f"\n📌 A1 开始挑战漏斗:")
+print(f" 开始挑战: {len(start_a1)}人")
+print(f" → 第一题曝光: {len(start_first_a1)}人 ({pct(len(start_first_a1), len(start_a1))})")
+print(f" → 结算页曝光: {len(start_settle_a1)}人 ({pct(len(start_settle_a1), len(start_a1))})")
+
+print(f"\n📌 A1 再次挑战漏斗:")
+print(f" 再次挑战: {len(restart_a1)}人")
+print(f" → 第一题曝光: {len(restart_first_a1)}人 ({pct(len(restart_first_a1), len(restart_a1))})")
+print(f" → 结算页曝光: {len(restart_settle_a1)}人 ({pct(len(restart_settle_a1), len(restart_a1))})")
+
+print(f"\n📌 A2 开始挑战漏斗:")
+print(f" 开始挑战: {len(start_a2)}人")
+print(f" → 第一题曝光: {len(start_first_a2)}人 ({pct(len(start_first_a2), len(start_a2))})")
+print(f" → 结算页曝光: {len(start_settle_a2)}人 ({pct(len(start_settle_a2), len(start_a2))})")
+
+print(f"\n📌 A2 再次挑战漏斗:")
+print(f" 再次挑战: {len(restart_a2)}人")
+print(f" → 第一题曝光: {len(restart_first_a2)}人 ({pct(len(restart_first_a2), len(restart_a2))})")
+print(f" → 结算页曝光: {len(restart_settle_a2)}人 ({pct(len(restart_settle_a2), len(restart_a2))})")
+
+# Diagnostics: users seen on the settlement page but not on the first question.
+print(f"\n--- 诊断 ---")
+settle_only_a1 = start_settle_a1 - start_first_a1
+settle_only_a2 = start_settle_a2 - start_first_a2
+print(f"A1 开始挑战→结算页有但第一题没有: {len(settle_only_a1)}人")
+print(f"A2 开始挑战→结算页有但第一题没有: {len(settle_only_a2)}人")