#!/usr/bin/env python3
"""Find the x-day conversion window that best tracks overall conversion.

Registered accounts are grouped by registration month.  For each candidate
window of ``x`` days, the share of users who paid within ``x`` days of
registering is compared with the month's overall conversion rate, using mean
absolute error and Pearson correlation.  Pure numpy, no scipy dependency.
"""

import math
import os
from collections import defaultdict
from contextlib import closing

import numpy as np

# SECURITY NOTE(review): these credentials were hard-coded in the original
# script.  They remain as fallbacks so existing invocations keep working, but
# the password should be rotated and supplied only through VALA_DB_PASSWORD.
CONN = {
    "host": os.environ.get("VALA_DB_HOST", "bj-postgres-16pob4sg.sql.tencentcdb.com"),
    "port": int(os.environ.get("VALA_DB_PORT", "28591")),
    "user": os.environ.get("VALA_DB_USER", "ai_member"),
    "password": os.environ.get("VALA_DB_PASSWORD", "LdfjdjL83h3h3^$&**YGG*"),
    "dbname": os.environ.get("VALA_DB_NAME", "vala_bi"),
}

# Candidate window lengths (days) that are scored against overall conversion.
X_VALUES = (
    3, 5, 7, 10, 14, 21, 28, 30, 35, 42, 45, 49, 56, 60, 63, 70, 77, 84, 90,
    98, 105, 112, 120, 140, 150, 180, 210, 240, 270, 300, 330, 365,
)

# Width of the horizontal rules printed between report sections.
RULE_WIDTH = 100

# One row per registered account: registration month, registration time,
# first qualifying payment time (NULL if none), a 0/1 converted flag and the
# number of days between registration and first payment.
query = """
WITH registered_users AS (
    SELECT id AS account_id,
           DATE_TRUNC('month', created_at) AS register_month,
           created_at AS register_time
    FROM bi_vala_app_account
    WHERE status = 1
      AND deleted_at IS NULL
      AND created_at >= '2025-09-01'
      AND created_at < '2026-05-01'
),
internal_first_pay AS (
    SELECT o.account_id,
           MIN(o.pay_success_date) AS first_pay_time
    FROM bi_vala_order o
    WHERE o.key_from IN ('app-active-h5-0-0', 'app-sales-bj-qhm-0')
      AND o.pay_success_date IS NOT NULL
      AND o.order_status IN (3, 4)
    GROUP BY o.account_id
),
converted AS (
    SELECT ru.register_month,
           ru.register_time,
           ifp.first_pay_time,
           CASE WHEN ifp.first_pay_time IS NOT NULL THEN 1 ELSE 0 END AS is_converted,
           EXTRACT(EPOCH FROM (ifp.first_pay_time - ru.register_time)) / 86400.0 AS days_to_convert
    FROM registered_users ru
    LEFT JOIN internal_first_pay ifp ON ru.account_id = ifp.account_id
)
SELECT register_month, register_time, first_pay_time, is_converted, days_to_convert
FROM converted
ORDER BY register_month;
"""


def pearsonr(x, y):
    """Return ``(r, p)``: Pearson correlation and its two-sided p-value.

    Hand-written to avoid a scipy dependency.  Degenerate inputs (fewer than
    three points, or zero variance in either series) return ``(0.0, 1.0)``.
    A perfect correlation returns a p-value of ``0.0``.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    n = len(x)
    if n < 3:
        return 0.0, 1.0
    dx = x - np.mean(x)
    dy = y - np.mean(y)
    den = np.sqrt(np.sum(dx ** 2) * np.sum(dy ** 2))
    if den == 0:
        return 0.0, 1.0
    # Rounding can push |r| marginally above 1, which would make the
    # t statistic below NaN; clip to the mathematically valid range.
    r = float(min(1.0, max(-1.0, np.sum(dx * dy) / den)))
    if abs(r) >= 1.0:
        return r, 0.0
    # Under H0 (no correlation), t follows Student's t with n - 2 d.o.f.
    t = r * math.sqrt((n - 2) / (1.0 - r * r))
    p = 2.0 * (1.0 - _t_cdf(abs(t), n - 2))
    return r, float(min(1.0, max(0.0, p)))


def _t_cdf(t, df):
    """Return the Student's t CDF at ``t`` for ``df`` degrees of freedom.

    Uses the incomplete-beta identity, which in this form is valid for
    ``t >= 0`` only; the sole caller passes ``abs(t)``.
    """
    x = df / (df + t ** 2)
    return 1.0 - 0.5 * _betainc(x, df / 2.0, 0.5)


def _betainc(x, a, b):
    """Return the regularized incomplete beta function ``I_x(a, b)``."""
    if x <= 0.0:
        return 0.0
    if x >= 1.0:
        return 1.0
    # Prefactor x^a (1-x)^b / B(a, b), assembled in log space for stability.
    log_front = (
        math.lgamma(a + b) - math.lgamma(a) - math.lgamma(b)
        + a * math.log(x) + b * math.log1p(-x)
    )
    front = math.exp(log_front)
    # The continued fraction converges quickly only for x < (a+1)/(a+b+2);
    # otherwise use the symmetry I_x(a, b) = 1 - I_{1-x}(b, a).
    if x < (a + 1.0) / (a + b + 2.0):
        return front * _beta_continued_fraction(x, a, b) / a
    return 1.0 - front * _beta_continued_fraction(1.0 - x, b, a) / b


def _beta_continued_fraction(x, a, b, max_iter=300, eps=3e-14):
    """Evaluate the incomplete-beta continued fraction (modified Lentz)."""
    tiny = 1e-300  # keeps the reciprocals below away from division by zero
    qab, qap, qam = a + b, a + 1.0, a - 1.0
    c = 1.0
    d = 1.0 - qab * x / qap
    if abs(d) < tiny:
        d = tiny
    d = 1.0 / d
    h = d
    for m in range(1, max_iter + 1):
        m2 = 2 * m
        # Even-numbered step of the recurrence.
        aa = m * (b - m) * x / ((qam + m2) * (a + m2))
        d = 1.0 + aa * d
        if abs(d) < tiny:
            d = tiny
        c = 1.0 + aa / c
        if abs(c) < tiny:
            c = tiny
        d = 1.0 / d
        h *= d * c
        # Odd-numbered step of the recurrence.
        aa = -(a + m) * (qab + m) * x / ((a + m2) * (qap + m2))
        d = 1.0 + aa * d
        if abs(d) < tiny:
            d = tiny
        c = 1.0 + aa / c
        if abs(c) < tiny:
            c = tiny
        d = 1.0 / d
        delta = d * c
        h *= delta
        if abs(delta - 1.0) < eps:
            break
    return h


def fetch_rows():
    """Run ``query`` against the database described by ``CONN``; return all rows."""
    # Imported lazily so the numeric helpers stay importable (and testable)
    # on machines without the PostgreSQL driver installed.
    import psycopg2

    # closing() guarantees cursor and connection are released even if the
    # query raises; psycopg2's own ``with conn:`` only ends the transaction.
    with closing(psycopg2.connect(**CONN)) as conn:
        with closing(conn.cursor()) as cur:
            cur.execute(query)
            return cur.fetchall()


def group_by_month(rows):
    """Group days-to-convert by registration month.

    ``rows`` are 5-tuples in the column order selected by ``query``.  Returns
    a dict mapping the first column to a list with one entry per user:
    ``None`` for users who never converted, otherwise the days as a float.
    """
    monthly = defaultdict(list)
    for register_month, _reg_time, _pay_time, _flag, days in rows:
        # The driver may hand back Decimal for the numeric division in the
        # query; convert once here so numpy maths works downstream.
        monthly[register_month].append(None if days is None else float(days))
    return monthly


def conversion_rates(monthly, months, max_days=None):
    """Return the conversion rate (percent) for each month in ``months``.

    With ``max_days`` set, only users who converted within that many days of
    registering are counted; with ``None`` every converted user counts.
    """
    rates = []
    for month in months:
        days = monthly[month]
        converted = sum(
            1 for d in days
            if d is not None and (max_days is None or d <= max_days)
        )
        rates.append(converted / len(days) * 100 if days else 0.0)
    return rates


def score_windows(monthly, months, overall_rates, x_values=X_VALUES):
    """Score each candidate window; return result dicts sorted best-first.

    Each dict holds ``x``, ``x_rates``, ``mae``, ``corr``, ``p_value`` and
    ``composite``.  The composite is the equal-weight mean of the min-max
    normalised MAE and ``(1 - corr) / 2``; lower is better.
    """
    overall = np.array(overall_rates)
    results = []
    for x in x_values:
        x_rates = conversion_rates(monthly, months, x)
        mae = np.mean(np.abs(np.array(x_rates) - overall))
        corr, p_value = pearsonr(x_rates, overall_rates)
        results.append(
            {'x': x, 'x_rates': x_rates, 'mae': mae, 'corr': corr, 'p_value': p_value}
        )
    if not results:
        return results

    mae_vals = np.array([r['mae'] for r in results])
    corr_vals = np.array([r['corr'] for r in results])
    spread = mae_vals.max() - mae_vals.min()
    # When every window has the same MAE there is nothing to normalise;
    # dividing by the zero spread would turn every score into NaN.
    mae_norm = (mae_vals - mae_vals.min()) / spread if spread > 0 else np.zeros_like(mae_vals)
    corr_norm = (1 - corr_vals) / 2
    composite = 0.5 * mae_norm + 0.5 * corr_norm
    for r, score in zip(results, composite):
        r['composite'] = score
    results.sort(key=lambda r: r['composite'])
    return results


def print_monthly_overview(monthly, months, month_labels, overall_rates):
    """Print registered/converted counts and overall conversion per month."""
    print("=" * RULE_WIDTH)
    print(f"{'Month':<10} {'Registered':>10} {'Converted':>10} {'Overall Conv%':>14}")
    print("-" * RULE_WIDTH)
    for label, month, rate in zip(month_labels, months, overall_rates):
        all_users = len(monthly[month])
        converted = sum(1 for d in monthly[month] if d is not None)
        print(f"{label:<10} {all_users:>10} {converted:>10} {rate:>13.2f}%")
    print()


def print_ranking(results, top=15):
    """Print the ``top`` best-scoring windows."""
    print("=" * RULE_WIDTH)
    print(f"{'Rank':<5} {'X-days':<8} {'MAE':>8} {'Corr':>8} {'Composite':>10}")
    print("-" * RULE_WIDTH)
    for rank, r in enumerate(results[:top], start=1):
        print(f"{rank:<5} {r['x']:<8} {r['mae']:>8.4f} {r['corr']:>8.4f} {r['composite']:>10.4f}")


def print_best_detail(best, month_labels, overall_rates):
    """Print the month-by-month comparison for the best window."""
    # Built outside the header f-string: a backslash-escaped quote inside an
    # f-string expression is a SyntaxError before Python 3.12.
    window_label = f"X={best['x']}day%"
    print()
    print("=" * RULE_WIDTH)
    print(f"最佳 x = {best['x']} 天的详细对比:")
    print(f"{'Month':<10} {'Overall%':>10} {window_label:>12} {'Diff':>10} {'Coverage%':>12}")
    print("-" * RULE_WIDTH)
    for label, overall, windowed in zip(month_labels, overall_rates, best['x_rates']):
        diff = windowed - overall
        coverage = windowed / overall * 100 if overall > 0 else 0
        print(f"{label:<10} {overall:>9.2f}% {windowed:>11.2f}% {diff:>9.2f}% {coverage:>11.1f}%")


def print_top_comparison(results, month_labels, overall_rates, top=3):
    """Print monthly conversion rates side by side for the ``top`` windows."""
    leaders = results[:top]
    print()
    print("=" * RULE_WIDTH)
    print("Top 3 候选 x 值的每月转化率对比:")
    header = f"{'Month':<10} {'Overall':>8}"
    header += "".join(f" {'X=' + str(r['x']):>8}" for r in leaders)
    print(header)
    print("-" * RULE_WIDTH)
    for i, label in enumerate(month_labels):
        row = f"{label:<10} {overall_rates[i]:>7.2f}%"
        row += "".join(f" {r['x_rates'][i]:>7.2f}%" for r in leaders)
        print(row)


def print_day_distribution(monthly, months, month_labels):
    """Print percentiles of days-to-convert for each month's converted users."""
    print()
    print("=" * RULE_WIDTH)
    print("各月用户转化天数分布(百分位):")
    print(f"{'Month':<10} {'P25':>6} {'P50':>6} {'P75':>6} {'P90':>6} {'P95':>6} {'Max':>8} {'Mean':>8}")
    print("-" * RULE_WIDTH)
    for label, month in zip(month_labels, months):
        converted_days = [d for d in monthly[month] if d is not None]
        if not converted_days:
            print(f"{label:<10} {'N/A':>6}")
            continue
        arr = np.array(converted_days)
        p25, p50, p75, p90, p95 = np.percentile(arr, [25, 50, 75, 90, 95])
        print(
            f"{label:<10} {p25:>6.0f} {p50:>6.0f} {p75:>6.0f} {p90:>6.0f} "
            f"{p95:>6.0f} {arr.max():>8.0f} {arr.mean():>8.1f}"
        )


def main():
    """Fetch the data, score every candidate window and print the report."""
    rows = fetch_rows()
    if not rows:
        # Without any rows every statistic below would be NaN.
        print("No registered users returned by the query; nothing to analyse.")
        return

    monthly = group_by_month(rows)
    months = sorted(monthly)
    month_labels = [m.strftime("%Y-%m") for m in months]
    overall_rates = conversion_rates(monthly, months)

    print_monthly_overview(monthly, months, month_labels, overall_rates)
    results = score_windows(monthly, months, overall_rates)
    print_ranking(results)
    print_best_detail(results[0], month_labels, overall_rates)
    print_top_comparison(results, month_labels, overall_rates)
    print_day_distribution(monthly, months, month_labels)


if __name__ == "__main__":
    main()