ai_member_xiaoxi/scripts/find_optimal_x.py
2026-05-12 08:00:01 +08:00

203 lines
6.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""寻找最优 x 日转化率窗口(纯 numpy无 scipy 依赖)。"""
import psycopg2
import numpy as np
CONN = {
"host": "bj-postgres-16pob4sg.sql.tencentcdb.com",
"port": 28591,
"user": "ai_member",
"password": "LdfjdjL83h3h3^$&**YGG*",
"dbname": "vala_bi",
}
# Pearson 相关系数(手写,避免 scipy 兼容性问题)
def pearsonr(x, y):
x, y = np.array(x), np.array(y)
n = len(x)
if n < 3:
return 0.0, 1.0
mx, my = np.mean(x), np.mean(y)
num = np.sum((x - mx) * (y - my))
den = np.sqrt(np.sum((x - mx)**2) * np.sum((y - my)**2))
if den == 0:
return 0.0, 1.0
r = num / den
# t-test p-value
if abs(r) == 1:
p = 0.0
else:
t = r * np.sqrt((n - 2) / (1 - r**2))
# 简化 p 值计算t 分布近似)
p = 2 * (1 - _t_cdf(abs(t), n - 2))
return r, p
def _t_cdf(t, df):
"""Student's t CDF 近似"""
import math
x = df / (df + t**2)
return 1 - 0.5 * _betainc(x, df/2, 0.5)
def _betainc(x, a, b):
"""正则化不完全 Beta 函数近似(用于 t 分布)"""
import math
# 用 scipy 近似不可用,直接用简单近似
# 对于我们的场景p 值不是关键,相关系数才是
return x ** a * (1 - x) ** b
# 获取数据
query = """
WITH registered_users AS (
SELECT
id AS account_id,
DATE_TRUNC('month', created_at) AS register_month,
created_at AS register_time
FROM bi_vala_app_account
WHERE status = 1
AND deleted_at IS NULL
AND created_at >= '2025-09-01'
AND created_at < '2026-05-01'
),
internal_first_pay AS (
SELECT
o.account_id,
MIN(o.pay_success_date) AS first_pay_time
FROM bi_vala_order o
WHERE o.key_from IN ('app-active-h5-0-0', 'app-sales-bj-qhm-0')
AND o.pay_success_date IS NOT NULL
AND o.order_status IN (3, 4)
GROUP BY o.account_id
),
converted AS (
SELECT
ru.register_month,
ru.register_time,
ifp.first_pay_time,
CASE WHEN ifp.first_pay_time IS NOT NULL THEN 1 ELSE 0 END AS is_converted,
EXTRACT(EPOCH FROM (ifp.first_pay_time - ru.register_time)) / 86400.0 AS days_to_convert
FROM registered_users ru
LEFT JOIN internal_first_pay ifp ON ru.account_id = ifp.account_id
)
SELECT
register_month,
register_time,
first_pay_time,
is_converted,
days_to_convert
FROM converted
ORDER BY register_month;
"""
conn = psycopg2.connect(**CONN)
cur = conn.cursor()
cur.execute(query)
rows = cur.fetchall()
cur.close()
conn.close()
# 按月份组织数据
from collections import defaultdict
monthly = defaultdict(list)
for (rm, rt, fpt, ic, dtc) in rows:
monthly[rm].append(dtc if dtc is not None else None)
months = sorted(monthly.keys())
month_labels = [m.strftime("%Y-%m") for m in months]
# 每月整体转化率
overall_rates = []
for m in months:
all_users = len(monthly[m])
converted = sum(1 for d in monthly[m] if d is not None)
overall_rates.append(converted / all_users * 100)
print("=" * 100)
print(f"{'Month':<10} {'Registered':>10} {'Converted':>10} {'Overall Conv%':>14}")
print("-" * 100)
for i, m in enumerate(months):
all_users = len(monthly[m])
converted = sum(1 for d in monthly[m] if d is not None)
print(f"{month_labels[i]:<10} {all_users:>10} {converted:>10} {overall_rates[i]:>13.2f}%")
print()
# 测试多个 x 值
x_values = [3, 5, 7, 10, 14, 21, 28, 30, 35, 42, 45, 49, 56, 60, 63, 70, 77, 84, 90, 98, 105, 112, 120, 140, 150, 180, 210, 240, 270, 300, 330, 365]
results = []
for x in x_values:
x_rates = []
for m in months:
all_users = len(monthly[m])
converted = sum(1 for d in monthly[m] if d is not None and d <= x)
x_rates.append(converted / all_users * 100)
mae = np.mean(np.abs(np.array(x_rates) - np.array(overall_rates)))
corr, p_value = pearsonr(x_rates, overall_rates)
results.append({'x': x, 'x_rates': x_rates, 'mae': mae, 'corr': corr})
# 标准化并综合评分
mae_vals = np.array([r['mae'] for r in results])
corr_vals = np.array([r['corr'] for r in results])
mae_norm = (mae_vals - mae_vals.min()) / (mae_vals.max() - mae_vals.min())
corr_norm = (1 - corr_vals) / 2
composite = 0.5 * mae_norm + 0.5 * corr_norm
for i, r in enumerate(results):
r['composite'] = composite[i]
results.sort(key=lambda r: r['composite'])
# 输出前 15
print("=" * 100)
print(f"{'Rank':<5} {'X-days':<8} {'MAE':>8} {'Corr':>8} {'Composite':>10}")
print("-" * 100)
for i, r in enumerate(results[:15]):
print(f"{i+1:<5} {r['x']:<8} {r['mae']:>8.4f} {r['corr']:>8.4f} {r['composite']:>10.4f}")
# 最佳 x 详细对比
best = results[0]
print()
print("=" * 100)
print(f"最佳 x = {best['x']} 天的详细对比:")
print(f"{'Month':<10} {'Overall%':>10} {f'X={best[\"x\"]}day%':>12} {'Diff':>10} {'Coverage%':>12}")
print("-" * 100)
for i, m in enumerate(months):
diff = best['x_rates'][i] - overall_rates[i]
coverage = best['x_rates'][i] / overall_rates[i] * 100 if overall_rates[i] > 0 else 0
print(f"{month_labels[i]:<10} {overall_rates[i]:>9.2f}% {best['x_rates'][i]:>11.2f}% {diff:>9.2f}% {coverage:>11.1f}%")
# Top3 的每月对比
print()
print("=" * 100)
print("Top 3 候选 x 值的每月转化率对比:")
header = f"{'Month':<10} {'Overall':>8}"
for r in results[:3]:
header += f" {'X=' + str(r['x']):>8}"
print(header)
print("-" * 100)
for i, m in enumerate(months):
row = f"{month_labels[i]:<10} {overall_rates[i]:>7.2f}%"
for r in results[:3]:
row += f" {r['x_rates'][i]:>7.2f}%"
print(row)
# 画出转化天数的分布特征
print()
print("=" * 100)
print("各月用户转化天数分布(百分位):")
print(f"{'Month':<10} {'P25':>6} {'P50':>6} {'P75':>6} {'P90':>6} {'P95':>6} {'Max':>8} {'Mean':>8}")
print("-" * 100)
for i, m in enumerate(months):
converted_days = [d for d in monthly[m] if d is not None]
if converted_days:
arr = np.array(converted_days)
p25, p50, p75 = np.percentile(arr, [25, 50, 75])
p90, p95 = np.percentile(arr, [90, 95])
mx, mn = arr.max(), arr.mean()
print(f"{month_labels[i]:<10} {p25:>6.0f} {p50:>6.0f} {p75:>6.0f} {p90:>6.0f} {p95:>6.0f} {mx:>8.0f} {mn:>8.1f}")
else:
print(f"{month_labels[i]:<10} {'N/A':>6}")