203 lines
6.2 KiB
Python
203 lines
6.2 KiB
Python
#!/usr/bin/env python3
|
||
"""寻找最优 x 日转化率窗口(纯 numpy,无 scipy 依赖)。"""
|
||
|
||
import psycopg2
|
||
import numpy as np
|
||
|
||
CONN = {
|
||
"host": "bj-postgres-16pob4sg.sql.tencentcdb.com",
|
||
"port": 28591,
|
||
"user": "ai_member",
|
||
"password": "LdfjdjL83h3h3^$&**YGG*",
|
||
"dbname": "vala_bi",
|
||
}
|
||
|
||
# Pearson 相关系数(手写,避免 scipy 兼容性问题)
|
||
def pearsonr(x, y):
|
||
x, y = np.array(x), np.array(y)
|
||
n = len(x)
|
||
if n < 3:
|
||
return 0.0, 1.0
|
||
mx, my = np.mean(x), np.mean(y)
|
||
num = np.sum((x - mx) * (y - my))
|
||
den = np.sqrt(np.sum((x - mx)**2) * np.sum((y - my)**2))
|
||
if den == 0:
|
||
return 0.0, 1.0
|
||
r = num / den
|
||
# t-test p-value
|
||
if abs(r) == 1:
|
||
p = 0.0
|
||
else:
|
||
t = r * np.sqrt((n - 2) / (1 - r**2))
|
||
# 简化 p 值计算(t 分布近似)
|
||
p = 2 * (1 - _t_cdf(abs(t), n - 2))
|
||
return r, p
|
||
|
||
def _t_cdf(t, df):
|
||
"""Student's t CDF 近似"""
|
||
import math
|
||
x = df / (df + t**2)
|
||
return 1 - 0.5 * _betainc(x, df/2, 0.5)
|
||
|
||
def _betainc(x, a, b):
|
||
"""正则化不完全 Beta 函数近似(用于 t 分布)"""
|
||
import math
|
||
# 用 scipy 近似不可用,直接用简单近似
|
||
# 对于我们的场景,p 值不是关键,相关系数才是
|
||
return x ** a * (1 - x) ** b
|
||
|
||
# 获取数据
|
||
query = """
|
||
WITH registered_users AS (
|
||
SELECT
|
||
id AS account_id,
|
||
DATE_TRUNC('month', created_at) AS register_month,
|
||
created_at AS register_time
|
||
FROM bi_vala_app_account
|
||
WHERE status = 1
|
||
AND deleted_at IS NULL
|
||
AND created_at >= '2025-09-01'
|
||
AND created_at < '2026-05-01'
|
||
),
|
||
internal_first_pay AS (
|
||
SELECT
|
||
o.account_id,
|
||
MIN(o.pay_success_date) AS first_pay_time
|
||
FROM bi_vala_order o
|
||
WHERE o.key_from IN ('app-active-h5-0-0', 'app-sales-bj-qhm-0')
|
||
AND o.pay_success_date IS NOT NULL
|
||
AND o.order_status IN (3, 4)
|
||
GROUP BY o.account_id
|
||
),
|
||
converted AS (
|
||
SELECT
|
||
ru.register_month,
|
||
ru.register_time,
|
||
ifp.first_pay_time,
|
||
CASE WHEN ifp.first_pay_time IS NOT NULL THEN 1 ELSE 0 END AS is_converted,
|
||
EXTRACT(EPOCH FROM (ifp.first_pay_time - ru.register_time)) / 86400.0 AS days_to_convert
|
||
FROM registered_users ru
|
||
LEFT JOIN internal_first_pay ifp ON ru.account_id = ifp.account_id
|
||
)
|
||
SELECT
|
||
register_month,
|
||
register_time,
|
||
first_pay_time,
|
||
is_converted,
|
||
days_to_convert
|
||
FROM converted
|
||
ORDER BY register_month;
|
||
"""
|
||
|
||
conn = psycopg2.connect(**CONN)
|
||
cur = conn.cursor()
|
||
cur.execute(query)
|
||
rows = cur.fetchall()
|
||
cur.close()
|
||
conn.close()
|
||
|
||
# 按月份组织数据
|
||
from collections import defaultdict
|
||
monthly = defaultdict(list)
|
||
for (rm, rt, fpt, ic, dtc) in rows:
|
||
monthly[rm].append(dtc if dtc is not None else None)
|
||
|
||
months = sorted(monthly.keys())
|
||
month_labels = [m.strftime("%Y-%m") for m in months]
|
||
|
||
# 每月整体转化率
|
||
overall_rates = []
|
||
for m in months:
|
||
all_users = len(monthly[m])
|
||
converted = sum(1 for d in monthly[m] if d is not None)
|
||
overall_rates.append(converted / all_users * 100)
|
||
|
||
print("=" * 100)
|
||
print(f"{'Month':<10} {'Registered':>10} {'Converted':>10} {'Overall Conv%':>14}")
|
||
print("-" * 100)
|
||
for i, m in enumerate(months):
|
||
all_users = len(monthly[m])
|
||
converted = sum(1 for d in monthly[m] if d is not None)
|
||
print(f"{month_labels[i]:<10} {all_users:>10} {converted:>10} {overall_rates[i]:>13.2f}%")
|
||
print()
|
||
|
||
# 测试多个 x 值
|
||
x_values = [3, 5, 7, 10, 14, 21, 28, 30, 35, 42, 45, 49, 56, 60, 63, 70, 77, 84, 90, 98, 105, 112, 120, 140, 150, 180, 210, 240, 270, 300, 330, 365]
|
||
|
||
results = []
|
||
for x in x_values:
|
||
x_rates = []
|
||
for m in months:
|
||
all_users = len(monthly[m])
|
||
converted = sum(1 for d in monthly[m] if d is not None and d <= x)
|
||
x_rates.append(converted / all_users * 100)
|
||
|
||
mae = np.mean(np.abs(np.array(x_rates) - np.array(overall_rates)))
|
||
corr, p_value = pearsonr(x_rates, overall_rates)
|
||
|
||
results.append({'x': x, 'x_rates': x_rates, 'mae': mae, 'corr': corr})
|
||
|
||
# 标准化并综合评分
|
||
mae_vals = np.array([r['mae'] for r in results])
|
||
corr_vals = np.array([r['corr'] for r in results])
|
||
|
||
mae_norm = (mae_vals - mae_vals.min()) / (mae_vals.max() - mae_vals.min())
|
||
corr_norm = (1 - corr_vals) / 2
|
||
|
||
composite = 0.5 * mae_norm + 0.5 * corr_norm
|
||
for i, r in enumerate(results):
|
||
r['composite'] = composite[i]
|
||
|
||
results.sort(key=lambda r: r['composite'])
|
||
|
||
# 输出前 15
|
||
print("=" * 100)
|
||
print(f"{'Rank':<5} {'X-days':<8} {'MAE':>8} {'Corr':>8} {'Composite':>10}")
|
||
print("-" * 100)
|
||
for i, r in enumerate(results[:15]):
|
||
print(f"{i+1:<5} {r['x']:<8} {r['mae']:>8.4f} {r['corr']:>8.4f} {r['composite']:>10.4f}")
|
||
|
||
# 最佳 x 详细对比
|
||
best = results[0]
|
||
print()
|
||
print("=" * 100)
|
||
print(f"最佳 x = {best['x']} 天的详细对比:")
|
||
print(f"{'Month':<10} {'Overall%':>10} {f'X={best[\"x\"]}day%':>12} {'Diff':>10} {'Coverage%':>12}")
|
||
print("-" * 100)
|
||
for i, m in enumerate(months):
|
||
diff = best['x_rates'][i] - overall_rates[i]
|
||
coverage = best['x_rates'][i] / overall_rates[i] * 100 if overall_rates[i] > 0 else 0
|
||
print(f"{month_labels[i]:<10} {overall_rates[i]:>9.2f}% {best['x_rates'][i]:>11.2f}% {diff:>9.2f}% {coverage:>11.1f}%")
|
||
|
||
# Top3 的每月对比
|
||
print()
|
||
print("=" * 100)
|
||
print("Top 3 候选 x 值的每月转化率对比:")
|
||
header = f"{'Month':<10} {'Overall':>8}"
|
||
for r in results[:3]:
|
||
header += f" {'X=' + str(r['x']):>8}"
|
||
print(header)
|
||
print("-" * 100)
|
||
for i, m in enumerate(months):
|
||
row = f"{month_labels[i]:<10} {overall_rates[i]:>7.2f}%"
|
||
for r in results[:3]:
|
||
row += f" {r['x_rates'][i]:>7.2f}%"
|
||
print(row)
|
||
|
||
# 画出转化天数的分布特征
|
||
print()
|
||
print("=" * 100)
|
||
print("各月用户转化天数分布(百分位):")
|
||
print(f"{'Month':<10} {'P25':>6} {'P50':>6} {'P75':>6} {'P90':>6} {'P95':>6} {'Max':>8} {'Mean':>8}")
|
||
print("-" * 100)
|
||
for i, m in enumerate(months):
|
||
converted_days = [d for d in monthly[m] if d is not None]
|
||
if converted_days:
|
||
arr = np.array(converted_days)
|
||
p25, p50, p75 = np.percentile(arr, [25, 50, 75])
|
||
p90, p95 = np.percentile(arr, [90, 95])
|
||
mx, mn = arr.max(), arr.mean()
|
||
print(f"{month_labels[i]:<10} {p25:>6.0f} {p50:>6.0f} {p75:>6.0f} {p90:>6.0f} {p95:>6.0f} {mx:>8.0f} {mn:>8.1f}")
|
||
else:
|
||
print(f"{month_labels[i]:<10} {'N/A':>6}")
|