ai_member_xiaoxi/scripts/daily_reg_analysis.py
2026-05-09 08:00:01 +08:00

256 lines
12 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
2026年1-4月每日新增注册人数回归分析V4 - 比例潮汐+保底)
改进点:
1. 星期潮汐效应改为比例补偿:周末效应 = 趋势 × (1 + 比例),而非趋势 + 固定值
理由1月基线50人时+11是+22%4月基线120人时+11只+9%,固定值会随趋势增长等比稀释
2. 实际值低于拟合值时保留实际值(活动日才用拟合值替换)
"""
import numpy as np
import pandas as pd
# ==================== 全量数据 ====================
raw_data = {
'2026-01-01': 78, '2026-01-02': 74, '2026-01-03': 69, '2026-01-04': 44,
'2026-01-05': 56, '2026-01-06': 33, '2026-01-07': 52, '2026-01-08': 60,
'2026-01-09': 58, '2026-01-10': 84, '2026-01-11': 75, '2026-01-12': 35,
'2026-01-13': 46, '2026-01-14': 59, '2026-01-15': 32, '2026-01-16': 37,
'2026-01-17': 67, '2026-01-18': 71, '2026-01-19': 53, '2026-01-20': 49,
'2026-01-21': 42, '2026-01-22': 63, '2026-01-23': 48, '2026-01-24': 72,
'2026-01-25': 86, '2026-01-26': 62, '2026-01-27': 58, '2026-01-28': 150,
'2026-01-29': 106, '2026-01-30': 88, '2026-01-31': 61,
'2026-02-01': 55, '2026-02-02': 53, '2026-02-03': 42, '2026-02-04': 39,
'2026-02-05': 42, '2026-02-06': 55, '2026-02-07': 37, '2026-02-08': 47,
'2026-02-09': 42, '2026-02-10': 60, '2026-02-11': 265, '2026-02-12': 59,
'2026-02-13': 42, '2026-02-14': 32, '2026-02-15': 41, '2026-02-16': 28,
'2026-02-17': 52, '2026-02-18': 23, '2026-02-19': 35, '2026-02-20': 26,
'2026-02-21': 37, '2026-02-22': 36, '2026-02-23': 47, '2026-02-24': 45,
'2026-02-25': 64, '2026-02-26': 182, '2026-02-27': 102, '2026-02-28': 168,
'2026-03-01': 126, '2026-03-02': 108, '2026-03-03': 78, '2026-03-04': 111,
'2026-03-05': 453, '2026-03-06': 218, '2026-03-07': 241, '2026-03-08': 207,
'2026-03-09': 168, '2026-03-10': 109, '2026-03-11': 135, '2026-03-12': 178,
'2026-03-13': 245, '2026-03-14': 131, '2026-03-15': 132, '2026-03-16': 110,
'2026-03-17': 96, '2026-03-18': 96, '2026-03-19': 78, '2026-03-20': 103,
'2026-03-21': 133, '2026-03-22': 126, '2026-03-23': 85, '2026-03-24': 80,
'2026-03-25': 94, '2026-03-26': 85, '2026-03-27': 95, '2026-03-28': 105,
'2026-03-29': 101, '2026-03-30': 82, '2026-03-31': 100,
'2026-04-01': 138, '2026-04-02': 165, '2026-04-03': 779, '2026-04-04': 341,
'2026-04-05': 186, '2026-04-06': 207, '2026-04-07': 212, '2026-04-08': 751,
'2026-04-09': 341, '2026-04-10': 129, '2026-04-11': 174, '2026-04-12': 151,
'2026-04-13': 118, '2026-04-14': 133, '2026-04-15': 126, '2026-04-16': 115,
'2026-04-17': 84, '2026-04-18': 117, '2026-04-19': 120, '2026-04-20': 89,
'2026-04-21': 100, '2026-04-22': 179, '2026-04-23': 139, '2026-04-24': 122,
'2026-04-25': 142, '2026-04-26': 137, '2026-04-27': 120, '2026-04-28': 164,
'2026-04-29': 65, '2026-04-30': 58,
}
dates = sorted(raw_data.keys())
date_series = pd.to_datetime(dates)
values = np.array([raw_data[d] for d in dates], dtype=float)
n = len(values)
x = np.arange(n, dtype=float)
dow = np.array([d.weekday() for d in date_series]) # 0=Mon, 6=Sun
# ==================== 活动标记 ====================
activity_events = {
'2026-01-28': ('活动', 1),
'2026-02-11': ('活动', 0),
'2026-02-26': ('活动', 4),
'2026-03-05': ('活动', 3),
'2026-03-09': ('活动', 0),
'2026-03-12': ('活动', 0),
'2026-03-13': ('活动', 0),
'2026-04-03': ('大型活动', 4),
'2026-04-08': ('大型活动', 2),
'2026-04-22': ('小活动', 1),
'2026-04-28': ('小活动', 0),
}
affected_reason = {}
for d_str, (reason, aftereffect_days) in activity_events.items():
base_idx = dates.index(d_str)
affected_reason[d_str] = reason
for offset in range(1, aftereffect_days + 1):
idx = base_idx + offset
if idx < n:
ae_date = dates[idx]
if ae_date not in affected_reason:
affected_reason[ae_date] = f'{dates[base_idx]} 余波'
is_affected = np.array([d in affected_reason for d in dates])
# ==================== LOESS ====================
def loess_smooth(x_data, y_data, x_eval, frac=0.28, deg=2):
n_eval = len(x_eval)
n_data = len(x_data)
y_smooth = np.zeros(n_eval)
for j in range(n_eval):
x0 = x_eval[j]; dist = np.abs(x_data - x0)
k = max(int(frac * n_data), deg + 2); k = min(k, n_data)
idx = np.argpartition(dist, k)[:k]
x_nn, y_nn, d_nn = x_data[idx], y_data[idx], dist[idx]
max_dist = np.max(d_nn)
w = (1 - (d_nn / max_dist) ** 3) ** 3 if max_dist > 0 else np.ones(k)
A = np.column_stack([np.ones(k), x_nn, x_nn**2]); W = np.diag(w)
try:
beta = np.linalg.solve(A.T @ W @ A, A.T @ W @ y_nn)
y_smooth[j] = beta[0] + beta[1] * x0 + beta[2] * x0**2
except np.linalg.LinAlgError:
y_smooth[j] = np.average(y_nn, weights=w)
return np.maximum(y_smooth, 0)
clean_mask = ~is_affected
clean_x, clean_y, clean_dow = x[clean_mask], values[clean_mask], dow[clean_mask]
# 第一轮LOESS 拟合原始趋势
trend_raw = loess_smooth(clean_x, clean_y, x, frac=0.28, deg=2)
# ==================== 星期效应:比例方式 ====================
# 在干净数据上计算残差率 = (实际 - 趋势) / 趋势
residual_pct = (clean_y - trend_raw[clean_mask]) / trend_raw[clean_mask]
dow_ratio = np.zeros(7)
for d in range(7):
mask = clean_dow == d
if np.sum(mask) > 0:
dow_ratio[d] = np.mean(residual_pct[mask])
# 调整使比例总和为0不影响总注册人数
dow_ratio -= np.mean(dow_ratio)
print("=" * 80)
print("2026年3-4月 每日新增注册回归分析 V4比例潮汐 + 保底)")
print("=" * 80)
print(f"\n📊 星期潮汐效应(比例方式):")
dow_names = ['周一', '周二', '周三', '周四', '周五', '周六', '周日']
for i, name in enumerate(dow_names):
bar = '' * max(1, int(abs(dow_ratio[i]) * 200))
sign = '+' if dow_ratio[i] >= 0 else ''
print(f" {name}: {sign}{dow_ratio[i]*100:.1f}% {bar}")
# ==================== 最终拟合 = 趋势 × (1 + 星期比例) ====================
trend_with_dow = trend_raw * (1 + dow_ratio[dow])
trend_with_dow = np.maximum(trend_with_dow, 0)
# ==================== 有效注册人数 ====================
# 规则:活动日 → min(实际, 拟合);非活动日 → 保留实际值
effective = np.where(is_affected, np.minimum(values, trend_with_dow), values)
print(f"\n{'='*80}")
print("📋 2026年3-4月 最终结果")
print(f"{'='*80}")
print(f"{'日期':>12} {'星期':>4} | {'实际':>5} | {'影响':>10} | {'趋势':>6} | {'趋势+潮汐':>9} | {'有效注册':>8} | {'剔除':>6}")
print("-" * 95)
mar_start = dates.index('2026-03-01')
apr_end = dates.index('2026-04-30') + 1
for i in range(mar_start, apr_end):
d = dates[i]
dw = date_series[i].strftime('%a')
tag = affected_reason.get(d, '')
trend_val = trend_raw[i]
tdow_val = trend_with_dow[i]
eff_val = effective[i]
removed = values[i] - eff_val
print(f"{d:>12} {dw:>4} | {values[i]:>5.0f} | {tag:>10} | {trend_val:>6.0f} | {tdow_val:>9.0f} | {eff_val:>8.0f} | {removed:>+6.0f}")
# ==================== 汇总 ====================
mar_apr_actual = np.sum(values[mar_start:apr_end])
mar_apr_eff = np.sum(effective[mar_start:apr_end])
mar_apr_affected_count = int(np.sum(is_affected[mar_start:apr_end]))
print(f"\n{'='*80}")
print("📊 最终汇总")
print(f"{'='*80}")
print(f" 统计天数: {apr_end - mar_start}")
print(f" 活动影响天数: {mar_apr_affected_count}")
print(f"")
print(f" 实际注册总计: {mar_apr_actual:.0f}")
print(f" 有效注册总计: {mar_apr_eff:.0f}")
print(f" 剔除活动量: {mar_apr_actual - mar_apr_eff:.0f} 人 ({(mar_apr_actual-mar_apr_eff)/mar_apr_actual*100:.1f}%)")
print(f"")
print(f" 实际日均: {np.mean(values[mar_start:apr_end]):.0f}")
print(f" 有效日均: {np.mean(effective[mar_start:apr_end]):.0f}")
print(f"")
print(f" 1-2月干净日均: {np.mean(values[:mar_start][clean_mask[:mar_start]]):.0f}")
print(f" 3-4月有效日均: {np.mean(effective[mar_start:apr_end]):.0f}")
# ==================== 对比 V3 固定值 vs V4 比例 ====================
print(f"\n{'='*80}")
print("📊 V3(固定补偿) vs V4(比例补偿) 对比")
print(f"{'='*80}")
# 用固定值方式算一遍做对比
residuals_abs = clean_y - trend_raw[clean_mask]
dow_abs = np.zeros(7)
for d in range(7):
mask = clean_dow == d
if np.sum(mask) > 0:
dow_abs[d] = np.mean(residuals_abs[mask])
dow_abs -= np.mean(dow_abs)
trend_v3 = trend_raw + dow_abs[dow]
trend_v3 = np.maximum(trend_v3, 0)
eff_v3 = np.where(is_affected, np.minimum(values, trend_v3), values)
print(f"{'方法':>20} | {'有效总计':>10} | {'剔除':>8} | {'有效日均':>8}")
print(f"{'V3 固定值补偿':>20} | {np.sum(eff_v3[mar_start:apr_end]):>10.0f} | {mar_apr_actual - np.sum(eff_v3[mar_start:apr_end]):>8.0f} | {np.mean(eff_v3[mar_start:apr_end]):>8.1f}")
print(f"{'V4 比例补偿':>20} | {mar_apr_eff:>10.0f} | {mar_apr_actual - mar_apr_eff:>8.0f} | {np.mean(effective[mar_start:apr_end]):>8.1f}")
# 列出差异较大的日期
print(f"\n V3 vs V4 有效注册差异 > 3 的日期:")
diff = effective[mar_start:apr_end] - eff_v3[mar_start:apr_end]
for i in range(len(diff)):
if abs(diff[i]) > 3:
d = dates[mar_start + i]
sign = '+' if diff[i] > 0 else ''
print(f" {d}: V3={eff_v3[mar_start+i]:.0f}, V4={effective[mar_start+i]:.0f}, 差异={sign}{diff[i]:.0f}")
# ==================== 导出Excel ====================
rows = []
for i in range(mar_start, apr_end):
rows.append({
'日期': dates[i],
'星期': date_series[i].strftime('%A'),
'实际注册人数': int(values[i]),
'是否活动影响': '' if is_affected[i] else '',
'活动说明': affected_reason.get(dates[i], ''),
'LOESS趋势': round(trend_raw[i], 0),
'趋势+潮汐(比例)': round(trend_with_dow[i], 0),
'有效注册人数(用于转化)': round(effective[i], 0),
'剔除人数': round(values[i] - effective[i], 0),
})
output_df = pd.DataFrame(rows)
output_path = '/root/.openclaw/workspace/output/2026_3-4月注册回归分析.xlsx'
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
output_df.to_excel(writer, sheet_name='每日明细', index=False)
summary_rows = [
('统计天数', apr_end - mar_start),
('活动影响天数', mar_apr_affected_count),
('', ''),
('实际注册总计', f'{mar_apr_actual:.0f}'),
('有效注册总计(用于转化)', f'{mar_apr_eff:.0f}'),
('剔除活动量', f'{mar_apr_actual - mar_apr_eff:.0f}'),
('剔除比例', f'{(mar_apr_actual-mar_apr_eff)/mar_apr_actual*100:.1f}%'),
('', ''),
('实际日均', f'{np.mean(values[mar_start:apr_end]):.1f}'),
('有效日均', f'{np.mean(effective[mar_start:apr_end]):.1f}'),
('', ''),
('1-2月干净数据日均', f'{np.mean(values[:mar_start][clean_mask[:mar_start]]):.1f}'),
('3-4月有效数据日均', f'{np.mean(effective[mar_start:apr_end]):.1f}'),
('', ''),
('星期效应-周一', f'{dow_ratio[0]*100:+.1f}%'),
('星期效应-周二', f'{dow_ratio[1]*100:+.1f}%'),
('星期效应-周三', f'{dow_ratio[2]*100:+.1f}%'),
('星期效应-周四', f'{dow_ratio[3]*100:+.1f}%'),
('星期效应-周五', f'{dow_ratio[4]*100:+.1f}%'),
('星期效应-周六', f'{dow_ratio[5]*100:+.1f}%'),
('星期效应-周日', f'{dow_ratio[6]*100:+.1f}%'),
]
pd.DataFrame(summary_rows, columns=['指标','']).to_excel(writer, sheet_name='汇总', index=False)
print(f"\n✅ Excel 已导出: {output_path}")