256 lines
12 KiB
Python
256 lines
12 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
2026年1-4月每日新增注册人数回归分析(V4 - 比例潮汐+保底)
|
||
改进点:
|
||
1. 星期潮汐效应改为比例补偿:周末效应 = 趋势 × (1 + 比例),而非趋势 + 固定值
|
||
理由:1月基线50人时+11是+22%,4月基线120人时+11只+9%,固定值会随趋势增长等比稀释
|
||
2. 实际值低于拟合值时保留实际值(活动日才用拟合值替换)
|
||
"""
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
|
||
# ==================== 全量数据 ====================
|
||
raw_data = {
|
||
'2026-01-01': 78, '2026-01-02': 74, '2026-01-03': 69, '2026-01-04': 44,
|
||
'2026-01-05': 56, '2026-01-06': 33, '2026-01-07': 52, '2026-01-08': 60,
|
||
'2026-01-09': 58, '2026-01-10': 84, '2026-01-11': 75, '2026-01-12': 35,
|
||
'2026-01-13': 46, '2026-01-14': 59, '2026-01-15': 32, '2026-01-16': 37,
|
||
'2026-01-17': 67, '2026-01-18': 71, '2026-01-19': 53, '2026-01-20': 49,
|
||
'2026-01-21': 42, '2026-01-22': 63, '2026-01-23': 48, '2026-01-24': 72,
|
||
'2026-01-25': 86, '2026-01-26': 62, '2026-01-27': 58, '2026-01-28': 150,
|
||
'2026-01-29': 106, '2026-01-30': 88, '2026-01-31': 61,
|
||
'2026-02-01': 55, '2026-02-02': 53, '2026-02-03': 42, '2026-02-04': 39,
|
||
'2026-02-05': 42, '2026-02-06': 55, '2026-02-07': 37, '2026-02-08': 47,
|
||
'2026-02-09': 42, '2026-02-10': 60, '2026-02-11': 265, '2026-02-12': 59,
|
||
'2026-02-13': 42, '2026-02-14': 32, '2026-02-15': 41, '2026-02-16': 28,
|
||
'2026-02-17': 52, '2026-02-18': 23, '2026-02-19': 35, '2026-02-20': 26,
|
||
'2026-02-21': 37, '2026-02-22': 36, '2026-02-23': 47, '2026-02-24': 45,
|
||
'2026-02-25': 64, '2026-02-26': 182, '2026-02-27': 102, '2026-02-28': 168,
|
||
'2026-03-01': 126, '2026-03-02': 108, '2026-03-03': 78, '2026-03-04': 111,
|
||
'2026-03-05': 453, '2026-03-06': 218, '2026-03-07': 241, '2026-03-08': 207,
|
||
'2026-03-09': 168, '2026-03-10': 109, '2026-03-11': 135, '2026-03-12': 178,
|
||
'2026-03-13': 245, '2026-03-14': 131, '2026-03-15': 132, '2026-03-16': 110,
|
||
'2026-03-17': 96, '2026-03-18': 96, '2026-03-19': 78, '2026-03-20': 103,
|
||
'2026-03-21': 133, '2026-03-22': 126, '2026-03-23': 85, '2026-03-24': 80,
|
||
'2026-03-25': 94, '2026-03-26': 85, '2026-03-27': 95, '2026-03-28': 105,
|
||
'2026-03-29': 101, '2026-03-30': 82, '2026-03-31': 100,
|
||
'2026-04-01': 138, '2026-04-02': 165, '2026-04-03': 779, '2026-04-04': 341,
|
||
'2026-04-05': 186, '2026-04-06': 207, '2026-04-07': 212, '2026-04-08': 751,
|
||
'2026-04-09': 341, '2026-04-10': 129, '2026-04-11': 174, '2026-04-12': 151,
|
||
'2026-04-13': 118, '2026-04-14': 133, '2026-04-15': 126, '2026-04-16': 115,
|
||
'2026-04-17': 84, '2026-04-18': 117, '2026-04-19': 120, '2026-04-20': 89,
|
||
'2026-04-21': 100, '2026-04-22': 179, '2026-04-23': 139, '2026-04-24': 122,
|
||
'2026-04-25': 142, '2026-04-26': 137, '2026-04-27': 120, '2026-04-28': 164,
|
||
'2026-04-29': 65, '2026-04-30': 58,
|
||
}
|
||
|
||
dates = sorted(raw_data.keys())
|
||
date_series = pd.to_datetime(dates)
|
||
values = np.array([raw_data[d] for d in dates], dtype=float)
|
||
n = len(values)
|
||
x = np.arange(n, dtype=float)
|
||
dow = np.array([d.weekday() for d in date_series]) # 0=Mon, 6=Sun
|
||
|
||
# ==================== 活动标记 ====================
|
||
activity_events = {
|
||
'2026-01-28': ('活动', 1),
|
||
'2026-02-11': ('活动', 0),
|
||
'2026-02-26': ('活动', 4),
|
||
'2026-03-05': ('活动', 3),
|
||
'2026-03-09': ('活动', 0),
|
||
'2026-03-12': ('活动', 0),
|
||
'2026-03-13': ('活动', 0),
|
||
'2026-04-03': ('大型活动', 4),
|
||
'2026-04-08': ('大型活动', 2),
|
||
'2026-04-22': ('小活动', 1),
|
||
'2026-04-28': ('小活动', 0),
|
||
}
|
||
|
||
affected_reason = {}
|
||
for d_str, (reason, aftereffect_days) in activity_events.items():
|
||
base_idx = dates.index(d_str)
|
||
affected_reason[d_str] = reason
|
||
for offset in range(1, aftereffect_days + 1):
|
||
idx = base_idx + offset
|
||
if idx < n:
|
||
ae_date = dates[idx]
|
||
if ae_date not in affected_reason:
|
||
affected_reason[ae_date] = f'{dates[base_idx]} 余波'
|
||
|
||
is_affected = np.array([d in affected_reason for d in dates])
|
||
|
||
# ==================== LOESS ====================
|
||
def loess_smooth(x_data, y_data, x_eval, frac=0.28, deg=2):
|
||
n_eval = len(x_eval)
|
||
n_data = len(x_data)
|
||
y_smooth = np.zeros(n_eval)
|
||
for j in range(n_eval):
|
||
x0 = x_eval[j]; dist = np.abs(x_data - x0)
|
||
k = max(int(frac * n_data), deg + 2); k = min(k, n_data)
|
||
idx = np.argpartition(dist, k)[:k]
|
||
x_nn, y_nn, d_nn = x_data[idx], y_data[idx], dist[idx]
|
||
max_dist = np.max(d_nn)
|
||
w = (1 - (d_nn / max_dist) ** 3) ** 3 if max_dist > 0 else np.ones(k)
|
||
A = np.column_stack([np.ones(k), x_nn, x_nn**2]); W = np.diag(w)
|
||
try:
|
||
beta = np.linalg.solve(A.T @ W @ A, A.T @ W @ y_nn)
|
||
y_smooth[j] = beta[0] + beta[1] * x0 + beta[2] * x0**2
|
||
except np.linalg.LinAlgError:
|
||
y_smooth[j] = np.average(y_nn, weights=w)
|
||
return np.maximum(y_smooth, 0)
|
||
|
||
clean_mask = ~is_affected
|
||
clean_x, clean_y, clean_dow = x[clean_mask], values[clean_mask], dow[clean_mask]
|
||
|
||
# 第一轮:LOESS 拟合原始趋势
|
||
trend_raw = loess_smooth(clean_x, clean_y, x, frac=0.28, deg=2)
|
||
|
||
# ==================== 星期效应:比例方式 ====================
|
||
# 在干净数据上计算残差率 = (实际 - 趋势) / 趋势
|
||
residual_pct = (clean_y - trend_raw[clean_mask]) / trend_raw[clean_mask]
|
||
dow_ratio = np.zeros(7)
|
||
for d in range(7):
|
||
mask = clean_dow == d
|
||
if np.sum(mask) > 0:
|
||
dow_ratio[d] = np.mean(residual_pct[mask])
|
||
|
||
# 调整使比例总和为0(不影响总注册人数)
|
||
dow_ratio -= np.mean(dow_ratio)
|
||
|
||
print("=" * 80)
|
||
print("2026年3-4月 每日新增注册回归分析 V4(比例潮汐 + 保底)")
|
||
print("=" * 80)
|
||
|
||
print(f"\n📊 星期潮汐效应(比例方式):")
|
||
dow_names = ['周一', '周二', '周三', '周四', '周五', '周六', '周日']
|
||
for i, name in enumerate(dow_names):
|
||
bar = '█' * max(1, int(abs(dow_ratio[i]) * 200))
|
||
sign = '+' if dow_ratio[i] >= 0 else ''
|
||
print(f" {name}: {sign}{dow_ratio[i]*100:.1f}% {bar}")
|
||
|
||
# ==================== 最终拟合 = 趋势 × (1 + 星期比例) ====================
|
||
trend_with_dow = trend_raw * (1 + dow_ratio[dow])
|
||
trend_with_dow = np.maximum(trend_with_dow, 0)
|
||
|
||
# ==================== 有效注册人数 ====================
|
||
# 规则:活动日 → min(实际, 拟合);非活动日 → 保留实际值
|
||
effective = np.where(is_affected, np.minimum(values, trend_with_dow), values)
|
||
|
||
print(f"\n{'='*80}")
|
||
print("📋 2026年3-4月 最终结果")
|
||
print(f"{'='*80}")
|
||
print(f"{'日期':>12} {'星期':>4} | {'实际':>5} | {'影响':>10} | {'趋势':>6} | {'趋势+潮汐':>9} | {'有效注册':>8} | {'剔除':>6}")
|
||
print("-" * 95)
|
||
|
||
mar_start = dates.index('2026-03-01')
|
||
apr_end = dates.index('2026-04-30') + 1
|
||
|
||
for i in range(mar_start, apr_end):
|
||
d = dates[i]
|
||
dw = date_series[i].strftime('%a')
|
||
tag = affected_reason.get(d, '—')
|
||
trend_val = trend_raw[i]
|
||
tdow_val = trend_with_dow[i]
|
||
eff_val = effective[i]
|
||
removed = values[i] - eff_val
|
||
print(f"{d:>12} {dw:>4} | {values[i]:>5.0f} | {tag:>10} | {trend_val:>6.0f} | {tdow_val:>9.0f} | {eff_val:>8.0f} | {removed:>+6.0f}")
|
||
|
||
# ==================== 汇总 ====================
|
||
mar_apr_actual = np.sum(values[mar_start:apr_end])
|
||
mar_apr_eff = np.sum(effective[mar_start:apr_end])
|
||
mar_apr_affected_count = int(np.sum(is_affected[mar_start:apr_end]))
|
||
|
||
print(f"\n{'='*80}")
|
||
print("📊 最终汇总")
|
||
print(f"{'='*80}")
|
||
print(f" 统计天数: {apr_end - mar_start}")
|
||
print(f" 活动影响天数: {mar_apr_affected_count}")
|
||
print(f"")
|
||
print(f" 实际注册总计: {mar_apr_actual:.0f}")
|
||
print(f" 有效注册总计: {mar_apr_eff:.0f}")
|
||
print(f" 剔除活动量: {mar_apr_actual - mar_apr_eff:.0f} 人 ({(mar_apr_actual-mar_apr_eff)/mar_apr_actual*100:.1f}%)")
|
||
print(f"")
|
||
print(f" 实际日均: {np.mean(values[mar_start:apr_end]):.0f}")
|
||
print(f" 有效日均: {np.mean(effective[mar_start:apr_end]):.0f}")
|
||
print(f"")
|
||
print(f" 1-2月干净日均: {np.mean(values[:mar_start][clean_mask[:mar_start]]):.0f}")
|
||
print(f" 3-4月有效日均: {np.mean(effective[mar_start:apr_end]):.0f}")
|
||
|
||
# ==================== 对比 V3 固定值 vs V4 比例 ====================
|
||
print(f"\n{'='*80}")
|
||
print("📊 V3(固定补偿) vs V4(比例补偿) 对比")
|
||
print(f"{'='*80}")
|
||
# 用固定值方式算一遍做对比
|
||
residuals_abs = clean_y - trend_raw[clean_mask]
|
||
dow_abs = np.zeros(7)
|
||
for d in range(7):
|
||
mask = clean_dow == d
|
||
if np.sum(mask) > 0:
|
||
dow_abs[d] = np.mean(residuals_abs[mask])
|
||
dow_abs -= np.mean(dow_abs)
|
||
trend_v3 = trend_raw + dow_abs[dow]
|
||
trend_v3 = np.maximum(trend_v3, 0)
|
||
eff_v3 = np.where(is_affected, np.minimum(values, trend_v3), values)
|
||
|
||
print(f"{'方法':>20} | {'有效总计':>10} | {'剔除':>8} | {'有效日均':>8}")
|
||
print(f"{'V3 固定值补偿':>20} | {np.sum(eff_v3[mar_start:apr_end]):>10.0f} | {mar_apr_actual - np.sum(eff_v3[mar_start:apr_end]):>8.0f} | {np.mean(eff_v3[mar_start:apr_end]):>8.1f}")
|
||
print(f"{'V4 比例补偿':>20} | {mar_apr_eff:>10.0f} | {mar_apr_actual - mar_apr_eff:>8.0f} | {np.mean(effective[mar_start:apr_end]):>8.1f}")
|
||
|
||
# 列出差异较大的日期
|
||
print(f"\n V3 vs V4 有效注册差异 > 3 的日期:")
|
||
diff = effective[mar_start:apr_end] - eff_v3[mar_start:apr_end]
|
||
for i in range(len(diff)):
|
||
if abs(diff[i]) > 3:
|
||
d = dates[mar_start + i]
|
||
sign = '+' if diff[i] > 0 else ''
|
||
print(f" {d}: V3={eff_v3[mar_start+i]:.0f}, V4={effective[mar_start+i]:.0f}, 差异={sign}{diff[i]:.0f}")
|
||
|
||
# ==================== 导出Excel ====================
|
||
rows = []
|
||
for i in range(mar_start, apr_end):
|
||
rows.append({
|
||
'日期': dates[i],
|
||
'星期': date_series[i].strftime('%A'),
|
||
'实际注册人数': int(values[i]),
|
||
'是否活动影响': '是' if is_affected[i] else '否',
|
||
'活动说明': affected_reason.get(dates[i], ''),
|
||
'LOESS趋势': round(trend_raw[i], 0),
|
||
'趋势+潮汐(比例)': round(trend_with_dow[i], 0),
|
||
'有效注册人数(用于转化)': round(effective[i], 0),
|
||
'剔除人数': round(values[i] - effective[i], 0),
|
||
})
|
||
|
||
output_df = pd.DataFrame(rows)
|
||
output_path = '/root/.openclaw/workspace/output/2026_3-4月注册回归分析.xlsx'
|
||
|
||
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
|
||
output_df.to_excel(writer, sheet_name='每日明细', index=False)
|
||
|
||
summary_rows = [
|
||
('统计天数', apr_end - mar_start),
|
||
('活动影响天数', mar_apr_affected_count),
|
||
('', ''),
|
||
('实际注册总计', f'{mar_apr_actual:.0f}'),
|
||
('有效注册总计(用于转化)', f'{mar_apr_eff:.0f}'),
|
||
('剔除活动量', f'{mar_apr_actual - mar_apr_eff:.0f}'),
|
||
('剔除比例', f'{(mar_apr_actual-mar_apr_eff)/mar_apr_actual*100:.1f}%'),
|
||
('', ''),
|
||
('实际日均', f'{np.mean(values[mar_start:apr_end]):.1f}'),
|
||
('有效日均', f'{np.mean(effective[mar_start:apr_end]):.1f}'),
|
||
('', ''),
|
||
('1-2月干净数据日均', f'{np.mean(values[:mar_start][clean_mask[:mar_start]]):.1f}'),
|
||
('3-4月有效数据日均', f'{np.mean(effective[mar_start:apr_end]):.1f}'),
|
||
('', ''),
|
||
('星期效应-周一', f'{dow_ratio[0]*100:+.1f}%'),
|
||
('星期效应-周二', f'{dow_ratio[1]*100:+.1f}%'),
|
||
('星期效应-周三', f'{dow_ratio[2]*100:+.1f}%'),
|
||
('星期效应-周四', f'{dow_ratio[3]*100:+.1f}%'),
|
||
('星期效应-周五', f'{dow_ratio[4]*100:+.1f}%'),
|
||
('星期效应-周六', f'{dow_ratio[5]*100:+.1f}%'),
|
||
('星期效应-周日', f'{dow_ratio[6]*100:+.1f}%'),
|
||
]
|
||
pd.DataFrame(summary_rows, columns=['指标','值']).to_excel(writer, sheet_name='汇总', index=False)
|
||
|
||
print(f"\n✅ Excel 已导出: {output_path}")
|