447 lines
9.6 KiB
Python
447 lines
9.6 KiB
Python
#!/usr/bin/env python3
|
||
"""拟合版新增注册用户数:LOESS + 星期因子修正"""
|
||
import numpy as np
|
||
from datetime import date, timedelta
|
||
from collections import defaultdict
|
||
|
||
# ---- 原始每日新增数据 ----
|
||
raw_data = """2025-09-01,3
|
||
2025-09-02,10
|
||
2025-09-03,4
|
||
2025-09-04,5
|
||
2025-09-05,11
|
||
2025-09-06,8
|
||
2025-09-07,16
|
||
2025-09-08,11
|
||
2025-09-09,137
|
||
2025-09-10,63
|
||
2025-09-11,26
|
||
2025-09-12,27
|
||
2025-09-13,41
|
||
2025-09-14,39
|
||
2025-09-15,27
|
||
2025-09-16,57
|
||
2025-09-17,58
|
||
2025-09-18,55
|
||
2025-09-19,133
|
||
2025-09-20,104
|
||
2025-09-21,101
|
||
2025-09-22,132
|
||
2025-09-23,126
|
||
2025-09-24,71
|
||
2025-09-25,37
|
||
2025-09-26,34
|
||
2025-09-27,81
|
||
2025-09-28,35
|
||
2025-09-29,47
|
||
2025-09-30,30
|
||
2025-10-01,48
|
||
2025-10-02,62
|
||
2025-10-03,45
|
||
2025-10-04,42
|
||
2025-10-05,45
|
||
2025-10-06,62
|
||
2025-10-07,42
|
||
2025-10-08,45
|
||
2025-10-09,36
|
||
2025-10-10,62
|
||
2025-10-11,90
|
||
2025-10-12,93
|
||
2025-10-13,162
|
||
2025-10-14,131
|
||
2025-10-15,112
|
||
2025-10-16,131
|
||
2025-10-17,215
|
||
2025-10-18,129
|
||
2025-10-19,81
|
||
2025-10-20,44
|
||
2025-10-21,41
|
||
2025-10-22,45
|
||
2025-10-23,37
|
||
2025-10-24,56
|
||
2025-10-25,79
|
||
2025-10-26,50
|
||
2025-10-27,89
|
||
2025-10-28,86
|
||
2025-10-29,82
|
||
2025-10-30,92
|
||
2025-10-31,78
|
||
2025-11-01,129
|
||
2025-11-02,168
|
||
2025-11-03,77
|
||
2025-11-04,68
|
||
2025-11-05,48
|
||
2025-11-06,67
|
||
2025-11-07,177
|
||
2025-11-08,160
|
||
2025-11-09,105
|
||
2025-11-10,78
|
||
2025-11-11,72
|
||
2025-11-12,234
|
||
2025-11-13,104
|
||
2025-11-14,69
|
||
2025-11-15,89
|
||
2025-11-16,63
|
||
2025-11-17,70
|
||
2025-11-18,82
|
||
2025-11-19,155
|
||
2025-11-20,71
|
||
2025-11-21,90
|
||
2025-11-22,71
|
||
2025-11-23,91
|
||
2025-11-24,47
|
||
2025-11-25,77
|
||
2025-11-26,93
|
||
2025-11-27,92
|
||
2025-11-28,77
|
||
2025-11-29,122
|
||
2025-11-30,125
|
||
2025-12-01,91
|
||
2025-12-02,94
|
||
2025-12-03,188
|
||
2025-12-04,131
|
||
2025-12-05,125
|
||
2025-12-06,190
|
||
2025-12-07,194
|
||
2025-12-08,108
|
||
2025-12-09,116
|
||
2025-12-10,110
|
||
2025-12-11,104
|
||
2025-12-12,120
|
||
2025-12-13,190
|
||
2025-12-14,172
|
||
2025-12-15,98
|
||
2025-12-16,100
|
||
2025-12-17,97
|
||
2025-12-18,85
|
||
2025-12-19,142
|
||
2025-12-20,127
|
||
2025-12-21,131
|
||
2025-12-22,69
|
||
2025-12-23,77
|
||
2025-12-24,108
|
||
2025-12-25,84
|
||
2025-12-26,89
|
||
2025-12-27,107
|
||
2025-12-28,95
|
||
2025-12-29,50
|
||
2025-12-30,67
|
||
2025-12-31,64
|
||
2026-01-01,77
|
||
2026-01-02,74
|
||
2026-01-03,69
|
||
2026-01-04,42
|
||
2026-01-05,56
|
||
2026-01-06,33
|
||
2026-01-07,52
|
||
2026-01-08,59
|
||
2026-01-09,58
|
||
2026-01-10,83
|
||
2026-01-11,75
|
||
2026-01-12,34
|
||
2026-01-13,46
|
||
2026-01-14,59
|
||
2026-01-15,31
|
||
2026-01-16,31
|
||
2026-01-17,66
|
||
2026-01-18,71
|
||
2026-01-19,53
|
||
2026-01-20,48
|
||
2026-01-21,40
|
||
2026-01-22,62
|
||
2026-01-23,46
|
||
2026-01-24,72
|
||
2026-01-25,86
|
||
2026-01-26,61
|
||
2026-01-27,57
|
||
2026-01-28,146
|
||
2026-01-29,102
|
||
2026-01-30,87
|
||
2026-01-31,61
|
||
2026-02-01,54
|
||
2026-02-02,53
|
||
2026-02-03,42
|
||
2026-02-04,39
|
||
2026-02-05,42
|
||
2026-02-06,55
|
||
2026-02-07,36
|
||
2026-02-08,47
|
||
2026-02-09,40
|
||
2026-02-10,60
|
||
2026-02-11,265
|
||
2026-02-12,59
|
||
2026-02-13,42
|
||
2026-02-14,31
|
||
2026-02-15,41
|
||
2026-02-16,28
|
||
2026-02-17,52
|
||
2026-02-18,23
|
||
2026-02-19,34
|
||
2026-02-20,26
|
||
2026-02-21,36
|
||
2026-02-22,35
|
||
2026-02-23,46
|
||
2026-02-24,45
|
||
2026-02-25,64
|
||
2026-02-26,180
|
||
2026-02-27,101
|
||
2026-02-28,167
|
||
2026-03-01,124
|
||
2026-03-02,106
|
||
2026-03-03,76
|
||
2026-03-04,105
|
||
2026-03-05,447
|
||
2026-03-06,216
|
||
2026-03-07,239
|
||
2026-03-08,206
|
||
2026-03-09,167
|
||
2026-03-10,108
|
||
2026-03-11,131
|
||
2026-03-12,177
|
||
2026-03-13,243
|
||
2026-03-14,131
|
||
2026-03-15,131
|
||
2026-03-16,108
|
||
2026-03-17,94
|
||
2026-03-18,95
|
||
2026-03-19,77
|
||
2026-03-20,103
|
||
2026-03-21,133
|
||
2026-03-22,126
|
||
2026-03-23,84
|
||
2026-03-24,80
|
||
2026-03-25,94
|
||
2026-03-26,85
|
||
2026-03-27,95
|
||
2026-03-28,104
|
||
2026-03-29,101
|
||
2026-03-30,81
|
||
2026-03-31,99
|
||
2026-04-01,138
|
||
2026-04-02,162
|
||
2026-04-03,771
|
||
2026-04-04,340
|
||
2026-04-05,185
|
||
2026-04-06,206
|
||
2026-04-07,212
|
||
2026-04-08,749
|
||
2026-04-09,336
|
||
2026-04-10,128
|
||
2026-04-11,174
|
||
2026-04-12,150
|
||
2026-04-13,116
|
||
2026-04-14,132
|
||
2026-04-15,126
|
||
2026-04-16,115
|
||
2026-04-17,84
|
||
2026-04-18,117
|
||
2026-04-19,119
|
||
2026-04-20,88
|
||
2026-04-21,97
|
||
2026-04-22,179
|
||
2026-04-23,139
|
||
2026-04-24,121
|
||
2026-04-25,140
|
||
2026-04-26,137
|
||
2026-04-27,120
|
||
2026-04-28,163
|
||
2026-04-29,65
|
||
2026-04-30,58
|
||
2026-05-01,63
|
||
2026-05-02,60
|
||
2026-05-03,60
|
||
2026-05-04,52
|
||
2026-05-05,70
|
||
2026-05-06,98
|
||
2026-05-07,133
|
||
2026-05-08,86
|
||
2026-05-09,89
|
||
2026-05-10,106
|
||
2026-05-11,98
|
||
2026-05-12,82
|
||
2026-05-13,99
|
||
2026-05-14,98
|
||
2026-05-15,166
|
||
2026-05-16,116
|
||
2026-05-17,123
|
||
2026-05-18,88
|
||
2026-05-19,115
|
||
2026-05-20,132
|
||
2026-05-21,130
|
||
2026-05-22,127
|
||
2026-05-23,200
|
||
2026-05-24,121
|
||
2026-05-25,67
|
||
2026-05-26,104
|
||
2026-05-27,99
|
||
2026-05-28,112
|
||
2026-05-29,107
|
||
2026-05-30,121
|
||
2026-05-31,178"""
|
||
|
||
# Parse the raw "date,count" rows into a dict keyed by the ISO date string.
# Unpacking each split row into exactly two fields makes a malformed row
# fail loudly with ValueError instead of being silently accepted.
data = {
    dt: int(nu)
    for dt, nu in (row.split(',') for row in raw_data.strip().split('\n'))
}

# ISO-8601 date strings sort chronologically when sorted lexicographically.
dates = sorted(data)
start_date, end_date = (date.fromisoformat(dates[pos]) for pos in (0, -1))
|
||
|
||
# ---- Activity calendar (activity days plus after-effect days) ----
# ISO date strings of campaign days and the spill-over ("after-effect") days
# that follow them. Built as one set literal so membership tests are O(1)
# and the calendar can be read in a single place.
activity_dates = {
    # 2025: 9/9-10, 9/19-23, 10/13-14, 10/16-17, 11/2, 11/7, 11/10, 11/12,
    #       11/19, 12/3
    '2025-09-09', '2025-09-10',
    '2025-09-19', '2025-09-20', '2025-09-21', '2025-09-22', '2025-09-23',
    '2025-10-13', '2025-10-14', '2025-10-16', '2025-10-17',
    '2025-11-02', '2025-11-07', '2025-11-10', '2025-11-12', '2025-11-19',
    '2025-12-03',
    # 2026: 1/28-29 (1 after-effect day), 2/11, 2/26-3/2 (4 after-effect
    #       days), 3/5-8 (3 after-effect days), 3/9, 3/12-13,
    #       4/3-7 (4 after-effect days), 4/8-10 (2 after-effect days),
    #       4/22-23 (1 after-effect day), 4/28, 5/6-7
    '2026-01-28', '2026-01-29',
    '2026-02-11',
    '2026-02-26', '2026-02-27', '2026-02-28', '2026-03-01', '2026-03-02',
    '2026-03-05', '2026-03-06', '2026-03-07', '2026-03-08',
    '2026-03-09',
    '2026-03-12', '2026-03-13',
    '2026-04-03', '2026-04-04', '2026-04-05', '2026-04-06', '2026-04-07',
    '2026-04-08', '2026-04-09', '2026-04-10',
    '2026-04-22', '2026-04-23',
    '2026-04-28',
    '2026-05-06', '2026-05-07',
}

print(f"活动日+余波日总数: {len(activity_dates)}")
|
||
|
||
# ---- LOESS 实现 ----
|
||
def loess(x, y, x_eval, frac=0.236):
    """Smooth ``y`` over ``x`` with a simple LOESS (local linear regression).

    For each evaluation point the ``k = max(int(n * frac), 3)`` nearest
    samples (capped at ``n``) are selected and weighted with a tricube kernel
    scaled by the distance to the farthest selected sample. A weighted
    least-squares line is fitted to them and evaluated at the point.

    Fallbacks:
      * if the weighted x-variance is (numerically) zero, the weighted mean
        of ``y`` is returned for that point;
      * if every weight is zero (all selected samples are equally far away),
        the plain mean of ``y`` is returned for that point.

    Args:
        x: 1-D array-like of sample positions.
        y: 1-D array-like of sample values, same length as ``x``.
        x_eval: Array-like of positions to evaluate; a scalar is treated as
            a single evaluation point.
        frac: Fraction of the samples used for each local fit.

    Returns:
        ``numpy.ndarray`` of floats with one smoothed value per evaluation
        point.

    Raises:
        ValueError: If ``x`` is empty, or ``x`` and ``y`` are not 1-D arrays
            of equal length.
    """
    # Accept plain Python sequences as well as arrays.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    x_eval = np.atleast_1d(np.asarray(x_eval, dtype=float))

    if x.ndim != 1 or x.shape != y.shape:
        raise ValueError("x and y must be 1-D sequences of equal length")
    n = x.size
    if n == 0:
        raise ValueError("loess requires at least one sample")

    k = min(max(int(n * frac), 3), n)
    y_smooth = np.zeros(len(x_eval))

    for i, xi in enumerate(x_eval):
        # Work in coordinates centred on the evaluation point. The fitted
        # value at xi is then simply the intercept, and the variance term
        # below does not suffer cancellation when x values are large.
        dx = x - xi
        dists = np.abs(dx)
        idx = np.argsort(dists)[:k]
        max_dist = dists[idx[-1]]

        # Tricube weights on the k nearest samples; everything else stays 0.
        weights = np.zeros(n)
        if max_dist > 0:
            u = dists[idx] / max_dist  # in [0, 1]; u == 1 yields weight 0
            weights[idx] = (1 - u**3) ** 3
        else:
            # All selected samples coincide with xi: weight them equally.
            weights[idx] = 1.0

        w_sum = weights.sum()
        if w_sum <= 0:
            y_smooth[i] = y.mean()
            continue

        # Weighted moments for the closed-form weighted linear regression.
        dx_w = (dx * weights).sum() / w_sum
        y_w = (y * weights).sum() / w_sum
        dxy_w = (dx * y * weights).sum() / w_sum
        dx2_w = (dx * dx * weights).sum() / w_sum

        denom = dx2_w - dx_w**2
        if abs(denom) > 1e-10:
            slope = (dxy_w - dx_w * y_w) / denom
            # Intercept of the local line at dx == 0, i.e. at x == xi.
            y_smooth[i] = y_w - slope * dx_w
        else:
            y_smooth[i] = y_w

    return y_smooth
|
||
|
||
# Build the full series: y_all holds the daily counts in date order and
# x_all the matching 0-based day index.
y_all = np.array([data[day] for day in dates])
x_all = np.arange(len(dates))

# Indices of the "clean" days, i.e. days that are not in the activity calendar.
clean_idx = [
    pos for pos, day in enumerate(dates)
    if day not in activity_dates
]
x_clean, y_clean = x_all[clean_idx], y_all[clean_idx]

print(f"清洁日数量: {len(clean_idx)} / {len(dates)} ({len(clean_idx)/len(dates)*100:.1f}%)")

# LOESS baseline fitted on, and evaluated at, the clean days only.
y_loess_clean = loess(x_clean, y_clean, x_clean, frac=0.236)

# The same clean-day fit evaluated at every day index, so that activity days
# also receive a baseline value.
y_loess_all = loess(x_clean, y_clean, x_all, frac=0.236)
|
||
|
||
# ---- Weekday factor correction ----
# For each weekday, the factor is the mean clean-day count on that weekday
# divided by the mean count over all clean days.
weekday_sums = defaultdict(list)
for i in clean_idx:
    # date.weekday(): 0 = Monday ... 6 = Sunday
    weekday_sums[date.fromisoformat(dates[i]).weekday()].append(y_all[i])

global_mean_clean = y_clean.mean()

weekday_factors = {}
for wd in range(7):
    samples = weekday_sums[wd]
    # A weekday with no clean samples gets the neutral factor 1.0.
    weekday_factors[wd] = (
        np.mean(samples) / global_mean_clean if samples else 1.0
    )

day_names = ['周一','周二','周三','周四','周五','周六','周日']
print("\n星期因子(基于清洁日):")
for wd in range(7):
    print(f" {day_names[wd]}: {weekday_factors[wd]:.4f}")
|
||
|
||
# ---- Fitted daily registrations ----
fitted_daily = {}
for i, d in enumerate(dates):
    if d not in activity_dates:
        # Non-activity day: keep the observed value.
        fitted_daily[d] = y_all[i]
        continue
    # Activity / after-effect day: replace the observation with the LOESS
    # baseline scaled by that day's weekday factor.
    wd = date.fromisoformat(d).weekday()
    fitted_daily[d] = y_loess_all[i] * weekday_factors[wd]
|
||
|
||
# ---- Monthly summary ----
# Aggregate observed and fitted counts per "YYYY-MM" key, then print one row
# per month with the absolute and relative reduction.
print("\n" + "=" * 70)
print("拟合版 vs 原始版 月度新增注册用户数")
print("=" * 70)
print(f"{'月份':<10} {'原始新增':>8} {'拟合新增':>8} {'差值':>8} {'压降比例':>8}")
print("-" * 50)

monthly_original = defaultdict(int)
monthly_fitted = defaultdict(float)
for d in dates:
    month_key = d[:7]  # "YYYY-MM" prefix of the ISO date
    monthly_original[month_key] += data[d]
    monthly_fitted[month_key] += fitted_daily[d]

for mon in sorted(monthly_original):
    orig, fit = monthly_original[mon], monthly_fitted[mon]
    diff = orig - fit
    # Guard against an all-zero month when computing the percentage.
    if orig > 0:
        pct = diff / orig * 100
    else:
        pct = 0
    print(f"{mon:<10} {orig:>8,} {fit:>8,.0f} {diff:>8,.0f} {pct:>7.1f}%")
|
||
|
||
# ---- March / April / May detail ----
# Print observed vs. fitted totals and the relative reduction for the three
# months of interest.
print("\n" + "=" * 70)
print("3/4/5月 拟合版详细")
print("=" * 70)
for mon in ['2026-03', '2026-04', '2026-05']:
    # .get() avoids inserting phantom keys into the defaultdicts when a
    # month is absent from the data.
    orig = monthly_original.get(mon, 0)
    fit = monthly_fitted.get(mon, 0)
    diff = orig - fit
    # Same zero guard as the monthly summary: a month with no registrations
    # reports 0% instead of raising ZeroDivisionError.
    pct = diff / orig * 100 if orig > 0 else 0
    print(f" {mon}: 原始 {orig:,} → 拟合 {fit:,.0f}(压降 {pct:.1f}%)")
|
||
|
||
# ---- March / April / May: daily fitted vs. observed ----
# Activity days are flagged with '*' in the last column.
print("\n" + "=" * 70)
print("3/4/5月每日对比(活动日标记 *)")
print("=" * 70)
print(f"{'日期':<12} {'原始':>6} {'拟合':>8} {'是否活动':>6}")
print("-" * 40)
for d in dates:
    # ISO date strings compare chronologically, so a plain string range
    # check selects 2026-03-01 through 2026-05-31.
    if not ('2026-03' <= d <= '2026-05-31'):
        continue
    if d in activity_dates:
        is_act = '*'
    else:
        is_act = ''
    print(f"{d:<12} {data[d]:>6} {fitted_daily[d]:>8.0f} {is_act:>6}")
|