ai_member_xiaoxi/scripts/fitted_registration.py
2026-06-02 08:00:01 +08:00

447 lines
9.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Fitted new-registration counts: LOESS baseline plus day-of-week factor correction."""
import numpy as np
from datetime import date, timedelta
from collections import defaultdict
# ---- Raw daily new-registration data ----
# One record per line in the form "YYYY-MM-DD,count": an ISO date followed by
# the integer count for that day. The records run from 2025-09-01 through
# 2026-05-31 in chronological order.
raw_data = """2025-09-01,3
2025-09-02,10
2025-09-03,4
2025-09-04,5
2025-09-05,11
2025-09-06,8
2025-09-07,16
2025-09-08,11
2025-09-09,137
2025-09-10,63
2025-09-11,26
2025-09-12,27
2025-09-13,41
2025-09-14,39
2025-09-15,27
2025-09-16,57
2025-09-17,58
2025-09-18,55
2025-09-19,133
2025-09-20,104
2025-09-21,101
2025-09-22,132
2025-09-23,126
2025-09-24,71
2025-09-25,37
2025-09-26,34
2025-09-27,81
2025-09-28,35
2025-09-29,47
2025-09-30,30
2025-10-01,48
2025-10-02,62
2025-10-03,45
2025-10-04,42
2025-10-05,45
2025-10-06,62
2025-10-07,42
2025-10-08,45
2025-10-09,36
2025-10-10,62
2025-10-11,90
2025-10-12,93
2025-10-13,162
2025-10-14,131
2025-10-15,112
2025-10-16,131
2025-10-17,215
2025-10-18,129
2025-10-19,81
2025-10-20,44
2025-10-21,41
2025-10-22,45
2025-10-23,37
2025-10-24,56
2025-10-25,79
2025-10-26,50
2025-10-27,89
2025-10-28,86
2025-10-29,82
2025-10-30,92
2025-10-31,78
2025-11-01,129
2025-11-02,168
2025-11-03,77
2025-11-04,68
2025-11-05,48
2025-11-06,67
2025-11-07,177
2025-11-08,160
2025-11-09,105
2025-11-10,78
2025-11-11,72
2025-11-12,234
2025-11-13,104
2025-11-14,69
2025-11-15,89
2025-11-16,63
2025-11-17,70
2025-11-18,82
2025-11-19,155
2025-11-20,71
2025-11-21,90
2025-11-22,71
2025-11-23,91
2025-11-24,47
2025-11-25,77
2025-11-26,93
2025-11-27,92
2025-11-28,77
2025-11-29,122
2025-11-30,125
2025-12-01,91
2025-12-02,94
2025-12-03,188
2025-12-04,131
2025-12-05,125
2025-12-06,190
2025-12-07,194
2025-12-08,108
2025-12-09,116
2025-12-10,110
2025-12-11,104
2025-12-12,120
2025-12-13,190
2025-12-14,172
2025-12-15,98
2025-12-16,100
2025-12-17,97
2025-12-18,85
2025-12-19,142
2025-12-20,127
2025-12-21,131
2025-12-22,69
2025-12-23,77
2025-12-24,108
2025-12-25,84
2025-12-26,89
2025-12-27,107
2025-12-28,95
2025-12-29,50
2025-12-30,67
2025-12-31,64
2026-01-01,77
2026-01-02,74
2026-01-03,69
2026-01-04,42
2026-01-05,56
2026-01-06,33
2026-01-07,52
2026-01-08,59
2026-01-09,58
2026-01-10,83
2026-01-11,75
2026-01-12,34
2026-01-13,46
2026-01-14,59
2026-01-15,31
2026-01-16,31
2026-01-17,66
2026-01-18,71
2026-01-19,53
2026-01-20,48
2026-01-21,40
2026-01-22,62
2026-01-23,46
2026-01-24,72
2026-01-25,86
2026-01-26,61
2026-01-27,57
2026-01-28,146
2026-01-29,102
2026-01-30,87
2026-01-31,61
2026-02-01,54
2026-02-02,53
2026-02-03,42
2026-02-04,39
2026-02-05,42
2026-02-06,55
2026-02-07,36
2026-02-08,47
2026-02-09,40
2026-02-10,60
2026-02-11,265
2026-02-12,59
2026-02-13,42
2026-02-14,31
2026-02-15,41
2026-02-16,28
2026-02-17,52
2026-02-18,23
2026-02-19,34
2026-02-20,26
2026-02-21,36
2026-02-22,35
2026-02-23,46
2026-02-24,45
2026-02-25,64
2026-02-26,180
2026-02-27,101
2026-02-28,167
2026-03-01,124
2026-03-02,106
2026-03-03,76
2026-03-04,105
2026-03-05,447
2026-03-06,216
2026-03-07,239
2026-03-08,206
2026-03-09,167
2026-03-10,108
2026-03-11,131
2026-03-12,177
2026-03-13,243
2026-03-14,131
2026-03-15,131
2026-03-16,108
2026-03-17,94
2026-03-18,95
2026-03-19,77
2026-03-20,103
2026-03-21,133
2026-03-22,126
2026-03-23,84
2026-03-24,80
2026-03-25,94
2026-03-26,85
2026-03-27,95
2026-03-28,104
2026-03-29,101
2026-03-30,81
2026-03-31,99
2026-04-01,138
2026-04-02,162
2026-04-03,771
2026-04-04,340
2026-04-05,185
2026-04-06,206
2026-04-07,212
2026-04-08,749
2026-04-09,336
2026-04-10,128
2026-04-11,174
2026-04-12,150
2026-04-13,116
2026-04-14,132
2026-04-15,126
2026-04-16,115
2026-04-17,84
2026-04-18,117
2026-04-19,119
2026-04-20,88
2026-04-21,97
2026-04-22,179
2026-04-23,139
2026-04-24,121
2026-04-25,140
2026-04-26,137
2026-04-27,120
2026-04-28,163
2026-04-29,65
2026-04-30,58
2026-05-01,63
2026-05-02,60
2026-05-03,60
2026-05-04,52
2026-05-05,70
2026-05-06,98
2026-05-07,133
2026-05-08,86
2026-05-09,89
2026-05-10,106
2026-05-11,98
2026-05-12,82
2026-05-13,99
2026-05-14,98
2026-05-15,166
2026-05-16,116
2026-05-17,123
2026-05-18,88
2026-05-19,115
2026-05-20,132
2026-05-21,130
2026-05-22,127
2026-05-23,200
2026-05-24,121
2026-05-25,67
2026-05-26,104
2026-05-27,99
2026-05-28,112
2026-05-29,107
2026-05-30,121
2026-05-31,178"""
# Parse the raw text into a mapping of ISO date string -> daily count.
data = {}
for record in raw_data.strip().split('\n'):
    day_text, count_text = record.split(',')
    data[day_text] = int(count_text)
# ISO "YYYY-MM-DD" strings sort chronologically under plain string ordering.
dates = sorted(data)
start_date, end_date = (date.fromisoformat(edge) for edge in (dates[0], dates[-1]))
# ---- Activity calendar (activity days + aftermath days) ----
# ISO date strings of every activity day and of the aftermath days that follow
# an activity. Written as a single set literal so the calendar can be read and
# edited in one place instead of being accumulated by per-element add() loops.
activity_dates = {
    # 2025: 9/9-10, 9/19-23, 10/13-14, 10/16-17, 11/2, 11/7, 11/10, 11/12,
    #       11/19, 12/3
    '2025-09-09', '2025-09-10',
    '2025-09-19', '2025-09-20', '2025-09-21', '2025-09-22', '2025-09-23',
    '2025-10-13', '2025-10-14', '2025-10-16', '2025-10-17',
    '2025-11-02', '2025-11-07', '2025-11-10', '2025-11-12', '2025-11-19',
    '2025-12-03',
    # 2026: 1/28-29 (1 aftermath day), 2/11, 2/26-3/2 (4 aftermath days),
    #       3/5-8 (3 aftermath days), 3/9, 3/12-13, 4/3-7 (4 aftermath days),
    #       4/8-10 (2 aftermath days), 4/22-23 (1 aftermath day), 4/28, 5/6-7
    '2026-01-28', '2026-01-29',
    '2026-02-11',
    '2026-02-26', '2026-02-27', '2026-02-28', '2026-03-01', '2026-03-02',
    '2026-03-05', '2026-03-06', '2026-03-07', '2026-03-08',
    '2026-03-09',
    '2026-03-12', '2026-03-13',
    '2026-04-03', '2026-04-04', '2026-04-05', '2026-04-06', '2026-04-07',
    '2026-04-08', '2026-04-09', '2026-04-10',
    '2026-04-22', '2026-04-23',
    '2026-04-28',
    '2026-05-06', '2026-05-07',
}
print(f"活动日+余波日总数: {len(activity_dates)}")
# ---- LOESS implementation ----
def loess(x, y, x_eval, frac=0.236):
    """Smooth ``y`` over ``x`` with a simple LOESS (local linear regression).

    For every evaluation point the ``k = max(int(len(x) * frac), 3)`` nearest
    observations are weighted with the tricube kernel, a weighted straight
    line is fitted through them, and the line's value at the evaluation point
    is returned.

    Args:
        x: 1-D array-like of observation positions.
        y: 1-D array-like of observed values, same length as ``x``.
        x_eval: Position(s) at which to evaluate the smoother; a scalar or a
            1-D array-like.
        frac: Fraction of the observations used in each local fit.

    Returns:
        A float ``numpy.ndarray`` with one smoothed value per entry of
        ``x_eval``.

    Raises:
        ValueError: If ``x`` is empty or ``x`` and ``y`` differ in length.
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    x_eval = np.asarray(x_eval, dtype=float).ravel()

    n = x.size
    if n == 0 or y.size != n:
        raise ValueError("x and y must be non-empty and of equal length")

    # Neighbourhood size: at least 3 points, never more than are available.
    k = min(max(int(n * frac), 3), n)
    y_smooth = np.zeros(x_eval.size)

    for i, xi in enumerate(x_eval):
        # Work in coordinates centred on the evaluation point. The fitted
        # value at xi is then just the intercept, and the variance below is
        # computed from small offsets instead of as E[x^2] - E[x]^2, which
        # loses precision when x values are large.
        dx = x - xi
        dists = np.abs(dx)
        # Stable sort so ties between equidistant points resolve reproducibly.
        idx = np.argsort(dists, kind="stable")[:k]
        max_dist = dists[idx[-1]]

        # Tricube weights on the k nearest points; everything else stays 0.
        weights = np.zeros(n)
        if max_dist > 0:
            u = dists[idx] / max_dist
            weights[idx] = np.where(u < 1, (1 - u**3) ** 3, 0.0)
        else:
            # All selected points coincide with xi: weight them equally.
            weights[idx] = 1.0

        w_sum = weights.sum()
        if w_sum <= 0:
            # Every neighbour sits exactly on the window edge (weight 0);
            # fall back to the plain mean of y.
            y_smooth[i] = y.mean()
            continue

        # Weighted linear regression of y on dx.
        dx_w = (weights * dx).sum() / w_sum
        y_w = (weights * y).sum() / w_sum
        dev = dx - dx_w
        var = (weights * dev * dev).sum() / w_sum
        if var > 1e-10:
            slope = (weights * dev * (y - y_w)).sum() / w_sum / var
            # Intercept of the local line, i.e. its value at dx == 0 (x == xi).
            y_smooth[i] = y_w - slope * dx_w
        else:
            # No spread in x among the weighted points: use the weighted mean.
            y_smooth[i] = y_w
    return y_smooth
# ---- Prepare the series ----
# x is the number of days elapsed since the first date rather than the
# positional index, so the smoother works on a true time axis even if a date
# were missing from the data. For gap-free data both are identical.
x_all = np.array([(date.fromisoformat(d) - start_date).days for d in dates])
y_all = np.array([data[d] for d in dates])
# Positions of the clean days, i.e. days not in the activity calendar.
clean_idx = [i for i, d in enumerate(dates) if d not in activity_dates]
x_clean = x_all[clean_idx]
y_clean = y_all[clean_idx]
print(f"清洁日数量: {len(clean_idx)} / {len(dates)} ({len(clean_idx)/len(dates)*100:.1f}%)")
# LOESS fitted on the clean days only and evaluated at every date, so that
# activity days receive an interpolated baseline value.
y_loess_all = loess(x_clean, y_clean, x_all, frac=0.236)
# loess() smooths each evaluation point independently, so the fit at the clean
# days is exactly the matching subset of the all-dates result; a second LOESS
# pass over the same points is unnecessary.
y_loess_clean = y_loess_all[clean_idx]
# ---- Day-of-week factor correction ----
# Collect the clean-day counts per weekday (0 = Monday ... 6 = Sunday).
weekday_sums = defaultdict(list)
for pos in clean_idx:
    weekday_sums[date.fromisoformat(dates[pos]).weekday()].append(y_all[pos])
global_mean_clean = y_clean.mean()
# Factor = mean count on that weekday / mean count over all clean days.
# A weekday with no clean observations gets the neutral factor 1.0.
weekday_factors = {
    wd: (np.mean(weekday_sums[wd]) / global_mean_clean if weekday_sums[wd] else 1.0)
    for wd in range(7)
}
day_names = ['周一','周二','周三','周四','周五','周六','周日']
print("\n星期因子(基于清洁日):")
for wd, label in enumerate(day_names):
    print(f" {label}: {weekday_factors[wd]:.4f}")
# ---- Fitted daily registrations ----
# Activity/aftermath days are replaced by the LOESS baseline scaled by that
# weekday's factor; every other day keeps its observed value.
fitted_daily = {
    d: (
        y_loess_all[i] * weekday_factors[date.fromisoformat(d).weekday()]
        if d in activity_dates
        else y_all[i]
    )
    for i, d in enumerate(dates)
}
# ---- Monthly summary ----
# Accumulate observed and fitted totals per "YYYY-MM" month key.
monthly_original = defaultdict(int)
monthly_fitted = defaultdict(float)
for d in dates:
    month_key = d[:7]
    monthly_original[month_key] += data[d]
    monthly_fitted[month_key] += fitted_daily[d]

banner = "=" * 70
print("\n" + banner)
print("拟合版 vs 原始版 月度新增注册用户数")
print(banner)
print(f"{'月份':<10} {'原始新增':>8} {'拟合新增':>8} {'差值':>8} {'压降比例':>8}")
print("-" * 50)
# One row per month: observed total, fitted total, their gap, and the gap as
# a percentage of the observed total (0 when the observed total is 0).
for mon in sorted(monthly_original):
    orig = monthly_original[mon]
    fit = monthly_fitted[mon]
    diff = orig - fit
    if orig > 0:
        pct = diff / orig * 100
    else:
        pct = 0
    print(f"{mon:<10} {orig:>8,} {fit:>8,.0f} {diff:>8,.0f} {pct:>7.1f}%")
# ---- March/April/May 2026 detail ----
print("\n" + "=" * 70)
print("3/4/5月 拟合版详细")
print("=" * 70)
for mon in ['2026-03', '2026-04', '2026-05']:
    orig = monthly_original[mon]
    fit = monthly_fitted[mon]
    diff = orig - fit
    # Same zero-total guard as the monthly summary table: the totals are
    # defaultdicts, so a month with no data yields 0 and would otherwise
    # raise ZeroDivisionError here.
    pct = diff / orig * 100 if orig > 0 else 0
    # The closing parenthesis balances the one opened before "压降".
    print(f" {mon}: 原始 {orig:,} → 拟合 {fit:,.0f}(压降 {pct:.1f}%)")
# ---- March/April/May 2026: daily fitted vs. observed ----
rule = "=" * 70
print("\n" + rule)
print("3/4/5月每日对比活动日标记 *")
print(rule)
print(f"{'日期':<12} {'原始':>6} {'拟合':>8} {'是否活动':>6}")
print("-" * 40)
for d in dates:
    # ISO date strings compare chronologically; skip days outside the window.
    if d < '2026-03' or d > '2026-05-31':
        continue
    # Days in the activity calendar are flagged with an asterisk.
    if d in activity_dates:
        is_act = '*'
    else:
        is_act = ''
    print(f"{d:<12} {data[d]:>6} {fitted_daily[d]:>8.0f} {is_act:>6}")