#!/usr/bin/env python3
"""Fitted new-registration counts: LOESS baseline + weekday-factor correction.

The script embeds a daily series of new-user registrations and a calendar of
"activity" days (campaign days plus their aftermath days).  It then:

1. fits a LOESS curve to the non-activity ("clean") days only,
2. derives a per-weekday multiplicative factor from the clean days,
3. replaces the value of every activity day with ``LOESS * weekday factor``
   while keeping the actual value on clean days, and
4. prints monthly and daily comparisons of the original vs. fitted series.
"""
from collections import defaultdict
from datetime import date, timedelta

import numpy as np

# ---- Raw daily new-registration data, one "YYYY-MM-DD,count" record each ----
raw_data = """2025-09-01,3
2025-09-02,10
2025-09-03,4
2025-09-04,5
2025-09-05,11
2025-09-06,8
2025-09-07,16
2025-09-08,11
2025-09-09,137
2025-09-10,63
2025-09-11,26
2025-09-12,27
2025-09-13,41
2025-09-14,39
2025-09-15,27
2025-09-16,57
2025-09-17,58
2025-09-18,55
2025-09-19,133
2025-09-20,104
2025-09-21,101
2025-09-22,132
2025-09-23,126
2025-09-24,71
2025-09-25,37
2025-09-26,34
2025-09-27,81
2025-09-28,35
2025-09-29,47
2025-09-30,30
2025-10-01,48
2025-10-02,62
2025-10-03,45
2025-10-04,42
2025-10-05,45
2025-10-06,62
2025-10-07,42
2025-10-08,45
2025-10-09,36
2025-10-10,62
2025-10-11,90
2025-10-12,93
2025-10-13,162
2025-10-14,131
2025-10-15,112
2025-10-16,131
2025-10-17,215
2025-10-18,129
2025-10-19,81
2025-10-20,44
2025-10-21,41
2025-10-22,45
2025-10-23,37
2025-10-24,56
2025-10-25,79
2025-10-26,50
2025-10-27,89
2025-10-28,86
2025-10-29,82
2025-10-30,92
2025-10-31,78
2025-11-01,129
2025-11-02,168
2025-11-03,77
2025-11-04,68
2025-11-05,48
2025-11-06,67
2025-11-07,177
2025-11-08,160
2025-11-09,105
2025-11-10,78
2025-11-11,72
2025-11-12,234
2025-11-13,104
2025-11-14,69
2025-11-15,89
2025-11-16,63
2025-11-17,70
2025-11-18,82
2025-11-19,155
2025-11-20,71
2025-11-21,90
2025-11-22,71
2025-11-23,91
2025-11-24,47
2025-11-25,77
2025-11-26,93
2025-11-27,92
2025-11-28,77
2025-11-29,122
2025-11-30,125
2025-12-01,91
2025-12-02,94
2025-12-03,188
2025-12-04,131
2025-12-05,125
2025-12-06,190
2025-12-07,194
2025-12-08,108
2025-12-09,116
2025-12-10,110
2025-12-11,104
2025-12-12,120
2025-12-13,190
2025-12-14,172
2025-12-15,98
2025-12-16,100
2025-12-17,97
2025-12-18,85
2025-12-19,142
2025-12-20,127
2025-12-21,131
2025-12-22,69
2025-12-23,77
2025-12-24,108
2025-12-25,84
2025-12-26,89
2025-12-27,107
2025-12-28,95
2025-12-29,50
2025-12-30,67
2025-12-31,64
2026-01-01,77
2026-01-02,74
2026-01-03,69
2026-01-04,42
2026-01-05,56
2026-01-06,33
2026-01-07,52
2026-01-08,59
2026-01-09,58
2026-01-10,83
2026-01-11,75
2026-01-12,34
2026-01-13,46
2026-01-14,59
2026-01-15,31
2026-01-16,31
2026-01-17,66
2026-01-18,71
2026-01-19,53
2026-01-20,48
2026-01-21,40
2026-01-22,62
2026-01-23,46
2026-01-24,72
2026-01-25,86
2026-01-26,61
2026-01-27,57
2026-01-28,146
2026-01-29,102
2026-01-30,87
2026-01-31,61
2026-02-01,54
2026-02-02,53
2026-02-03,42
2026-02-04,39
2026-02-05,42
2026-02-06,55
2026-02-07,36
2026-02-08,47
2026-02-09,40
2026-02-10,60
2026-02-11,265
2026-02-12,59
2026-02-13,42
2026-02-14,31
2026-02-15,41
2026-02-16,28
2026-02-17,52
2026-02-18,23
2026-02-19,34
2026-02-20,26
2026-02-21,36
2026-02-22,35
2026-02-23,46
2026-02-24,45
2026-02-25,64
2026-02-26,180
2026-02-27,101
2026-02-28,167
2026-03-01,124
2026-03-02,106
2026-03-03,76
2026-03-04,105
2026-03-05,447
2026-03-06,216
2026-03-07,239
2026-03-08,206
2026-03-09,167
2026-03-10,108
2026-03-11,131
2026-03-12,177
2026-03-13,243
2026-03-14,131
2026-03-15,131
2026-03-16,108
2026-03-17,94
2026-03-18,95
2026-03-19,77
2026-03-20,103
2026-03-21,133
2026-03-22,126
2026-03-23,84
2026-03-24,80
2026-03-25,94
2026-03-26,85
2026-03-27,95
2026-03-28,104
2026-03-29,101
2026-03-30,81
2026-03-31,99
2026-04-01,138
2026-04-02,162
2026-04-03,771
2026-04-04,340
2026-04-05,185
2026-04-06,206
2026-04-07,212
2026-04-08,749
2026-04-09,336
2026-04-10,128
2026-04-11,174
2026-04-12,150
2026-04-13,116
2026-04-14,132
2026-04-15,126
2026-04-16,115
2026-04-17,84
2026-04-18,117
2026-04-19,119
2026-04-20,88
2026-04-21,97
2026-04-22,179
2026-04-23,139
2026-04-24,121
2026-04-25,140
2026-04-26,137
2026-04-27,120
2026-04-28,163
2026-04-29,65
2026-04-30,58
2026-05-01,63
2026-05-02,60
2026-05-03,60
2026-05-04,52
2026-05-05,70
2026-05-06,98
2026-05-07,133
2026-05-08,86
2026-05-09,89
2026-05-10,106
2026-05-11,98
2026-05-12,82
2026-05-13,99
2026-05-14,98
2026-05-15,166
2026-05-16,116
2026-05-17,123
2026-05-18,88
2026-05-19,115
2026-05-20,132
2026-05-21,130
2026-05-22,127
2026-05-23,200
2026-05-24,121
2026-05-25,67
2026-05-26,104
2026-05-27,99
2026-05-28,112
2026-05-29,107
2026-05-30,121
2026-05-31,178"""

# ---- Activity calendar (activity days + aftermath days) ----
activity_dates = {
    # 2025: 9/9-10, 9/19-23, 10/13-14, 10/16-17, 11/2, 11/7, 11/10, 11/12,
    #       11/19, 12/3
    '2025-09-09', '2025-09-10',
    '2025-09-19', '2025-09-20', '2025-09-21', '2025-09-22', '2025-09-23',
    '2025-10-13', '2025-10-14', '2025-10-16', '2025-10-17',
    '2025-11-02', '2025-11-07', '2025-11-10', '2025-11-12', '2025-11-19',
    '2025-12-03',
    # 2026: 1/28-29 (1 aftermath day), 2/11, 2/26-3/2 (4 aftermath days),
    #       3/5-8 (3 aftermath days), 3/9, 3/12-13, 4/3-7 (4 aftermath days),
    #       4/8-10 (2 aftermath days), 4/22-23 (1 aftermath day), 4/28, 5/6-7
    '2026-01-28', '2026-01-29',
    '2026-02-11',
    '2026-02-26', '2026-02-27', '2026-02-28', '2026-03-01', '2026-03-02',
    '2026-03-05', '2026-03-06', '2026-03-07', '2026-03-08',
    '2026-03-09',
    '2026-03-12', '2026-03-13',
    '2026-04-03', '2026-04-04', '2026-04-05', '2026-04-06', '2026-04-07',
    '2026-04-08', '2026-04-09', '2026-04-10',
    '2026-04-22', '2026-04-23',
    '2026-04-28',
    '2026-05-06', '2026-05-07',
}

# Smoothing span passed to loess() by this script (fraction of clean days
# used as the neighbourhood of each evaluation point).
LOESS_FRAC = 0.236

# Labels printed for weekday() values 0 (Monday) .. 6 (Sunday).
DAY_NAMES = ['周一', '周二', '周三', '周四', '周五', '周六', '周日']

# Months shown in the detailed summary section.
DETAIL_MONTHS = ['2026-03', '2026-04', '2026-05']


def parse_daily_counts(text):
    """Parse whitespace-separated ``YYYY-MM-DD,count`` records into a dict.

    Records may be separated by newlines, spaces, or any mix of whitespace;
    blank lines are ignored.  If a date occurs more than once, the last
    record wins.

    Args:
        text: The raw record text.

    Returns:
        A dict mapping the date string (as written) to the integer count.

    Raises:
        ValueError: If a record is not of the form ``<date>,<integer>``.
    """
    counts = {}
    for record in text.split():
        day, value = record.split(',')
        counts[day] = int(value)
    return counts


def loess(x, y, x_eval, frac=0.236):
    """Simple LOESS smoother: local weighted linear regression.

    For every evaluation point the ``k = max(int(n * frac), 3)`` nearest
    samples are selected and weighted with the tricube kernel, scaled by the
    distance to the farthest selected sample; a weighted straight line is then
    fitted and evaluated at the point.

    Args:
        x: 1-D sequence of sample positions.
        y: 1-D sequence of sample values, same length as ``x``.
        x_eval: 1-D sequence of positions at which to evaluate the smoother.
        frac: Fraction of the samples used as the local neighbourhood.

    Returns:
        A float ``numpy.ndarray`` with one smoothed value per ``x_eval`` entry.

    Raises:
        ValueError: If ``x`` is empty or ``x`` and ``y`` differ in length.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    x_eval = np.asarray(x_eval, dtype=float)

    n = len(x)
    if n == 0:
        raise ValueError("loess() needs at least one sample")
    if len(y) != n:
        raise ValueError("loess() needs x and y of the same length")

    k = max(int(n * frac), 3)
    y_smooth = np.zeros(len(x_eval))

    for i, xi in enumerate(x_eval):
        dists = np.abs(x - xi)
        idx = np.argsort(dists)[:k]
        max_dist = dists[idx[-1]]

        # Tricube weights.  Samples at exactly max_dist get weight 0, so the
        # result does not depend on how argsort breaks ties at the boundary.
        weights = np.zeros(n)
        if max_dist > 0:
            u = dists[idx] / max_dist
            weights[idx] = np.where(u < 1, (1 - u**3) ** 3, 0.0)
        else:
            # Every selected sample coincides with xi: weight them equally.
            weights[idx] = 1.0

        w_sum = weights.sum()
        if w_sum <= 0:
            # No sample carries weight; fall back to the overall mean.
            y_smooth[i] = y.mean()
            continue

        # Weighted least-squares line through the neighbourhood.
        x_w = (x * weights).sum() / w_sum
        y_w = (y * weights).sum() / w_sum
        xy_w = (x * y * weights).sum() / w_sum
        x2_w = (x * x * weights).sum() / w_sum
        denom = x2_w - x_w**2
        if abs(denom) > 1e-10:
            slope = (xy_w - x_w * y_w) / denom
            intercept = y_w - slope * x_w
            y_smooth[i] = intercept + slope * xi
        else:
            # All weighted samples share one x: use the weighted mean.
            y_smooth[i] = y_w

    return y_smooth


def compute_weekday_factors(date_strs, counts):
    """Return the ratio of each weekday's mean count to the overall mean.

    Args:
        date_strs: ISO date strings (``YYYY-MM-DD``).
        counts: Counts aligned with ``date_strs``.

    Returns:
        A dict ``{0..6: factor}`` keyed by ``date.weekday()`` (0 = Monday).
        Weekdays without data get 1.0; so does every weekday when there is
        no data at all or the overall mean is zero.
    """
    factors = {wd: 1.0 for wd in range(7)}
    values = np.asarray(counts, dtype=float)
    if values.size == 0:
        return factors
    overall_mean = values.mean()
    if overall_mean == 0:
        return factors

    by_weekday = defaultdict(list)
    for day, value in zip(date_strs, values):
        by_weekday[date.fromisoformat(day).weekday()].append(value)
    for wd, wd_values in by_weekday.items():
        factors[wd] = float(np.mean(wd_values) / overall_mean)
    return factors


def build_fitted_daily(dates, counts, baseline, weekday_factors, excluded):
    """Build the fitted daily series.

    Args:
        dates: ISO date strings.
        counts: Actual counts aligned with ``dates``.
        baseline: Smoothed baseline values aligned with ``dates``.
        weekday_factors: Mapping of ``date.weekday()`` to its factor.
        excluded: Container of date strings whose actual value is replaced.

    Returns:
        A dict mapping each date to ``baseline * weekday factor`` when the
        date is in ``excluded``, otherwise to its actual count.
    """
    fitted = {}
    for i, day in enumerate(dates):
        if day in excluded:
            weekday = date.fromisoformat(day).weekday()
            fitted[day] = baseline[i] * weekday_factors[weekday]
        else:
            fitted[day] = counts[i]
    return fitted


def reduction_pct(orig, fit):
    """Return ``(orig - fit) / orig`` as a percentage, or 0 if ``orig <= 0``."""
    return (orig - fit) / orig * 100 if orig > 0 else 0


def main():
    """Run the analysis on the embedded data and print the reports."""
    data = parse_daily_counts(raw_data)
    dates = sorted(data)

    print(f"活动日+余波日总数: {len(activity_dates)}")

    # Positions are the rank of each date in the sorted series.
    x_all = np.arange(len(dates))
    y_all = np.array([data[d] for d in dates])

    # Clean days = days that are not in the activity calendar.
    clean_idx = [i for i, d in enumerate(dates) if d not in activity_dates]
    x_clean = x_all[clean_idx]
    y_clean = y_all[clean_idx]
    print(f"清洁日数量: {len(clean_idx)} / {len(dates)} ({len(clean_idx)/len(dates)*100:.1f}%)")

    # Fit on clean days only, evaluate on every date.
    y_loess_all = loess(x_clean, y_clean, x_all, frac=LOESS_FRAC)

    # ---- Weekday-factor correction (from clean days) ----
    weekday_factors = compute_weekday_factors(
        [dates[i] for i in clean_idx], y_clean
    )
    print("\n星期因子(基于清洁日):")
    for wd in range(7):
        print(f" {DAY_NAMES[wd]}: {weekday_factors[wd]:.4f}")

    # ---- Fitted daily registrations ----
    fitted_daily = build_fitted_daily(
        dates, y_all, y_loess_all, weekday_factors, activity_dates
    )

    # ---- Monthly summary ----
    print("\n" + "=" * 70)
    print("拟合版 vs 原始版 月度新增注册用户数")
    print("=" * 70)
    print(f"{'月份':<10} {'原始新增':>8} {'拟合新增':>8} {'差值':>8} {'压降比例':>8}")
    print("-" * 50)

    monthly_original = defaultdict(int)
    monthly_fitted = defaultdict(float)
    for d in dates:
        mon = d[:7]
        monthly_original[mon] += data[d]
        monthly_fitted[mon] += fitted_daily[d]

    for mon in sorted(monthly_original):
        orig = monthly_original[mon]
        fit = monthly_fitted[mon]
        diff = orig - fit
        pct = reduction_pct(orig, fit)
        print(f"{mon:<10} {orig:>8,} {fit:>8,.0f} {diff:>8,.0f} {pct:>7.1f}%")

    # ---- March/April/May detail ----
    print("\n" + "=" * 70)
    print("3/4/5月 拟合版详细")
    print("=" * 70)
    for mon in DETAIL_MONTHS:
        orig = monthly_original[mon]
        fit = monthly_fitted[mon]
        # Guarded like the monthly table: a month absent from the data has
        # orig == 0 and must not raise ZeroDivisionError.
        pct = reduction_pct(orig, fit)
        print(f" {mon}: 原始 {orig:,} → 拟合 {fit:,.0f}(压降 {pct:.1f}%)")

    # ---- March/April/May daily fitted vs. original ----
    print("\n" + "=" * 70)
    print("3/4/5月每日对比(活动日标记 *)")
    print("=" * 70)
    print(f"{'日期':<12} {'原始':>6} {'拟合':>8} {'是否活动':>6}")
    print("-" * 40)
    for d in dates:
        if '2026-03' <= d <= '2026-05-31':
            is_act = '*' if d in activity_dates else ''
            print(f"{d:<12} {data[d]:>6} {fitted_daily[d]:>8.0f} {is_act:>6}")


if __name__ == "__main__":
    main()