# ai_member_xiaoxi/scripts/fix_process_data.py

#!/usr/bin/env python3
"""Fix process data: cumulative lesson completion + 5月 Tom row"""

import json, requests, os, time, sys
import pandas as pd
import psycopg2
from collections import defaultdict

CRED_DIR = "/root/.openclaw/credentials/xiaoxi"
SPREADSHEET_TOKEN = "NoZqsFi47hIOHEt9j8WcfRtbnug"

# Parse KEY=VALUE pairs from the workspace secrets file into `secrets`.
# Blank lines, `#` comment lines and lines without "=" are ignored; each value
# has surrounding whitespace and then surrounding single quotes removed.
secrets = {}
with open("/root/.openclaw/workspace/secrets.env") as env_file:
    for raw_line in env_file:
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        # Split on the first "=" only, so values may themselves contain "="
        key, _, value = entry.partition("=")
        secrets[key.strip()] = value.strip().strip("'")

def get_feishu_token():
    """Fetch a Feishu tenant access token for the first app in config.json.

    Reads ``config.json`` under ``CRED_DIR`` and posts the first app's
    ``appId`` / ``appSecret`` to the tenant_access_token endpoint.

    Returns:
        The ``tenant_access_token`` string from the response body.

    Raises:
        RuntimeError: if the response is not JSON or carries no token
            (for example when the credentials are rejected).
    """
    with open(os.path.join(CRED_DIR, "config.json"), encoding="utf-8") as f:
        cfg = json.load(f)
    app = cfg["apps"][0]
    resp = requests.post(
        "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal",
        json={"app_id": app["appId"], "app_secret": app["appSecret"]},
        timeout=15)
    try:
        body = resp.json()
    except ValueError as err:
        raise RuntimeError(
            f"Feishu auth returned a non-JSON response (HTTP {resp.status_code})"
        ) from err
    access_token = body.get("tenant_access_token")
    if not access_token:
        # Report Feishu's own error fields instead of failing with a bare KeyError
        raise RuntimeError(
            f"Feishu auth failed: code={body.get('code')} msg={body.get('msg')}"
        )
    return access_token

# ============================================================
# Step 1: Load lead data (Weiban export + Feishu sales sheets)
# ============================================================
print("=== Step 1: Loading 线索 data ===")

# Load the Weiban export (.xlsx) into a DataFrame.
# NOTE(review): the file name looks mojibake-encoded; confirm it matches the
# name of the file on disk.
wb_file = "/root/.openclaw/media/inbound/å¾_ä¼_-å_æ_å_è_æ_æ_å_¼å_º_3---79d6aba9-7cd8-4e99-9b4f-dd981a5ca639.xlsx"
wb_df = pd.read_excel(wb_file)
print(f"  微伴: {len(wb_df)} rows, cols: {list(wb_df.columns)}")

# Fetch the Feishu access token; it is stored in the module-level `token`
# that the sheet read/write helpers in this script send as a Bearer header.
token = get_feishu_token()

def read_sheet(sheet_id, range_str):
    """Read a cell range from one sheet of the spreadsheet.

    Args:
        sheet_id: Id of the sheet inside ``SPREADSHEET_TOKEN``.
        range_str: A1-style range, e.g. ``"A1:K700"``.

    Returns:
        The ``values`` list found under ``data.valueRange`` in the response.

    Raises:
        RuntimeError: if the response is not JSON or its ``code`` is not 0.
    """
    resp = requests.get(
        f"https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{SPREADSHEET_TOKEN}/values/{sheet_id}!{range_str}",
        headers={"Authorization": f"Bearer {token}"}, timeout=30)
    try:
        body = resp.json()
    except ValueError as err:
        raise RuntimeError(
            f"Feishu read {sheet_id}!{range_str} returned a non-JSON response "
            f"(HTTP {resp.status_code})"
        ) from err
    if body.get("code") != 0:
        # Surface the API error instead of a KeyError on the missing "data" path
        raise RuntimeError(
            f"Feishu read {sheet_id}!{range_str} failed: "
            f"code={body.get('code')} msg={body.get('msg')}"
        )
    return body["data"]["valueRange"]["values"]

# Read the three sales sheets in a fixed order (Wu Di, Xiaolong, Chengdu);
# each pair is (sheet id, cell range).
wudi_data, xl_data, cd_data = [
    read_sheet(sheet_key, cell_range)
    for sheet_key, cell_range in (
        ("f975f0", "A1:K700"),
        ("qJF4I", "A1:K1200"),
        ("qJF4J", "A1:K2500"),
    )
]

print(f"  吴迪: {len(wudi_data)} rows, 小龙: {len(xl_data)} rows, 成都: {len(cd_data)} rows")

# Parse the sales sheets
# Maps a substring of the customer-service cell to the salesperson label.
CS_MAP = {"吴迪": "吴迪", "益达老师": "小龙", "瓦拉英语-Tom老师": "Tom", "瓦拉英语-Bob老师": "Bob"}


def parse_sales_sheet(data, sheet_name):
    """Parse one sales sheet into a list of lead entries.

    Columns are located by header substrings: "日期" (date), "昵称" (name),
    "手机" (phone) and "客服" (customer service). When no header contains
    "客服", a header containing ``sheet_name`` is used for that column.

    Args:
        data: Rows of cell values; the first row is the header.
        sheet_name: Sheet label. Also the fallback salesperson for the
            "吴迪" and "小龙" sheets when the customer-service cell does not
            match any ``CS_MAP`` key.

    Returns:
        A list of dicts with keys ``date``, ``name``, ``phone`` and ``sales``
        (``sales`` is ``None`` when no salesperson could be determined).
        Rows without a date, and rows too short to reach the right-most
        detected column, are skipped. Empty ``data`` yields ``[]``.
    """
    # Local import: the file's top-level imports do not include datetime.
    from datetime import datetime, timedelta

    if not data:
        return []

    header = data[0]
    date_col = name_col = phone_col = cs_col = None
    for i, h in enumerate(header):
        if not h:
            continue
        label = str(h)
        if "日期" in label:
            date_col = i
        if "昵称" in label:
            name_col = i
        if "手机" in label:
            phone_col = i
        if "客服" in label:
            cs_col = i

    if cs_col is None:
        # No "客服" header: fall back to a header that mentions the sheet name
        for i, h in enumerate(header):
            if h and sheet_name in str(h):
                cs_col = i

    # Right-most detected column. `default=0` covers the case where nothing
    # (or only column 0) was detected, which previously made max() raise.
    found_cols = [c for c in (date_col, name_col, phone_col, cs_col) if c is not None]
    last_col = max(found_cols, default=0)

    def cell(row, col):
        """Return row[col], or None when the column was not detected."""
        return row[col] if col is not None else None

    results = []
    for row in data[1:]:
        if not row or len(row) <= last_col:
            continue

        date_val = cell(row, date_col)
        if not date_val:
            continue
        name_val = cell(row, name_col)
        phone_val = cell(row, phone_col)
        cs_val = cell(row, cs_col)

        # Numeric dates are treated as spreadsheet serial day numbers
        if isinstance(date_val, (int, float)):
            date_str = (datetime(1899, 12, 30) + timedelta(days=int(date_val))).strftime("%Y-%m-%d")
        else:
            date_str = str(date_val).strip()

        # A numeric phone cell may arrive as a float; drop the ".0" so the
        # resulting key is the plain digit string.
        if isinstance(phone_val, float) and phone_val.is_integer():
            phone_val = int(phone_val)

        # Resolve the salesperson: first CS_MAP key contained in the cell wins
        sales = None
        if cs_val:
            cs_str = str(cs_val).strip()
            sales = next((v for k, v in CS_MAP.items() if k in cs_str), None)

        if not sales and sheet_name in ("吴迪", "小龙"):
            # These two sheets belong to a single salesperson
            sales = sheet_name

        results.append({
            "date": date_str,
            "name": str(name_val).strip() if name_val else "",
            "phone": str(phone_val).strip() if phone_val else "",
            "sales": sales,
        })
    return results

# Parse each raw sheet with the label that drives its salesperson fallback
wudi_entries, xl_entries, cd_entries = (
    parse_sales_sheet(sheet_rows, sheet_label)
    for sheet_rows, sheet_label in (
        (wudi_data, "吴迪"),
        (xl_data, "小龙"),
        (cd_data, "成都"),
    )
)

print(f"  吴迪 entries: {len(wudi_entries)}, 小龙 entries: {len(xl_entries)}, 成都 entries: {len(cd_entries)}")

# ============================================================
# Step 2: Query database for user lesson completion
# ============================================================
print("\n=== Step 2: Querying database ===")

# Active (status = 1), non-deleted accounts with their phone fields
ACTIVE_USERS_SQL = """
    SELECT id, tel, tel_encrypt
    FROM bi_vala_app_account
    WHERE status = 1 AND deleted_at IS NULL
"""

# Per user: highest course_lesson among play records with play_status = 1,
# restricted to season S0, unit U00, levels L1/L2.
# NOTE(review): play_status = 1 presumably means "completed" — confirm.
MAX_LESSON_SQL = """
    SELECT ucp.user_id, MAX(blu.course_lesson) as max_lesson
    FROM bi_user_chapter_play_record_0 ucp
    JOIN bi_level_unit_lesson blu ON ucp.chapter_id = blu.id
    WHERE ucp.play_status = 1
      AND blu.course_season = 'S0'
      AND blu.course_unit = 'U00'
      AND blu.course_level IN ('L1', 'L2')
    GROUP BY ucp.user_id
"""

# Accounts with at least one successfully paid order from the listed sources
PAID_USERS_SQL = """
    SELECT DISTINCT account_id
    FROM bi_vala_order
    WHERE pay_success_date IS NOT NULL
      AND order_status IN (3, 4)
      AND key_from IN ('app-active-h5-0-0', 'app-sales-bj-qhm-0')
"""

conn = psycopg2.connect(
    host="bj-postgres-16pob4sg.sql.tencentcdb.com",
    port=28591,
    user="ai_member",
    password=secrets["PG_ONLINE_PASSWORD"],
    dbname="vala_bi"
)
try:
    cur = conn.cursor()

    # user id -> {"tel", "tel_encrypt"}
    cur.execute(ACTIVE_USERS_SQL)
    users = {row[0]: {"tel": row[1], "tel_encrypt": row[2]} for row in cur.fetchall()}
    print(f"  Users: {len(users)}")

    # user id -> highest lesson reached
    cur.execute(MAX_LESSON_SQL)
    user_lessons = {user_id: max_lesson for user_id, max_lesson in cur.fetchall()}
    print(f"  Users with lessons: {len(user_lessons)}")

    # set of paying account ids
    cur.execute(PAID_USERS_SQL)
    paid_users = {row[0] for row in cur.fetchall()}
    print(f"  Paid users: {len(paid_users)}")
finally:
    # Release the connection even when one of the queries raises
    conn.close()

# ============================================================
# Step 3: Match users to sales via lead data
# ============================================================
print("\n=== Step 3: Matching users to sales ===")

# Weiban export: phone -> salesperson, resolved via the first CS_MAP key that
# occurs inside the customer-service cell.
wb_phone_to_sales = {}
for _, lead in wb_df.iterrows():
    lead_phone = str(lead.get("手机号", "")).strip()
    lead_cs = str(lead.get("客服", "")).strip()
    if not (lead_phone and lead_cs):
        continue
    mapped = next((label for needle, label in CS_MAP.items() if needle in lead_cs), None)
    if mapped is not None:
        wb_phone_to_sales[lead_phone] = mapped

# Sales sheets: phone -> salesperson; a later entry overrides an earlier one
sales_phone_to_sales = {
    entry["phone"]: entry["sales"]
    for entry in wudi_entries + xl_entries + cd_entries
    if entry["phone"] and entry["sales"]
}

print(f"  微伴 phone→sales: {len(wb_phone_to_sales)}")
print(f"  销售表 phone→sales: {len(sales_phone_to_sales)}")


def _sales_for_tel(tel):
    """Return the salesperson for ``tel``, or None when no lead phone matches.

    Lookup order: exact match in the Weiban map, exact match in the sales
    sheet map, then substring match (either direction) in the same order.
    """
    lookups = (wb_phone_to_sales, sales_phone_to_sales)
    for table in lookups:
        if tel in table:
            return table[tel]
    for table in lookups:
        for lead_number, owner in table.items():
            if lead_number in tel or tel in lead_number:
                return owner
    return None


# Assign each account with a phone number to a salesperson; accounts without
# a phone number or without any match are counted as unmatched.
user_to_sales = {}
unmatched = 0

for uid, uinfo in users.items():
    tel = (uinfo.get("tel") or "").strip()
    found = _sales_for_tel(tel) if tel else None
    if found is None:
        unmatched += 1
    else:
        user_to_sales[uid] = found

print(f"  Matched users: {len(user_to_sales)}, unmatched: {unmatched}")

# ============================================================
# Step 4: Aggregate by month and sales
# ============================================================
print("\n=== Step 4: Aggregating ===")

# Registration month (first day of the month) of active, non-deleted accounts
# created from 2026-03-01 up to, but not including, 2026-07-01
REG_MONTH_SQL = """
    SELECT id, DATE_TRUNC('month', created_at)::date as reg_month
    FROM bi_vala_app_account
    WHERE status = 1 AND deleted_at IS NULL
      AND created_at >= '2026-03-01' AND created_at < '2026-07-01'
"""

conn = psycopg2.connect(
    host="bj-postgres-16pob4sg.sql.tencentcdb.com",
    port=28591,
    user="ai_member",
    password=secrets["PG_ONLINE_PASSWORD"],
    dbname="vala_bi"
)
try:
    cur = conn.cursor()
    cur.execute(REG_MONTH_SQL)
    # user id -> "YYYY-MM" registration month
    user_reg_month = {row[0]: row[1].strftime("%Y-%m") for row in cur.fetchall()}
finally:
    # Release the connection even when the query or row conversion raises
    conn.close()

# Reporting dimensions: lessons, months, and the sales rows shown per month
# ("合计" is the total row).
LESSON_ORDER = ["L01", "L02", "L03", "L04", "L05"]
MONTHS = ["2026-03", "2026-04", "2026-05", "2026-06"]
SALES_LIST = {
    "2026-03": ["合计", "小龙", "Bob", "Tom"],
    "2026-04": ["合计", "小龙", "吴迪", "Bob", "Tom"],
    "2026-05": ["合计", "小龙", "吴迪", "Bob", "Tom"],
    "2026-06": ["合计", "小龙", "吴迪", "Bob", "Tom"],
}


def _blank_counters():
    """Return a fresh counter dict: all gte_<lesson> keys, then all cv_<lesson> keys."""
    counters = dict.fromkeys((f"gte_{lesson}" for lesson in LESSON_ORDER), 0)
    counters.update(dict.fromkeys((f"cv_{lesson}" for lesson in LESSON_ORDER), 0))
    return counters


# results[month][sales] holds zeroed counters:
#   gte_<lesson> - users who reached at least that lesson
#   cv_<lesson>  - the paying users among them
results = {
    month_key: {sales_key: _blank_counters() for sales_key in SALES_LIST[month_key]}
    for month_key in MONTHS
}

# Count every user with lesson data into the month's total row and, when the
# user is attributed to a salesperson listed for that month, into that row too.
for uid, reg_month in user_reg_month.items():
    month_bucket = results.get(reg_month)
    if month_bucket is None:
        continue

    max_lesson = user_lessons.get(uid)
    if not max_lesson:
        # No lesson data for this user: nothing to count
        continue

    targets = ["合计"]
    owner = user_to_sales.get(uid)
    if owner and owner in month_bucket:
        targets.append(owner)

    # Lessons are compared as strings against the user's highest lesson
    reached = [lesson for lesson in LESSON_ORDER if max_lesson >= lesson]
    paid = uid in paid_users

    for target in targets:
        counters = month_bucket[target]
        for lesson in reached:
            counters[f"gte_{lesson}"] += 1
            if paid:
                counters[f"cv_{lesson}"] += 1

# Dump the aggregated counters, one line per month/sales row
for month, by_sales in results.items():
    print(f"\n  {month}:")
    for sales, r in by_sales.items():
        print(f"    {sales}: ≥L01={r['gte_L01']} ≥L02={r['gte_L02']} ≥L03={r['gte_L03']} ≥L04={r['gte_L04']} ≥L05={r['gte_L05']} | cv: {r['cv_L01']} {r['cv_L02']} {r['cv_L03']} {r['cv_L04']} {r['cv_L05']}")

# ============================================================
# Step 5: Write to process data sheet
# ============================================================
print("\n=== Step 5: Writing to process data ===")

def write_values(sheet_id, range_str, values):
    """PUT ``values`` into ``range_str`` of the spreadsheet and log the outcome.

    Args:
        sheet_id: Accepted for the caller's convenience; the request itself
            only sends ``range_str``.
        range_str: Target range sent as ``valueRange.range``.
        values: Rows of cell values sent as ``valueRange.values``.

    Returns:
        True when the response ``code`` is 0, otherwise False (after printing
        the returned code and message).
    """
    endpoint = f"https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{SPREADSHEET_TOKEN}/values"
    request_headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
    payload = {"valueRange": {"range": range_str, "values": values}}

    resp = requests.put(
        endpoint,
        headers=request_headers,
        json=payload,
        params={"valueInputOption": "USER_ENTERED"},
        timeout=60)
    body = resp.json()

    if body.get("code") == 0:
        print(f"  ✅ {range_str}")
        return True

    print(f"  ❌ {range_str}: code={body.get('code')} msg={body.get('msg')}")
    return False

# Column layout of the process-data sheet (H..AA), four columns per lesson:
#   users who reached the lesson, that count / C, paying users among them,
#   that count / C.
#   Lesson 1: H I J K    Lesson 2: L M N O    Lesson 3: P Q R S
#   Lesson 4: T U V W    Lesson 5: X Y Z AA
# Only the count columns carry computed numbers; the rate columns are written
# as formulas that divide the count by column C of the same row.

# Sheet row -> (month, sales row); rows 2-5 are March, 6-10 April,
# 11-15 May and 16-20 June.
row_map = [
    (2, "2026-03", "合计"), (3, "2026-03", "小龙"), (4, "2026-03", "Bob"), (5, "2026-03", "Tom"),
    (6, "2026-04", "合计"), (7, "2026-04", "小龙"), (8, "2026-04", "吴迪"), (9, "2026-04", "Bob"), (10, "2026-04", "Tom"),
    (11, "2026-05", "合计"), (12, "2026-05", "小龙"), (13, "2026-05", "吴迪"), (14, "2026-05", "Bob"), (15, "2026-05", "Tom"),
    (16, "2026-06", "合计"), (17, "2026-06", "小龙"), (18, "2026-06", "吴迪"), (19, "2026-06", "Bob"), (20, "2026-06", "Tom"),
]

# Count columns in sheet order; each is followed by its rate column
_COUNT_COLUMNS = ["H", "J", "L", "N", "P", "R", "T", "V", "X", "Z"]
_COUNT_KEYS = [
    "gte_L01", "cv_L01", "gte_L02", "cv_L02", "gte_L03", "cv_L03",
    "gte_L04", "cv_L04", "gte_L05", "cv_L05",
]


def _build_process_row(counters, row_num):
    """Return the 20 cell values for columns H..AA of sheet row ``row_num``."""
    cells = []
    for col, key in zip(_COUNT_COLUMNS, _COUNT_KEYS):
        cells.append(counters[key])
        cells.append(f'=IFERROR({col}{row_num}/C{row_num},"")')
    return cells


# One range write per sheet row (H..AA) instead of one request per cell, and
# remember which rows the API rejected so the final message is truthful.
failed_rows = []
for row_num, month, sales in row_map:
    cells = _build_process_row(results[month][sales], row_num)
    if not write_values("3aOvV6", f"3aOvV6!H{row_num}:AA{row_num}", [cells]):
        failed_rows.append(row_num)
    # Short pause between requests, kept from the original pacing
    time.sleep(0.15)

if failed_rows:
    print(f"\n❌ Process data update incomplete, failed rows: {failed_rows}")
    sys.exit(1)

print("\n✅ Process data updated!")