# ai_member_xiaoxi/scripts/lead_user_analysis.py

#!/usr/bin/env python3
"""
销售线索用户分析报表

输入：Excel文件，包含「用户ID」列 或 手机号列
输出：每个用户+角色一行，包含注册信息、序章完成情况、购买和退款信息

修正口径：
- 体验课 = 固定10节课：L1 U00 L01-L05 (chapter_id: 343,344,345,346,348) + L2 U00 L01-L05 (55-59)
- 完成时间 = play_status=1 的最早 updated_at
- 若无「用户ID」列，则自动识别手机号列，脱敏匹配账号ID后再查询
"""

import os
import re
import sys
import psycopg2
import pandas as pd
from collections import defaultdict

SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, SCRIPTS_DIR)
from phone_encrypt import encrypt_phone

# ── Database connection settings ──
# These values are passed to psycopg2.connect() by get_conn(); the password is
# not stored here and is resolved at runtime by get_password().
DB_HOST = "bj-postgres-16pob4sg.sql.tencentcdb.com"
DB_PORT = 28591
DB_USER = "ai_member"
DB_NAME = "vala_bi"

def get_password():
    """Return the database password.

    Lookup order:
      1. the ``PG_ONLINE_PASSWORD`` environment variable, when non-empty;
      2. a ``PG_ONLINE_PASSWORD=...`` line in ``../secrets.env`` (relative to
         this script's directory). Surrounding whitespace and quotes around
         the value are removed.

    Returns:
        str: the first non-empty password found.

    Raises:
        RuntimeError: if neither source yields a non-empty value.
    """
    pw = os.environ.get("PG_ONLINE_PASSWORD", "")
    if pw:
        return pw

    secrets_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "secrets.env")
    if os.path.exists(secrets_path):
        # Explicit encoding so the result does not depend on the platform default.
        with open(secrets_path, encoding="utf-8") as f:
            for line in f:
                # Strip first so an indented assignment is still recognised.
                line = line.strip()
                if not line.startswith("PG_ONLINE_PASSWORD="):
                    continue
                value = line.split("=", 1)[1].strip().strip("'\"")
                # An empty assignment is treated like a missing one, matching
                # how an empty environment variable is handled above.
                if value:
                    return value
    raise RuntimeError("PG_ONLINE_PASSWORD not found")

def get_conn():
    """Open and return a new psycopg2 connection using the module-level DB settings.

    The password is resolved through get_password() on every call; the
    connection attempt uses a 30-second connect timeout.
    """
    connect_kwargs = dict(
        host=DB_HOST,
        port=DB_PORT,
        user=DB_USER,
        password=get_password(),
        dbname=DB_NAME,
        connect_timeout=30,
    )
    return psycopg2.connect(**connect_kwargs)

# ── Prologue chapter_id -> lesson label ──
# L1 S0 U00 L01..L05
L1_CHAPTERS = {
    343: "L1_U00_L01",
    344: "L1_U00_L02",
    345: "L1_U00_L03",
    346: "L1_U00_L04",
    348: "L1_U00_L05",
}
# L2 S0 U00 L01..L05
L2_CHAPTERS = {
    55: "L2_U00_L01",
    56: "L2_U00_L02",
    57: "L2_U00_L03",
    58: "L2_U00_L04",
    59: "L2_U00_L05",
}

# All ten chapters, L1 first then L2 (dicts keep insertion order).
ALL_CHAPTERS = dict(L1_CHAPTERS)
ALL_CHAPTERS.update(L2_CHAPTERS)

# Report column names for the completion timestamps, one per chapter, in the
# same order as ALL_CHAPTERS.
COL_ORDER = [f"{label}_完成时间" for label in ALL_CHAPTERS.values()]

def encrypt_phone_local(phone):
    """Encrypt a phone number by delegating to ``phone_encrypt.encrypt_phone``.

    The original note on this wrapper describes the scheme as XXTEA; the
    implementation lives in the imported helper.
    """
    ciphertext = encrypt_phone(phone)
    return ciphertext

def mask_phone_display(phone):
    """Mask a phone number for display, e.g. ``13012341234`` -> ``130****1234``.

    The first three and last four characters are kept and the middle is
    replaced by four asterisks.

    Inputs of seven characters or fewer are masked completely: for those the
    kept prefix and suffix would cover (or overlap) the entire value, so the
    "masked" output would still reveal every character.

    Args:
        phone: the phone number as a string.

    Returns:
        str: the masked representation.
    """
    if len(phone) <= 7:
        return "*" * len(phone)
    return f"{phone[:3]}****{phone[-4:]}"


def extract_phones_from_df(df):
    """Collect every 11-digit number starting with ``1`` found in any cell of *df*.

    Columns are scanned left to right and cells top to bottom. Numeric cells
    are converted through ``int`` first (so ``13012341234.0`` is read as
    ``13012341234``); other cells are stringified. Whitespace around the text
    is ignored.

    Returns:
        list[str]: the distinct phone numbers, in order of first appearance.
    """
    # A dict is used as an insertion-ordered set.
    found = {}
    for column in df.columns:
        for cell in df[column].dropna():
            if isinstance(cell, (int, float)):
                text = str(int(cell))
            else:
                text = str(cell)
            text = text.strip()
            if re.match(r'^1\d{10}$', text) and text not in found:
                found[text] = None
    return list(found)


def match_phones_to_accounts(phones, conn):
    """Resolve plaintext phone numbers to account ids.

    Each phone is encrypted with ``encrypt_phone`` and looked up in
    ``bi_vala_app_account.tel_encrypt`` (only rows with ``status = 1`` and
    ``deleted_at IS NULL``).

    Args:
        phones: iterable of plaintext phone strings.
        conn: an open DB-API connection (psycopg2 in this script).

    Returns:
        tuple: ``(phone_to_account, account_ids, unmatched_phones)`` where
        ``phone_to_account`` maps plaintext phone -> account id,
        ``account_ids`` is the list of distinct matched ids (first-seen
        order), and ``unmatched_phones`` lists the input phones with no match.
    """
    if not phones:
        return {}, [], []

    # Encrypt every distinct phone exactly once and remember the result, so
    # the mapping back to plaintext needs no second encryption pass.
    phone_to_enc = {}
    for p in phones:
        if p not in phone_to_enc:
            phone_to_enc[p] = encrypt_phone(p)

    enc_list = list(dict.fromkeys(phone_to_enc.values()))
    placeholders = ",".join(["%s"] * len(enc_list))

    cur = conn.cursor()
    try:
        # ORDER BY id makes the "first row wins" rule below deterministic when
        # several accounts share one tel_encrypt.
        # NOTE(review): lowest id is an assumption about which duplicate to
        # prefer — confirm with the data owner.
        cur.execute(f"""
            SELECT id AS account_id, tel_encrypt
            FROM bi_vala_app_account
            WHERE tel_encrypt IN ({placeholders})
              AND status = 1
              AND deleted_at IS NULL
            ORDER BY id
        """, enc_list)
        rows = cur.fetchall()
    finally:
        # Close the cursor even when the query raises.
        cur.close()

    # tel_encrypt -> account_id (first row per ciphertext wins)
    enc_to_account = {}
    for aid, enc in rows:
        enc_to_account.setdefault(enc, aid)

    # plaintext phone -> account_id
    phone_to_account = {
        p: enc_to_account[enc]
        for p, enc in phone_to_enc.items()
        if enc in enc_to_account
    }

    account_ids = list(dict.fromkeys(phone_to_account.values()))
    unmatched = [p for p in phones if p not in phone_to_account]

    return phone_to_account, account_ids, unmatched


# Lead-sheet columns that are carried into the report when the input has them.
_CLUE_COLS = ["线索进线日期", "销售"]

# Order/refund report columns and the value shown for accounts without orders.
_ORDER_COL_DEFAULTS = {
    "购买时间": "",
    "购买课包名称": "",
    "支付金额": "",
    "购买渠道key_from": "",
    "是否退款": "否",
    "退款金额": 0.0,
}


def _parse_user_id(raw):
    """Return *raw* as an int account id, or None when it is not an integer."""
    try:
        return int(raw)
    except (ValueError, TypeError, OverflowError):
        return None


def _normalize_phone(val):
    """Render a spreadsheet cell as a digit string; missing cells become ''."""
    try:
        return str(int(float(val))) if pd.notna(val) else ""
    except (ValueError, TypeError, OverflowError):
        return str(val).strip()


def _find_phone_column(df_input):
    """Return the first column whose first five non-null cells include a phone number, else None."""
    for col in df_input.columns:
        sample = df_input[col].dropna().head(5).tolist()
        if any(re.match(r'^1\d{10}$', str(int(v)) if isinstance(v, (int, float)) else str(v))
               for v in sample):
            return col
    return None


def _leads_from_user_ids(df_input):
    """Build (account_ids, per-user clue frame) from the 「用户ID」 column."""
    raw_values = df_input["用户ID"].tolist()
    parsed = [_parse_user_id(v) for v in raw_values]

    # Report each distinct unparseable (non-empty) value once.
    reported = set()
    for raw, uid in zip(raw_values, parsed):
        if uid is None and pd.notna(raw) and raw not in reported:
            reported.add(raw)
            print(f"  跳过非数字用户ID: {raw}")

    account_ids = list(dict.fromkeys(uid for uid in parsed if uid is not None))
    print(f"共 {len(account_ids)} 个用户ID")

    # Work on a copy: adding a column to a filtered view triggers pandas'
    # SettingWithCopy behaviour.
    keep = pd.Series([uid is not None for uid in parsed], index=df_input.index)
    df_valid = df_input[keep].copy()
    df_valid["用户ID_int"] = [uid for uid in parsed if uid is not None]

    # The clue columns are optional, exactly as in phone mode.
    clue_cols = [c for c in _CLUE_COLS if c in df_valid.columns]
    df_input_map = df_valid[["用户ID_int"] + clue_cols].drop_duplicates(subset="用户ID_int")
    return account_ids, df_input_map


def _leads_from_phones(df_input):
    """Build (account_ids, phone_to_account, unmatched_phones, clue frame) from phone cells.

    Exits the process with status 1 when no phone is found or none matches an
    account. The clue frame is None when no phone column can be identified.
    """
    phones = extract_phones_from_df(df_input)
    if not phones:
        print("ERROR: 既没有「用户ID」列，也没有找到手机号，无法继续")
        sys.exit(1)
    print(f"提取到 {len(phones)} 个手机号（去重后）")

    conn = get_conn()
    try:
        phone_to_account, account_ids, unmatched_phones = match_phones_to_accounts(phones, conn)
    finally:
        conn.close()
    print(f"匹配到 {len(account_ids)} 个账号，未匹配 {len(unmatched_phones)} 个手机号")
    if unmatched_phones:
        print(f"  未匹配手机号: {unmatched_phones}")

    if not account_ids:
        print("ERROR: 没有任何手机号匹配到账号，无法继续")
        sys.exit(1)

    df_input_map = None
    phone_col = _find_phone_column(df_input)
    if phone_col:
        df_input["_phone"] = df_input[phone_col].apply(_normalize_phone)
        clue_cols = [c for c in _CLUE_COLS if c in df_input.columns]
        df_input_map = (
            df_input[["_phone"] + clue_cols]
            .drop_duplicates(subset="_phone")
            .rename(columns={"_phone": "手机号"})
        )
    return account_ids, phone_to_account, unmatched_phones, df_input_map


def _fetch_play_records(conn, char_ids):
    """Return ({character_id: {chapter_id: earliest done time}}, row count) from the 8 shard tables."""
    chapter_ids = tuple(ALL_CHAPTERS)
    char_ids = tuple(char_ids)
    char_play = defaultdict(dict)
    total_play = 0

    for tbl_idx in range(8):
        table = f"bi_user_chapter_play_record_{tbl_idx}"
        # Filter by user_id in SQL so only the requested characters' rows are
        # transferred, instead of every user's rows for these chapters.
        sql = f"""
            SELECT user_id, chapter_id, MIN(updated_at) AS done_time
            FROM {table}
            WHERE chapter_id IN %s
              AND user_id IN %s
              AND play_status = 1
              AND deleted_at IS NULL
            GROUP BY user_id, chapter_id
        """
        try:
            with conn.cursor() as cur:
                cur.execute(sql, (chapter_ids, char_ids))
                rows = cur.fetchall()
        except Exception as e:
            # Best effort per shard. The rollback is required: after a failed
            # statement the transaction is aborted and every later query on
            # this connection would fail too.
            conn.rollback()
            print(f"  警告: {table} 查询失败: {e}")
            continue

        total_play += len(rows)
        for user_id, ch_id, done_time in rows:
            char_play[user_id][ch_id] = done_time

    return char_play, total_play


def _fetch_refunds(conn, trade_nos, batch_size=500):
    """Return {trade_no: refunded amount (integer minor units)} for refunds with status = 3."""
    refund_map = {}
    # Query in batches to keep the IN clause bounded.
    for start in range(0, len(trade_nos), batch_size):
        batch = trade_nos[start:start + batch_size]
        ph = ",".join(["%s"] * len(batch))
        with conn.cursor() as cur:
            cur.execute(f"SELECT trade_no, SUM(refund_amount_int) FROM bi_refund_order WHERE trade_no IN ({ph}) AND status = 3 AND deleted_at IS NULL GROUP BY trade_no", batch)
            for trade_no, amt in cur.fetchall():
                # NOTE(review): depending on the column type SUM() may come
                # back as Decimal (or NULL); normalise to int so the later
                # division by 100.0 cannot fail.
                refund_map[trade_no] = int(amt or 0)
    return refund_map


def _aggregate_orders(df_orders, refund_map):
    """Collapse orders to one row per account with ';'-joined fields and refund totals.

    Returns an empty frame (with the expected columns) when there are no orders.
    """
    columns = ["account_id"] + list(_ORDER_COL_DEFAULTS)
    if df_orders.empty:
        return pd.DataFrame(columns=columns)

    def join_non_null(values, fmt=str):
        # NOTE: nulls are skipped per column, so positions in different
        # joined columns do not necessarily refer to the same order.
        return ";".join(fmt(v) for v in values if pd.notna(v))

    records = []
    for account_id, grp in df_orders.groupby("account_id"):
        refunds = [refund_map[tn] for tn in grp["trade_no"] if tn in refund_map]
        records.append({
            "account_id": account_id,
            "购买时间": join_non_null(grp["pay_success_date"]),
            "购买课包名称": join_non_null(grp["goods_name"]),
            "支付金额": join_non_null(grp["pay_amount_int"], lambda v: str(v / 100)),
            "购买渠道key_from": join_non_null(grp["key_from"]),
            "是否退款": "是" if refunds else "否",
            "退款金额": sum(refunds) / 100.0,
        })
    return pd.DataFrame(records, columns=columns)


def main():
    """Generate the sales-lead user analysis workbook.

    Reads the Excel file given as the first command-line argument (a
    hard-coded inbound path is used when omitted), resolves the leads to
    account ids either from the 「用户ID」 column or from phone numbers found
    in the sheet, queries account, character, chapter-completion, order and
    refund data, and writes one row per account + character to
    ``../output/销售线索_用户分析.xlsx``.

    Exits with status 1 when no usable user id or phone number is found.
    """
    input_file = sys.argv[1] if len(sys.argv) > 1 else "/root/.openclaw/media/inbound/3æ_è_çº_çº_ç---d9a41af7-b100-43a7-a983-d4fd1f164023.xlsx"
    print(f"读取输入文件: {input_file}")
    df_input = pd.read_excel(input_file, dtype=str)

    # ── 0. Detect the input type: user ids or phone numbers ──
    phone_to_account = {}  # plaintext phone -> account_id (phone mode only)
    unmatched_phones = []
    use_phone_mode = "用户ID" not in df_input.columns

    if use_phone_mode:
        print("未检测到「用户ID」列，尝试识别手机号...")
        account_ids, phone_to_account, unmatched_phones, df_input_map = _leads_from_phones(df_input)
    else:
        print("检测到「用户ID」列，使用用户ID直接匹配")
        account_ids, df_input_map = _leads_from_user_ids(df_input)
        if not account_ids:
            # An empty id list would otherwise produce invalid "IN ()" SQL.
            print("ERROR: 没有任何有效的用户ID，无法继续")
            sys.exit(1)

    # ── Shared part: query everything by account_ids ──
    conn = get_conn()
    try:
        placeholders = ",".join(["%s"] * len(account_ids))

        # ── 1. Accounts ──
        df_accounts = pd.read_sql_query(
            f"SELECT id AS account_id, created_at AS reg_time FROM bi_vala_app_account WHERE id IN ({placeholders}) AND status = 1",
            conn, params=account_ids
        )
        print(f"  有效账户: {len(df_accounts)}")

        # ── 2. Characters (rows with an empty nickname are excluded) ──
        df_chars = pd.read_sql_query(
            f"SELECT id AS character_id, account_id, nickname, created_at AS char_created_at FROM bi_vala_app_character WHERE account_id IN ({placeholders}) AND (nickname IS NOT NULL AND nickname != '') AND deleted_at IS NULL",
            conn, params=account_ids
        )
        print(f"  有效角色: {len(df_chars)}")

        if df_chars.empty:
            print("没有有效的角色，退出")
            return

        # ── 3. Chapter completion records (10 chapter ids, 8 shard tables) ──
        char_play, total_play = _fetch_play_records(conn, df_chars["character_id"].tolist())
        print(f"  课时完成记录: {total_play} 条, 匹配角色: {len(char_play)}")

        # ── 4. Orders ──
        df_orders = pd.read_sql_query(
            f"""
        SELECT o.account_id, o.pay_success_date, o.goods_name, o.pay_amount_int, o.key_from, o.trade_no, o.order_status
        FROM bi_vala_order o
        INNER JOIN bi_vala_app_account a ON o.account_id = a.id AND a.status = 1
        WHERE o.account_id IN ({placeholders})
          AND o.deleted_at IS NULL
        ORDER BY o.account_id, o.pay_success_date
        """,
            conn, params=account_ids
        )
        print(f"  订单记录: {len(df_orders)}")

        # ── 5. Refunds ──
        refund_map = _fetch_refunds(conn, df_orders["trade_no"].dropna().unique().tolist())
    finally:
        # Always release the connection, including on early return or errors.
        conn.close()
    print(f"  退费记录: {len(refund_map)} 条")

    # ── 6. Assemble one row per account + character ──
    df_chars = df_chars.merge(df_accounts, on="account_id", how="left")

    # Drop timezone info (Excel cannot store tz-aware datetimes).
    for col in ["reg_time", "char_created_at"]:
        if col in df_chars.columns:
            df_chars[col] = pd.to_datetime(df_chars[col]).dt.tz_localize(None)

    # Report column -> chapter_id, computed once instead of once per row.
    label_to_chapter = {label: cid for cid, label in ALL_CHAPTERS.items()}
    col_to_chapter = {
        col_label: label_to_chapter.get(col_label.replace("_完成时间", ""))
        for col_label in COL_ORDER
    }

    rows = []
    for _, char_row in df_chars.iterrows():
        account_id = int(char_row["account_id"])
        char_id = int(char_row["character_id"])
        play_map = char_play.get(char_id, {})
        row_data = {
            "用户ID": account_id,
            "角色ID": char_id,
            "用户注册时间": char_row["reg_time"],
            "角色创建时间": char_row["char_created_at"],
        }
        for col_label, ch_id in col_to_chapter.items():
            done_time = play_map.get(ch_id, None)
            if done_time is not None:
                done_time = done_time.replace(tzinfo=None)
            row_data[col_label] = done_time
        rows.append(row_data)

    df_result = pd.DataFrame(rows)

    # ── 7. Merge orders & refunds (aggregated per account) ──
    order_agg = _aggregate_orders(df_orders, refund_map)
    df_result["account_id_int"] = df_result["用户ID"].astype(int)
    if not order_agg.empty:
        df_result = df_result.merge(order_agg, left_on="account_id_int", right_on="account_id", how="left")
        df_result.drop(columns=["account_id"], inplace=True, errors="ignore")

    # ── Merge clue columns and, in phone mode, the phone number ──
    if use_phone_mode:
        # account_id -> plaintext phone (the first matching phone wins)
        account_to_phone = {}
        for phone, aid in phone_to_account.items():
            account_to_phone.setdefault(aid, phone)
        df_result["手机号"] = df_result["用户ID"].map(account_to_phone)

        if df_input_map is not None and any(c in df_input_map.columns for c in _CLUE_COLS):
            df_result = df_result.merge(df_input_map, on="手机号", how="left")
    elif df_input_map is not None:
        df_result = df_result.merge(df_input_map, left_on="account_id_int", right_on="用户ID_int", how="left")
        df_result.drop(columns=["用户ID_int"], inplace=True, errors="ignore")

    df_result.drop(columns=["account_id_int"], inplace=True, errors="ignore")

    # Fill missing values; columns that were never created (e.g. no lead has
    # any order) are added with their default instead of raising KeyError.
    for col, default in _ORDER_COL_DEFAULTS.items():
        df_result[col] = df_result[col].fillna(default) if col in df_result.columns else default
    for col in _CLUE_COLS:
        df_result[col] = df_result[col].fillna("") if col in df_result.columns else ""

    if use_phone_mode and "手机号" in df_result.columns:
        df_result["手机号"] = df_result["手机号"].fillna("")

    # Sort by user id, then character id.
    df_result = df_result.sort_values(by=["用户ID", "角色ID"], ascending=True).reset_index(drop=True)

    # Final column order.
    if use_phone_mode:
        col_order = ["用户ID", "手机号", "线索进线日期", "销售", "角色ID", "用户注册时间", "角色创建时间"] + COL_ORDER + ["购买时间", "购买课包名称", "支付金额", "购买渠道key_from", "是否退款", "退款金额"]
    else:
        col_order = ["用户ID", "线索进线日期", "销售", "角色ID", "用户注册时间", "角色创建时间"] + COL_ORDER + ["购买时间", "购买课包名称", "支付金额", "购买渠道key_from", "是否退款", "退款金额"]
    # Keep only columns that actually exist.
    col_order = [c for c in col_order if c in df_result.columns]
    df_result = df_result[col_order]

    # ── 8. Output ──
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "销售线索_用户分析.xlsx")

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        df_result.to_excel(writer, sheet_name="用户分析", index=False)

    print(f"\n✅ 报表已生成: {output_path}")
    print(f"   总行数: {len(df_result)}")
    purchased = sum(1 for v in df_result["购买时间"] if v)
    print(f"   有购买记录: {purchased}")
    refunded = sum(1 for v in df_result["是否退款"] if v == "是")
    print(f"   有退款: {refunded}")
    done_l1 = sum(1 for v in df_result["L1_U00_L01_完成时间"] if pd.notna(v))
    done_l2 = sum(1 for v in df_result["L2_U00_L01_完成时间"] if pd.notna(v))
    print(f"   完成L1序章(U00 L01): {done_l1} 个角色")
    print(f"   完成L2序章(U00 L01): {done_l2} 个角色")

    if unmatched_phones:
        print(f"\n⚠️  未匹配到账号的手机号 ({len(unmatched_phones)} 个):")
        for p in unmatched_phones:
            print(f"   {p}")


# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()