#!/usr/bin/env python3
"""
Sales-lead user analysis report.

Input:  an Excel file that contains either a 「用户ID」 column or a column of
        phone numbers.
Output: one row per user + character, with registration info, prologue
        completion times, purchase info and refund info.

Definitions used by this report:
- Trial course = a fixed set of 10 lessons:
  L1 U00 L01-L05 (chapter_id: 343, 344, 345, 346, 348) +
  L2 U00 L01-L05 (chapter_id: 55-59).
- Completion time = the earliest ``updated_at`` among rows with
  ``play_status = 1``.
- When there is no 「用户ID」 column, the phone-number column is detected
  automatically; numbers are masked and matched to account IDs first.
"""

import math
import os
import re
import sys
from collections import defaultdict
from contextlib import closing

import pandas as pd

# ── Database ──
DB_HOST = "bj-postgres-16pob4sg.sql.tencentcdb.com"
DB_PORT = 28591
DB_USER = "ai_member"
DB_NAME = "vala_bi"

# Input file used when no path is given on the command line.
DEFAULT_INPUT_FILE = "/root/.openclaw/media/inbound/3æ_è_çº_çº_ç---d9a41af7-b100-43a7-a983-d4fd1f164023.xlsx"


def get_password():
    """Return the PostgreSQL password.

    The ``PG_ONLINE_PASSWORD`` environment variable wins; otherwise the value
    is read from a ``PG_ONLINE_PASSWORD=`` line in ``../secrets.env``
    (relative to this file), with surrounding quotes removed.

    Raises:
        RuntimeError: if neither source provides the password.
    """
    pw = os.environ.get("PG_ONLINE_PASSWORD", "")
    if pw:
        return pw
    secrets_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "secrets.env")
    if os.path.exists(secrets_path):
        with open(secrets_path, encoding="utf-8") as f:
            for line in f:
                if line.startswith("PG_ONLINE_PASSWORD="):
                    return line.strip().split("=", 1)[1].strip("'\"")
    raise RuntimeError("PG_ONLINE_PASSWORD not found")


def get_conn():
    """Open a new psycopg2 connection to the BI database."""
    # Imported here so the pure helpers in this module stay importable on
    # machines that do not have the database driver installed.
    import psycopg2

    return psycopg2.connect(
        host=DB_HOST,
        port=DB_PORT,
        user=DB_USER,
        password=get_password(),
        dbname=DB_NAME,
        connect_timeout=30,
    )


# ── Prologue chapter_id -> label ──
# L1 S0 U00 L01..L05
L1_CHAPTERS = {343: "L1_U00_L01", 344: "L1_U00_L02", 345: "L1_U00_L03", 346: "L1_U00_L04", 348: "L1_U00_L05"}
# L2 S0 U00 L01..L05
L2_CHAPTERS = {55: "L2_U00_L01", 56: "L2_U00_L02", 57: "L2_U00_L03", 58: "L2_U00_L04", 59: "L2_U00_L05"}
ALL_CHAPTERS = {**L1_CHAPTERS, **L2_CHAPTERS}

# Output column name -> chapter_id, in report order (L1 lessons, then L2).
COLUMN_TO_CHAPTER = {f"{label}_完成时间": cid for cid, label in ALL_CHAPTERS.items()}
COL_ORDER = list(COLUMN_TO_CHAPTER)

# Optional lead columns copied from the input file into the report.
CLUE_COLUMNS = ["线索进线日期", "销售"]

# Columns produced by aggregate_orders().
ORDER_AGG_COLUMNS = ["account_id", "购买时间", "购买课包名称", "支付金额", "购买渠道key_from", "是否退款", "退款金额"]

PHONE_RE = re.compile(r'^1\d{10}$')
# Plain digits, optionally followed by ".0" (how Excel floats look as text).
_INT_TEXT_RE = re.compile(r"^([0-9]+)(?:\.0+)?$")


# ── Phone matching ──
def mask_phone(phone):
    """Mask a phone number: first 3 characters + ``****`` + last 4."""
    return f"{phone[:3]}****{phone[-4:]}"


def normalize_phone(val):
    """Return a cell value as a trimmed string suitable for phone matching.

    Numbers are rendered without a fractional part, a trailing ``.0`` on
    digit-only text is dropped, and ``None`` / NaN / infinities become ``""``.
    Any other text is returned stripped but otherwise unchanged.
    """
    if val is None:
        return ""
    if isinstance(val, float) and not math.isfinite(val):
        return ""
    if isinstance(val, (int, float)) and not isinstance(val, bool):
        return str(int(val))
    text = str(val).strip()
    m = _INT_TEXT_RE.match(text)
    return m.group(1) if m else text


def parse_user_id(val):
    """Return a cell value as an ``int`` user ID, or ``None`` if it is not one.

    Accepts non-negative integer text, optionally padded with whitespace or
    carrying a ``.0`` suffix.
    """
    if val is None or (isinstance(val, float) and math.isnan(val)):
        return None
    m = _INT_TEXT_RE.match(str(val).strip())
    return int(m.group(1)) if m else None


def extract_phones_from_df(df):
    """Collect every 11-digit phone number starting with 1 found in *df*.

    All columns are scanned. The result is de-duplicated and keeps
    first-seen order (column by column, top to bottom).
    """
    phones = {}
    for col in df.columns:
        for val in df[col].dropna():
            candidate = normalize_phone(val)
            if PHONE_RE.match(candidate):
                phones.setdefault(candidate, None)
    return list(phones)


def find_phone_column(df):
    """Return the column of *df* holding the most phone numbers, or ``None``.

    Counting over the whole column (instead of sampling the first rows) keeps
    a stray number in a free-text column from being picked over the real
    phone column. Ties go to the leftmost column.
    """
    best_col, best_hits = None, 0
    for col in df.columns:
        hits = sum(1 for v in df[col].dropna() if PHONE_RE.match(normalize_phone(v)))
        if hits > best_hits:
            best_col, best_hits = col, hits
    return best_col


def match_phones_to_accounts(phones, conn):
    """Match plaintext phone numbers to accounts via their masked form.

    Each phone is masked with mask_phone() and looked up in
    ``bi_vala_app_account.tel``.

    Args:
        phones: plaintext phone numbers.
        conn: an open DB-API connection.

    Returns:
        ``(phone_to_account, account_ids, unmatched_phones)`` where
        - phone_to_account: ``{plaintext phone: account_id}``
        - account_ids: matched account IDs, de-duplicated, first-seen order
        - unmatched_phones: plaintext phones with no matching account
    """
    if not phones:
        return {}, [], []

    masks = list(dict.fromkeys(mask_phone(p) for p in phones))
    placeholders = ",".join(["%s"] * len(masks))
    with closing(conn.cursor()) as cur:
        cur.execute(f"""
            SELECT id AS account_id, tel
            FROM bi_vala_app_account
            WHERE tel IN ({placeholders})
              AND status = 1
              AND deleted_at IS NULL
            ORDER BY id
        """, masks)
        rows = cur.fetchall()

    # masked -> account_id. A mask hides four digits, so several accounts can
    # share one; ORDER BY id makes the kept account deterministic.
    masked_to_account = {}
    ambiguous = set()
    for aid, masked in rows:
        if masked not in masked_to_account:
            masked_to_account[masked] = aid
        elif masked_to_account[masked] != aid:
            ambiguous.add(masked)
    if ambiguous:
        print(f" 警告: {len(ambiguous)} 个脱敏手机号匹配到多个账号,已取 id 最小的账号")

    # plaintext phone -> account_id
    phone_to_account = {}
    for p in phones:
        m = mask_phone(p)
        if m in masked_to_account:
            phone_to_account[p] = masked_to_account[m]

    account_ids = list(dict.fromkeys(phone_to_account.values()))
    unmatched = [p for p in phones if p not in phone_to_account]
    return phone_to_account, account_ids, unmatched


def aggregate_orders(df_orders, refund_map):
    """Collapse order rows into one row per account.

    Args:
        df_orders: order rows with at least ``account_id``,
            ``pay_success_date``, ``goods_name``, ``pay_amount_int``,
            ``key_from`` and ``trade_no`` columns.
        refund_map: ``{trade_no: refunded amount}`` in the same unit as
            ``pay_amount_int`` (divided by 100 for display).

    Returns:
        A DataFrame with exactly the ORDER_AGG_COLUMNS columns. The columns
        are present even when *df_orders* is empty, so callers can merge and
        fill unconditionally.
    """
    def join_non_null(values, fmt=str):
        return ";".join(fmt(v) for v in values if pd.notna(v))

    # NOTE(review): null values are dropped per column, so the ';'-separated
    # lists of one account are not guaranteed to line up position by position.
    records = []
    for account_id, grp in df_orders.groupby("account_id"):
        refunds = [refund_map[tn] for tn in grp["trade_no"] if tn in refund_map]
        records.append({
            "account_id": account_id,
            "购买时间": join_non_null(grp["pay_success_date"]),
            "购买课包名称": join_non_null(grp["goods_name"]),
            "支付金额": join_non_null(grp["pay_amount_int"], lambda v: str(v / 100)),
            "购买渠道key_from": join_non_null(grp["key_from"]),
            "是否退款": "是" if refunds else "否",
            "退款金额": sum(refunds) / 100.0,
        })
    out = pd.DataFrame(records, columns=ORDER_AGG_COLUMNS)
    # Fixed dtypes keep the later merge/fillna well-defined for an empty frame.
    return out.astype({"account_id": "int64", "退款金额": "float64"})


def _load_user_id_input(df_input):
    """Return ``(account_ids, df_input_map)`` for an input with 「用户ID」.

    ``df_input_map`` has one row per valid user ID (column ``用户ID_int``)
    plus whichever CLUE_COLUMNS the input provides. Values that are not
    valid IDs are reported once each and skipped.
    """
    parsed = [parse_user_id(v) for v in df_input["用户ID"]]

    reported = set()
    for raw, uid in zip(df_input["用户ID"], parsed):
        if uid is None and pd.notna(raw) and raw not in reported:
            reported.add(raw)
            print(f" 跳过非数字用户ID: {raw}")

    # .copy() so the new column is added to an independent frame, not a slice.
    valid = df_input.loc[[uid is not None for uid in parsed]].copy()
    valid["用户ID_int"] = [uid for uid in parsed if uid is not None]
    account_ids = list(dict.fromkeys(valid["用户ID_int"].tolist()))

    clue_cols = [c for c in CLUE_COLUMNS if c in valid.columns]
    df_input_map = valid[["用户ID_int"] + clue_cols].drop_duplicates(subset="用户ID_int")
    return account_ids, df_input_map


def _build_phone_clue_map(df_input):
    """Return a 「手机号」 -> lead-info frame, or ``None`` without a phone column."""
    phone_col = find_phone_column(df_input)
    if phone_col is None:
        return None
    clue_cols = [c for c in CLUE_COLUMNS if c in df_input.columns]
    df_map = df_input[clue_cols].copy()
    df_map.insert(0, "手机号", df_input[phone_col].map(normalize_phone))
    return df_map.drop_duplicates(subset="手机号")


def _fetch_play_records(conn, char_id_set):
    """Return ``(char_play, total_rows)`` read from the 8 play-record shards.

    ``char_play`` maps character_id -> {chapter_id: earliest completion time}
    for the characters in *char_id_set*; ``total_rows`` counts every row the
    shards returned. A failing shard is reported and skipped.
    """
    chapter_ids = tuple(ALL_CHAPTERS)
    char_play = defaultdict(dict)
    total_play = 0
    # NOTE(review): rows of every user are fetched and filtered here; adding a
    # user_id filter to the SQL would cut the transfer but would also change
    # the total printed by the caller.
    for tbl_idx in range(8):
        table = f"bi_user_chapter_play_record_{tbl_idx}"
        sql = f"""
            SELECT user_id, chapter_id, MIN(updated_at) AS done_time
            FROM {table}
            WHERE chapter_id IN %s
              AND play_status = 1
              AND deleted_at IS NULL
            GROUP BY user_id, chapter_id
        """
        try:
            with closing(conn.cursor()) as cur:
                cur.execute(sql, (chapter_ids,))
                rows = cur.fetchall()
        except Exception as e:
            # A failed statement aborts the open transaction; without a
            # rollback every later query on this connection would fail too.
            conn.rollback()
            print(f" 警告: {table} 查询失败: {e}")
            continue
        total_play += len(rows)
        for user_id, ch_id, done_time in rows:
            if user_id in char_id_set and ch_id in ALL_CHAPTERS:
                char_play[user_id][ch_id] = done_time
    return char_play, total_play


def _fetch_refunds(conn, trade_nos):
    """Return ``{trade_no: total refunded amount}`` for completed refunds."""
    refund_map = {}
    # Query in batches so the IN clause stays a reasonable size.
    for i in range(0, len(trade_nos), 500):
        batch = trade_nos[i:i + 500]
        ph = ",".join(["%s"] * len(batch))
        with closing(conn.cursor()) as cur:
            cur.execute(
                f"SELECT trade_no, SUM(refund_amount_int) FROM bi_refund_order WHERE trade_no IN ({ph}) AND status = 3 AND deleted_at IS NULL GROUP BY trade_no",
                batch,
            )
            for trade_no, amt in cur.fetchall():
                # SUM() may come back as Decimal or NULL; keep a plain int so
                # later float arithmetic cannot raise TypeError.
                refund_map[trade_no] = int(amt or 0)
    return refund_map


def main():
    """Build the report for the Excel file given as ``sys.argv[1]``.

    Falls back to DEFAULT_INPUT_FILE when no argument is given and writes
    ``../output/销售线索_用户分析.xlsx`` relative to this file. Exits with
    status 1 when the input yields no usable account.
    """
    input_file = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_INPUT_FILE
    print(f"读取输入文件: {input_file}")
    df_input = pd.read_excel(input_file, dtype=str)

    # ── 0. Input mode: user IDs or phone numbers ──
    phone_to_account = {}  # plaintext phone -> account_id (phone mode only)
    unmatched_phones = []
    use_phone_mode = "用户ID" not in df_input.columns

    if not use_phone_mode:
        print("检测到「用户ID」列,使用用户ID直接匹配")
        account_ids, df_input_map = _load_user_id_input(df_input)
        print(f"共 {len(account_ids)} 个用户ID")
        if not account_ids:
            # An empty list would otherwise produce an invalid "IN ()" query.
            print("ERROR: 「用户ID」列中没有有效的用户ID,无法继续")
            sys.exit(1)
    else:
        print("未检测到「用户ID」列,尝试识别手机号...")
        phones = extract_phones_from_df(df_input)
        if not phones:
            print("ERROR: 既没有「用户ID」列,也没有找到手机号,无法继续")
            sys.exit(1)
        print(f"提取到 {len(phones)} 个手机号(去重后)")

        conn = get_conn()
        try:
            phone_to_account, account_ids, unmatched_phones = match_phones_to_accounts(phones, conn)
        finally:
            conn.close()

        print(f"匹配到 {len(account_ids)} 个账号,未匹配 {len(unmatched_phones)} 个手机号")
        if unmatched_phones:
            print(f" 未匹配手机号: {unmatched_phones}")
        if not account_ids:
            print("ERROR: 没有任何手机号匹配到账号,无法继续")
            sys.exit(1)

        df_input_map = _build_phone_clue_map(df_input)

    # ── Shared part: query everything else by account_ids ──
    placeholders = ",".join(["%s"] * len(account_ids))
    conn = get_conn()
    try:
        # ── 1. Accounts ──
        df_accounts = pd.read_sql_query(
            f"SELECT id AS account_id, created_at AS reg_time FROM bi_vala_app_account WHERE id IN ({placeholders}) AND status = 1",
            conn, params=account_ids
        )
        print(f" 有效账户: {len(df_accounts)}")

        # ── 2. Characters (those with an empty nickname are excluded) ──
        df_chars = pd.read_sql_query(
            f"SELECT id AS character_id, account_id, nickname, created_at AS char_created_at FROM bi_vala_app_character WHERE account_id IN ({placeholders}) AND (nickname IS NOT NULL AND nickname != '') AND deleted_at IS NULL",
            conn, params=account_ids
        )
        print(f" 有效角色: {len(df_chars)}")

        if df_chars.empty:
            print("没有有效的角色,退出")
            return

        # ── 3. Lesson completion records (10 chapter_ids, 8 shard tables) ──
        char_play, total_play = _fetch_play_records(conn, set(df_chars["character_id"].tolist()))
        print(f" 课时完成记录: {total_play} 条, 匹配角色: {len(char_play)}")

        # ── 4. Orders ──
        df_orders = pd.read_sql_query(
            f"""
            SELECT o.account_id, o.pay_success_date, o.goods_name, o.pay_amount_int,
                   o.key_from, o.trade_no, o.order_status
            FROM bi_vala_order o
            INNER JOIN bi_vala_app_account a ON o.account_id = a.id AND a.status = 1
            WHERE o.account_id IN ({placeholders})
              AND o.deleted_at IS NULL
            ORDER BY o.account_id, o.pay_success_date
            """,
            conn, params=account_ids
        )
        print(f" 订单记录: {len(df_orders)}")

        # ── 5. Refunds ──
        refund_map = _fetch_refunds(conn, df_orders["trade_no"].dropna().unique().tolist())
    finally:
        conn.close()
    print(f" 退费记录: {len(refund_map)} 条")

    # ── 6. Assemble one row per character ──
    df_chars = df_chars.merge(df_accounts, on="account_id", how="left")

    # Drop timezone info (Excel cannot store timezone-aware datetimes).
    for col in ["reg_time", "char_created_at"]:
        if col in df_chars.columns:
            df_chars[col] = pd.to_datetime(df_chars[col]).dt.tz_localize(None)

    rows = []
    for _, char_row in df_chars.iterrows():
        char_id = int(char_row["character_id"])
        play_map = char_play.get(char_id, {})
        row_data = {
            "用户ID": int(char_row["account_id"]),
            "角色ID": char_id,
            "用户注册时间": char_row["reg_time"],
            "角色创建时间": char_row["char_created_at"],
        }
        for col_label, ch_id in COLUMN_TO_CHAPTER.items():
            done_time = play_map.get(ch_id)
            if done_time is not None:
                done_time = done_time.replace(tzinfo=None)
            row_data[col_label] = done_time
        rows.append(row_data)

    df_result = pd.DataFrame(rows)

    # ── 7. Merge orders & refunds (aggregated per account) ──
    order_agg = aggregate_orders(df_orders, refund_map)
    df_result = df_result.merge(order_agg, left_on="用户ID", right_on="account_id", how="left")
    df_result.drop(columns=["account_id"], inplace=True, errors="ignore")

    # ── Merge lead info and, in phone mode, the phone number ──
    if use_phone_mode:
        # account_id -> plaintext phone (the first phone that matched it)
        account_to_phone = {}
        for phone, aid in phone_to_account.items():
            account_to_phone.setdefault(aid, phone)
        df_result["手机号"] = df_result["用户ID"].map(account_to_phone)
        if df_input_map is not None and any(c in df_input_map.columns for c in CLUE_COLUMNS):
            df_result = df_result.merge(df_input_map, on="手机号", how="left")
    else:
        df_result = df_result.merge(df_input_map, left_on="用户ID", right_on="用户ID_int", how="left")
        df_result.drop(columns=["用户ID_int"], inplace=True, errors="ignore")

    # Fill missing values.
    for col in ["购买时间", "购买课包名称", "支付金额", "购买渠道key_from"]:
        df_result[col] = df_result[col].fillna("")
    df_result["是否退款"] = df_result["是否退款"].fillna("否")
    df_result["退款金额"] = df_result["退款金额"].fillna(0.0)
    for col in CLUE_COLUMNS:
        df_result[col] = df_result[col].fillna("") if col in df_result.columns else ""
    if use_phone_mode and "手机号" in df_result.columns:
        df_result["手机号"] = df_result["手机号"].fillna("")

    # Sort by user ID, then character ID, ascending.
    df_result = df_result.sort_values(by=["用户ID", "角色ID"], ascending=True).reset_index(drop=True)

    # Column order; only columns that actually exist are kept.
    leading = ["用户ID", "手机号"] if use_phone_mode else ["用户ID"]
    col_order = leading + ["线索进线日期", "销售", "角色ID", "用户注册时间", "角色创建时间"] + COL_ORDER + ["购买时间", "购买课包名称", "支付金额", "购买渠道key_from", "是否退款", "退款金额"]
    col_order = [c for c in col_order if c in df_result.columns]
    df_result = df_result[col_order]

    # ── 8. Output ──
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "销售线索_用户分析.xlsx")

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        df_result.to_excel(writer, sheet_name="用户分析", index=False)

    print(f"\n✅ 报表已生成: {output_path}")
    print(f" 总行数: {len(df_result)}")
    purchased = sum(1 for v in df_result["购买时间"] if v)
    print(f" 有购买记录: {purchased}")
    refunded = sum(1 for v in df_result["是否退款"] if v == "是")
    print(f" 有退款: {refunded}")
    done_l1 = sum(1 for v in df_result["L1_U00_L01_完成时间"] if pd.notna(v))
    done_l2 = sum(1 for v in df_result["L2_U00_L01_完成时间"] if pd.notna(v))
    print(f" 完成L1序章(U00 L01): {done_l1} 个角色")
    print(f" 完成L2序章(U00 L01): {done_l2} 个角色")

    if unmatched_phones:
        print(f"\n⚠️ 未匹配到账号的手机号 ({len(unmatched_phones)} 个):")
        for p in unmatched_phones:
            print(f" {p}")


if __name__ == "__main__":
    main()