#!/usr/bin/env python3 """ 小红书线索进线×成单 lag 表分析 数据源:xiaoxi_xhs_lead_detail.csv(陈逸鸫提供) 匹配:手机号→bi_vala_app_account.tel,备UID→bi_vala_app_account.id 成单口径:bi_vala_order order_status IN (3,4), pay_success_date IS NOT NULL """ import csv import sys import os sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from phone_encrypt import encrypt_phone import psycopg2 import psycopg2.extras from collections import defaultdict from datetime import datetime, date import openpyxl from openpyxl.styles import Font, Alignment, PatternFill, Border, Side # ── Database connection ── PG_CONFIG = { "host": "bj-postgres-16pob4sg.sql.tencentcdb.com", "port": 28591, "user": "ai_member", "password": "LdfjdjL83h3h3^$&**YGG*", "dbname": "vala_bi", } def get_conn(): return psycopg2.connect(**PG_CONFIG) # ── Step 1: Load CSV ── print("=" * 60) print("Step 1: Loading CSV...") leads = [] with open("/root/.openclaw/workspace/tmp/xiaoxi_xhs_lead_detail.csv", "r", encoding="utf-8-sig") as f: reader = csv.DictReader(f) for row in reader: lead_month = row.get("进线月", "").strip() if lead_month in ("2026-03", "2026-04", "2026-05"): leads.append(row) print(f" 3-5月进线线索总数: {len(leads)}") # Extract unique phones and UIDs phones = set() uids = set() for r in leads: phone = r.get("手机号", "").strip() uid = r.get("用户ID", "").strip() if phone and phone != "0": phones.add(phone) if uid and uid != "0" and uid.isdigit(): uids.add(int(uid)) print(f" 唯一手机号: {len(phones)}, 唯一UID: {len(uids)}") # ── Step 2: Match phone → account_id ── print("\nStep 2: Matching phone → bi_vala_app_account...") conn = get_conn() cur = conn.cursor() phone_to_account = {} if phones: # XXTEA encrypt phones and match against tel_encrypt phone_encrypt_map = {} for p in phones: try: phone_encrypt_map[encrypt_phone(p)] = p except Exception as e: pass encrypted_list = list(phone_encrypt_map.keys()) batch_size = 500 for i in range(0, len(encrypted_list), batch_size): batch = encrypted_list[i:i+batch_size] placeholders = ",".join(["%s"] * len(batch)) cur.execute( f"SELECT tel_encrypt, id FROM bi_vala_app_account WHERE tel_encrypt IN ({placeholders}) AND status = 1 AND deleted_at IS NULL", batch ) for tel_enc, aid in cur.fetchall(): original_phone = phone_encrypt_map.get(tel_enc) if original_phone: phone_to_account[original_phone] = aid print(f" 手机号匹配到账号: {len(phone_to_account)}/{len(phones)}") # ── Step 3: Match UID → account_id ── print("\nStep 3: Matching UID → bi_vala_app_account...") uid_to_account = {} if uids: uid_list = list(uids) for i in range(0, len(uid_list), batch_size): batch = uid_list[i:i+batch_size] placeholders = ",".join(["%s"] * len(batch)) cur.execute( f"SELECT id FROM bi_vala_app_account WHERE id IN ({placeholders}) AND status = 1 AND deleted_at IS NULL", batch ) for (aid,) in cur.fetchall(): uid_to_account[aid] = aid print(f" UID匹配到有效账号: {len(uid_to_account)}/{len(uids)}") # ── Step 4: Build lead → account_id mapping ── print("\nStep 4: Building lead→account mapping...") lead_account_map = {} # lead_row_index → account_id matched_by_phone = 0 matched_by_uid = 0 unmatched = 0 for idx, r in enumerate(leads): phone = r.get("手机号", "").strip() uid = r.get("用户ID", "").strip() aid = None match_method = None # Try phone first if phone and phone in phone_to_account: aid = phone_to_account[phone] match_method = "phone" matched_by_phone += 1 # Fallback to UID elif uid and uid.isdigit() and int(uid) in uid_to_account: aid = int(uid) match_method = "uid" matched_by_uid += 1 else: unmatched += 1 lead_account_map[idx] = {"account_id": aid, "match_method": match_method} print(f" 手机号匹配: {matched_by_phone}, UID匹配: {matched_by_uid}, 未匹配: {unmatched}") print(f" 总匹配率: {(matched_by_phone + matched_by_uid) / len(leads) * 100:.1f}%") # ── Step 5: Get all orders for matched accounts ── print("\nStep 5: Fetching orders for matched accounts...") matched_aids = set( v["account_id"] for v in lead_account_map.values() if v["account_id"] is not None ) print(f" 去重匹配账号数: {len(matched_aids)}") # Get orders with order_status 3 or 4, pay_success_date not null # Also join refund info aid_list = list(matched_aids) account_orders = defaultdict(list) # account_id → list of order dicts for i in range(0, len(aid_list), batch_size): batch = aid_list[i:i+batch_size] placeholders = ",".join(["%s"] * len(batch)) cur.execute(f""" SELECT o.account_id, o.id as order_id, o.pay_success_date, o.pay_amount_int, o.order_status, o.key_from, o.trade_no, o.out_trade_no, r.refund_amount_int, r.status as refund_status FROM bi_vala_order o LEFT JOIN bi_refund_order r ON ( (o.trade_no = r.trade_no OR o.out_trade_no = r.out_trade_no) AND r.status = 3 ) WHERE o.account_id IN ({placeholders}) AND o.pay_success_date IS NOT NULL AND o.order_status IN (3, 4) ORDER BY o.account_id, o.pay_success_date """, batch) for row in cur.fetchall(): aid, oid, psd, amt, ost, kf, tn, otn, ref_amt, ref_st = row account_orders[aid].append({ "order_id": oid, "pay_success_date": psd, "pay_amount_int": amt, "order_status": ost, "key_from": kf, "trade_no": tn, "out_trade_no": otn, "refund_amount_int": ref_amt or 0, "refund_status": ref_st, }) total_orders = sum(len(v) for v in account_orders.values()) accounts_with_orders = len(account_orders) print(f" 有订单的账号数: {accounts_with_orders}, 总订单数: {total_orders}") cur.close() conn.close() # ── Step 6: Build lag table ── print("\nStep 6: Building lag table...") # For each lead, determine lead_month and find orders # lag = (order_month - lead_month) in months def month_diff(d1, d2): """Months between two dates: d2 - d1""" return (d2.year - d1.year) * 12 + (d2.month - d1.month) # Aggregate: lead_month × lag_month # For each lead_month, count leads, and for each lag bucket count orders lag_data = defaultdict(lambda: defaultdict(lambda: { "lead_count": 0, "order_count": 0, "order_accounts": set(), "gmv": 0, "refund": 0, "gsv": 0, })) # Also track per-lead details for debugging lead_details = [] for idx, r in enumerate(leads): lead_month = r.get("进线月", "").strip() lead_date_str = r.get("进线日期", "").strip() info = lead_account_map[idx] aid = info["account_id"] # Count lead lag_data[lead_month]["total"]["lead_count"] += 1 if aid is None: lead_details.append({ "lead_month": lead_month, "lead_date": lead_date_str, "phone": r.get("手机号", "").strip(), "uid": r.get("用户ID", "").strip(), "account_id": None, "matched": False, "match_method": "", "has_order": False, "order_month": None, "lag": None, "gmv": 0, "refund": 0, "gsv": 0, }) continue # Parse lead date try: lead_dt = datetime.strptime(lead_date_str, "%Y-%m-%d").date() except: lead_dt = datetime.strptime(lead_month + "-01", "%Y-%m-%d").date() orders = account_orders.get(aid, []) if not orders: lead_details.append({ "lead_month": lead_month, "lead_date": lead_date_str, "phone": r.get("手机号", "").strip(), "uid": r.get("用户ID", "").strip(), "account_id": aid, "matched": True, "match_method": info.get("match_method", "") or "", "has_order": False, "order_month": None, "lag": None, "gmv": 0, "refund": 0, "gsv": 0, }) continue # For each order, determine lag has_any_order = False for order in orders: psd = order["pay_success_date"] if isinstance(psd, datetime): order_dt = psd.date() else: order_dt = psd lag = month_diff(lead_dt, order_dt) # Only count orders at or after lead month (lag >= 0) if lag < 0: continue has_any_order = True order_month = order_dt.strftime("%Y-%m") lag_key = f"M{lag}" lag_data[lead_month][lag_key]["order_count"] += 1 lag_data[lead_month][lag_key]["order_accounts"].add(aid) lag_data[lead_month][lag_key]["gmv"] += order["pay_amount_int"] lag_data[lead_month][lag_key]["refund"] += order["refund_amount_int"] lag_data[lead_month][lag_key]["gsv"] += (order["pay_amount_int"] - order["refund_amount_int"]) # Record first order for lead detail first_order = min( [o for o in orders if month_diff(lead_dt, o["pay_success_date"] if isinstance(o["pay_success_date"], datetime) else o["pay_success_date"]) >= 0], key=lambda o: o["pay_success_date"], default=None ) if orders else None if first_order: psd = first_order["pay_success_date"] order_dt = psd.date() if isinstance(psd, datetime) else psd first_lag = month_diff(lead_dt, order_dt) lead_details.append({ "lead_month": lead_month, "lead_date": lead_date_str, "phone": r.get("手机号", "").strip(), "uid": r.get("用户ID", "").strip(), "account_id": aid, "matched": True, "match_method": info.get("match_method", "") or "", "has_order": True, "order_month": order_dt.strftime("%Y-%m"), "lag": first_lag, "gmv": first_order["pay_amount_int"] / 100, "refund": first_order["refund_amount_int"] / 100, "gsv": (first_order["pay_amount_int"] - first_order["refund_amount_int"]) / 100, }) else: lead_details.append({ "lead_month": lead_month, "lead_date": lead_date_str, "phone": r.get("手机号", "").strip(), "uid": r.get("用户ID", "").strip(), "account_id": aid, "matched": True, "match_method": info.get("match_method", "") or "", "has_order": False, "order_month": None, "lag": None, "gmv": 0, "refund": 0, "gsv": 0, }) # ── Step 7: Print summary ── print("\n" + "=" * 60) print("Lag Table Summary") print("=" * 60) for lead_month in sorted(lag_data.keys()): print(f"\n── 进线月: {lead_month} ──") total_leads = lag_data[lead_month]["total"]["lead_count"] print(f" 留资总数: {total_leads}") all_lag_keys = sorted( [k for k in lag_data[lead_month].keys() if k.startswith("M")], key=lambda x: int(x[1:]) ) cum_orders = 0 cum_gmv = 0 cum_refund = 0 for lag_key in all_lag_keys: d = lag_data[lead_month][lag_key] cum_orders += d["order_count"] cum_gmv += d["gmv"] cum_refund += d["refund"] rate = d["order_count"] / total_leads * 100 if total_leads > 0 else 0 cum_rate = cum_orders / total_leads * 100 if total_leads > 0 else 0 print(f" {lag_key}: 成单{d['order_count']}单 " f"({len(d['order_accounts'])}人) " f"成单率{rate:.1f}% " f"累计{cum_orders}单({cum_rate:.1f}%) " f"GMV¥{d['gmv']/100:,.0f} " f"退¥{d['refund']/100:,.0f} " f"GSV¥{d['gsv']/100:,.0f}") # ── Step 8: Export to Excel ── print("\nStep 8: Exporting to Excel...") wb = openpyxl.Workbook() # ── Sheet 1: Lag Summary ── ws1 = wb.active ws1.title = "Lag汇总表" header_font = Font(bold=True, size=11) header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid") header_font_white = Font(bold=True, size=11, color="FFFFFF") thin_border = Border( left=Side(style='thin'), right=Side(style='thin'), top=Side(style='thin'), bottom=Side(style='thin') ) # Title ws1.merge_cells("A1:J1") ws1["A1"] = "小红书线索进线×成单 Lag 表(2026年3-5月)" ws1["A1"].font = Font(bold=True, size=14) ws1["A1"].alignment = Alignment(horizontal="center") ws1.merge_cells("A2:J2") ws1["A2"] = "数据源:xiaoxi_xhs_lead_detail.csv | 成单口径:bi_vala_order order_status IN (3,4) pay_success_date NOT NULL | 测试账号已剔除" ws1["A2"].font = Font(size=9, color="666666") ws1["A2"].alignment = Alignment(horizontal="center") # Headers headers = ["进线月", "留资数", "Lag", "成单数", "成单人数", "成单率%", "累计成单数", "累计成单率%", "GMV(元)", "退款(元)", "GSV(元)"] for col, h in enumerate(headers, 1): cell = ws1.cell(row=4, column=col, value=h) cell.font = header_font_white cell.fill = header_fill cell.alignment = Alignment(horizontal="center") cell.border = thin_border row = 5 for lead_month in sorted(lag_data.keys()): total_leads = lag_data[lead_month]["total"]["lead_count"] all_lag_keys = sorted( [k for k in lag_data[lead_month].keys() if k.startswith("M")], key=lambda x: int(x[1:]) ) cum_orders = 0 # First row for this lead_month for li, lag_key in enumerate(all_lag_keys): d = lag_data[lead_month][lag_key] cum_orders += d["order_count"] rate = d["order_count"] / total_leads * 100 if total_leads > 0 else 0 cum_rate = cum_orders / total_leads * 100 if total_leads > 0 else 0 values = [ lead_month if li == 0 else "", total_leads if li == 0 else "", lag_key, d["order_count"], len(d["order_accounts"]), round(rate, 1), cum_orders, round(cum_rate, 1), round(d["gmv"] / 100, 2), round(d["refund"] / 100, 2), round(d["gsv"] / 100, 2), ] for col, v in enumerate(values, 1): cell = ws1.cell(row=row, column=col, value=v) cell.border = thin_border cell.alignment = Alignment(horizontal="center") row += 1 # Total row for this lead_month total_orders = cum_orders total_rate = total_orders / total_leads * 100 if total_leads > 0 else 0 total_gmv = sum(lag_data[lead_month][k]["gmv"] for k in all_lag_keys) total_refund = sum(lag_data[lead_month][k]["refund"] for k in all_lag_keys) total_gsv = sum(lag_data[lead_month][k]["gsv"] for k in all_lag_keys) total_values = [ f"{lead_month} 合计", total_leads, "全部", total_orders, len(set().union(*[lag_data[lead_month][k]["order_accounts"] for k in all_lag_keys])), round(total_rate, 1), "", "", round(total_gmv / 100, 2), round(total_refund / 100, 2), round(total_gsv / 100, 2), ] for col, v in enumerate(total_values, 1): cell = ws1.cell(row=row, column=col, value=v) cell.font = Font(bold=True) cell.fill = PatternFill(start_color="D9E2F3", end_color="D9E2F3", fill_type="solid") cell.border = thin_border cell.alignment = Alignment(horizontal="center") row += 1 row += 1 # blank row # Adjust column widths for col in range(1, 12): ws1.column_dimensions[openpyxl.utils.get_column_letter(col)].width = 14 # ── Sheet 2: Lead Detail ── ws2 = wb.create_sheet("线索明细") detail_headers = ["进线月", "进线日期", "手机号", "用户ID", "匹配账号ID", "匹配方式", "是否成单", "首单月份", "Lag月数", "GMV(元)", "退款(元)", "GSV(元)"] for col, h in enumerate(detail_headers, 1): cell = ws2.cell(row=1, column=col, value=h) cell.font = header_font_white cell.fill = header_fill cell.alignment = Alignment(horizontal="center") cell.border = thin_border for i, ld in enumerate(lead_details, 2): match_method = ld.get("match_method", "") or "" values = [ ld["lead_month"], ld["lead_date"], ld["phone"], ld["uid"], ld["account_id"] or "", match_method, "是" if ld["has_order"] else "否", ld["order_month"] or "", ld["lag"] if ld["lag"] is not None else "", ld["gmv"], ld["refund"], ld["gsv"], ] for col, v in enumerate(values, 1): cell = ws2.cell(row=i, column=col, value=v) cell.border = thin_border cell.alignment = Alignment(horizontal="center") for col in range(1, 13): ws2.column_dimensions[openpyxl.utils.get_column_letter(col)].width = 14 # ── Sheet 3: Match Stats ── ws3 = wb.create_sheet("匹配统计") stats = [ ["指标", "数值"], ["3-5月进线线索总数", len(leads)], ["手机号匹配成功", matched_by_phone], ["UID匹配成功", matched_by_uid], ["未匹配", unmatched], ["匹配率", f"{(matched_by_phone + matched_by_uid) / len(leads) * 100:.1f}%"], ["匹配到去重账号数", len(matched_aids)], ["有订单的账号数", accounts_with_orders], ["总订单数", total_orders], ] for i, (k, v) in enumerate(stats, 1): ws3.cell(row=i, column=1, value=k).font = Font(bold=True) ws3.cell(row=i, column=2, value=v) ws3.cell(row=i, column=1).border = thin_border ws3.cell(row=i, column=2).border = thin_border ws3.column_dimensions["A"].width = 25 ws3.column_dimensions["B"].width = 20 # Save output_path = "/root/.openclaw/workspace/output/xhs_lead_lag_analysis_202603-202605.xlsx" wb.save(output_path) print(f"\n✅ 输出文件: {output_path}") print("Done!")