#!/usr/bin/env python3
"""
TI settled/new-lead split - lead detail enrichment.

Requirement: take the leads in xiaoxi_xhs_lead_detail.csv whose intake month
(column '进线月') is 2026-03 .. 2026-05 and join them against
bi_vala_app_account to fill in the extra fields.
Output: ti_pool_split_20260608.csv
"""

import csv
import os
from collections import Counter, defaultdict
from datetime import datetime, date

LEADS_CSV = '/root/.openclaw/workspace/tmp/xiaoxi_xhs_lead_detail.csv'
OUTPUT_CSV = '/root/.openclaw/workspace/output/ti_pool_split_20260608.csv'

# Intake months kept in the output.
TARGET_MONTHS = ('2026-03', '2026-04', '2026-05')

# Number of account ids sent per IN (...) query.
BATCH_SIZE = 500

# Columns appended after the original CSV columns, in output order.
EXTRA_FIELDS = [
    'create_time',
    'key_from',
    'register_before_intake',
    'days_register_to_intake',
    'prior_lead_same_phone',
]

# NOTE(review): the database password is hard-coded in source. It can be
# overridden with the VALA_BI_PASSWORD environment variable; the literal
# default is kept only so existing runs keep working and should be removed
# (and the credential rotated) once the environment provides it.
DB_CONFIG = {
    'host': 'bj-postgres-16pob4sg.sql.tencentcdb.com',
    'port': 28591,
    'user': 'ai_member',
    'password': os.environ.get('VALA_BI_PASSWORD', 'LdfjdjL83h3h3^$&**YGG*'),
    'dbname': 'vala_bi',
}


def clean(row: dict, key: str) -> str:
    """Return ``row[key]`` stripped of surrounding whitespace.

    A missing column, or the ``None`` that ``csv.DictReader`` stores for a
    row shorter than the header, is returned as ``''``.
    """
    return (row.get(key) or '').strip()


def parse_user_id(text: str):
    """Return ``text`` as an int, or ``None`` when it is not a decimal number.

    ``str.isdecimal`` is used rather than ``str.isdigit`` because the latter
    also accepts characters such as superscript digits that ``int()`` rejects.
    """
    return int(text) if text.isdecimal() else None


def read_leads(path: str):
    """Read the lead CSV once and return ``(header_fields, rows)``.

    ``header_fields`` is the de-duplicated header in file order (an empty
    list for an empty file); ``rows`` is the list of ``DictReader`` rows.
    """
    with open(path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        rows = list(reader)
        header_fields = list(dict.fromkeys(reader.fieldnames or []))
    return header_fields, rows


def collect_user_ids(rows) -> set:
    """Return the set of numeric user ids found in column '用户ID'."""
    parsed = (parse_user_id(clean(row, '用户ID')) for row in rows)
    return {uid for uid in parsed if uid is not None}


def fetch_accounts(user_ids, batch_size: int = BATCH_SIZE) -> dict:
    """Query bi_vala_app_account for the given ids.

    Returns a dict ``id -> {'created_at': ..., 'key_from': ...}`` where a
    NULL ``key_from`` is stored as ``''``. No connection is opened when
    ``user_ids`` is empty. The connection and cursor are always closed,
    even if a query fails.
    """
    account_map = {}
    uid_list = sorted(user_ids)
    if not uid_list:
        return account_map

    # Imported here so the pure helpers in this file can be imported without
    # the database driver being installed.
    import psycopg2

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # A psycopg2 cursor used as a context manager is closed on exit;
        # the connection itself is closed explicitly in ``finally``.
        with conn.cursor() as cur:
            for start in range(0, len(uid_list), batch_size):
                batch = uid_list[start:start + batch_size]
                # Only '%s' markers are interpolated into the SQL text; the
                # ids themselves are passed as bound parameters.
                placeholders = ','.join(['%s'] * len(batch))
                cur.execute(
                    f"""
                    SELECT id, created_at, key_from
                    FROM bi_vala_app_account
                    WHERE id IN ({placeholders})
                    """,
                    batch,
                )
                for acct_id, created_at, key_from in cur.fetchall():
                    account_map[acct_id] = {
                        'created_at': created_at,
                        'key_from': key_from or '',
                    }
    finally:
        conn.close()
    return account_map


def earliest_month_by_phone(rows) -> dict:
    """Return ``phone -> earliest intake month`` over all rows.

    Rows with an empty phone ('手机号') or intake month ('进线月') are
    skipped. Months are compared as strings.
    """
    earliest = {}
    for row in rows:
        phone = clean(row, '手机号')
        month = clean(row, '进线月')
        if not (phone and month):
            continue
        if phone not in earliest or month < earliest[phone]:
            earliest[phone] = month
    return earliest


def registration_fields(create_time, intake_month: str, intake_date_str: str):
    """Return ``(register_before_intake, days_register_to_intake)``.

    ``register_before_intake`` is '是' when the registration date is before
    the first day of the intake month, '否' otherwise.
    ``days_register_to_intake`` is the day count from registration to the
    intake date. Either value is ``''`` when its inputs are missing or
    cannot be parsed.
    """
    if not (create_time and intake_month):
        return '', ''
    try:
        intake_first_day = datetime.strptime(intake_month + '-01', '%Y-%m-%d').date()
        if hasattr(create_time, 'date'):
            create_date = create_time.date()
        else:
            create_date = datetime.strptime(str(create_time)[:10], '%Y-%m-%d').date()
    except ValueError:
        # Unparseable month or registration timestamp: leave both blank.
        return '', ''

    before = '是' if create_date < intake_first_day else '否'

    days = ''
    if intake_date_str:
        try:
            intake_date = datetime.strptime(intake_date_str, '%Y-%m-%d').date()
        except ValueError:
            # Bad intake date only blanks the day count, not the flag above.
            days = ''
        else:
            days = (intake_date - create_date).days
    return before, days


def enrich_lead(row: dict, account_map: dict, phone_earliest_month: dict) -> dict:
    """Return a copy of ``row`` with the five enrichment columns added."""
    uid = parse_user_id(clean(row, '用户ID'))
    phone = clean(row, '手机号')
    intake_month = clean(row, '进线月')
    intake_date_str = clean(row, '进线日期')

    acct = account_map.get(uid, {})
    create_time = acct.get('created_at')

    register_before_intake, days_register_to_intake = registration_fields(
        create_time, intake_month, intake_date_str
    )

    # Whether the same phone already had a lead in a month before this one.
    prior_lead_same_phone = ''
    if phone and intake_month and phone in phone_earliest_month:
        earlier = phone_earliest_month[phone] < intake_month
        prior_lead_same_phone = '是' if earlier else '否'

    out = dict(row)
    out['create_time'] = str(create_time)[:19] if create_time else ''
    out['key_from'] = acct.get('key_from', '')
    out['register_before_intake'] = register_before_intake
    out['days_register_to_intake'] = days_register_to_intake
    out['prior_lead_same_phone'] = prior_lead_same_phone
    return out


def main(leads_csv: str = LEADS_CSV, output_csv: str = OUTPUT_CSV) -> None:
    """Run the whole pipeline: read, query, enrich, write, summarise."""
    # 1. Read the lead details once; keep only the target intake months.
    header_fields, all_leads = read_leads(leads_csv)
    leads = [row for row in all_leads if clean(row, '进线月') in TARGET_MONTHS]
    print(f"进线月 3-5 月共 {len(leads)} 条线索")

    # 2. Collect the distinct user ids.
    user_ids = collect_user_ids(leads)
    print(f"去重用户ID: {len(user_ids)} 个")

    # 3. Look the ids up in bi_vala_app_account.
    account_map = fetch_accounts(user_ids)
    print(f"bi_vala_app_account 匹配到 {len(account_map)} 个用户")

    # 4. Earliest intake month per phone, over ALL leads (not just 3-5).
    phone_earliest_month = earliest_month_by_phone(all_leads)
    print(f"去重手机号: {len(phone_earliest_month)} 个")

    # 5. Assemble the output rows.
    output_rows = [
        enrich_lead(row, account_map, phone_earliest_month) for row in leads
    ]

    # 6. Write the CSV. With no matching leads, fall back to the file header
    #    so a header-only file is written instead of failing on leads[0].
    base_fields = list(leads[0].keys()) if leads else header_fields
    with open(output_csv, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=base_fields + EXTRA_FIELDS)
        writer.writeheader()
        writer.writerows(output_rows)
    print(f"\n输出: output/{os.path.basename(output_csv)}, {len(output_rows)} 行")

    # 7. Summary statistics.
    reg = Counter(r['register_before_intake'] for r in output_rows)
    prior = Counter(r['prior_lead_same_phone'] for r in output_rows)
    print("\n── 统计摘要 ──")
    print(f"注册早于进线月: 是={reg['是']}, 否={reg['否']}, 未知={reg['']}")
    print(f"同手机号早前进线: 是={prior['是']}, 否={prior['否']}, 未知={prior['']}")


if __name__ == '__main__':
    main()