ai_member_xiaoxi/scripts/ti_pool_split_20260608.py
2026-06-09 08:00:01 +08:00

147 lines
6.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
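TI settled-pool / new-intake split - lead detail enrichment.
Requirement: take the rows of xiaoxi_xhs_lead_detail.csv whose intake month is March-May, then join bi_vala_app_account to fill in the extra fields.
Output: ti_pool_split_20260608.csv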
"""
import csv
import os
from collections import defaultdict
from datetime import datetime, date

import psycopg2
# ── Configuration ────────────────────────────────────────────
LEAD_DETAIL_CSV = '/root/.openclaw/workspace/tmp/xiaoxi_xhs_lead_detail.csv'
OUTPUT_CSV = '/root/.openclaw/workspace/output/ti_pool_split_20260608.csv'
TARGET_MONTHS = ('2026-03', '2026-04', '2026-05')
EXTRA_FIELDS = [
    'create_time',
    'key_from',
    'register_before_intake',
    'days_register_to_intake',
    'prior_lead_same_phone',
]
BATCH_SIZE = 500

# Values written to the two yes/no columns. They must be distinct so the
# summary can tell "yes", "no" and "unknown" apart.
# NOTE(review): the literals mirror the labels printed in the summary;
# confirm they match what downstream consumers of the CSV expect.
YES = '是'
NO = '否'
UNKNOWN = ''


def clean_field(row, key):
    """Return ``row[key]`` stripped, or '' when the key is missing or None.

    ``csv.DictReader`` fills columns missing from a short row with None,
    so a plain ``row.get(key, '').strip()`` would raise on such rows.
    """
    return (row.get(key) or '').strip()


def parse_uid(raw):
    """Convert a user-ID string to int, or return None when it is not numeric.

    ``str.isdecimal`` is used because ``str.isdigit`` also accepts
    characters such as superscript digits that ``int()`` rejects.
    """
    return int(raw) if raw.isdecimal() else None


def read_rows(path):
    """Read a CSV file and return ``(fieldnames, rows)``.

    ``fieldnames`` is the header as a list (empty for an empty file) and
    ``rows`` is a list of dicts, one per data row.
    """
    with open(path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        rows = list(reader)
        return list(reader.fieldnames or []), rows


def connect():
    """Open a connection to the vala_bi PostgreSQL database."""
    # NOTE(review): the fallback password is kept only so existing runs keep
    # working; set VALA_BI_PASSWORD, then rotate and remove the literal.
    return psycopg2.connect(
        host='bj-postgres-16pob4sg.sql.tencentcdb.com',
        port=28591,
        user='ai_member',
        password=os.environ.get('VALA_BI_PASSWORD', 'LdfjdjL83h3h3^$&**YGG*'),
        dbname='vala_bi',
    )


def fetch_accounts(user_ids, batch_size=BATCH_SIZE):
    """Look up ``bi_vala_app_account`` rows for the given user IDs.

    Returns a dict ``id -> {'created_at': ..., 'key_from': ...}`` where a
    NULL ``key_from`` is stored as ''. IDs are queried in batches of
    ``batch_size``. No connection is opened when ``user_ids`` is empty, and
    the connection is always closed, even if a query fails.
    """
    account_map = {}
    uid_list = list(user_ids)
    if not uid_list:
        return account_map

    conn = connect()
    try:
        with conn.cursor() as cur:
            for start in range(0, len(uid_list), batch_size):
                batch = uid_list[start:start + batch_size]
                # Only "%s" markers are interpolated; the values themselves
                # are passed as query parameters.
                placeholders = ','.join(['%s'] * len(batch))
                cur.execute(f"""
                    SELECT id, created_at, key_from
                    FROM bi_vala_app_account
                    WHERE id IN ({placeholders})
                """, batch)
                for acct_id, created_at, key_from in cur.fetchall():
                    account_map[acct_id] = {
                        'created_at': created_at,
                        'key_from': key_from or '',
                    }
    finally:
        conn.close()
    return account_map


def earliest_month_by_phone(rows):
    """Map each phone number to the smallest intake month seen for it.

    Rows with an empty phone or an empty intake month are ignored. Months
    are compared as strings.
    """
    earliest = {}
    for row in rows:
        phone = clean_field(row, '手机号')
        month = clean_field(row, '进线月')
        if not phone or not month:
            continue
        if phone not in earliest or month < earliest[phone]:
            earliest[phone] = month
    return earliest


def to_date(value):
    """Return ``value.date()`` if available, else parse ``str(value)[:10]``."""
    if hasattr(value, 'date'):
        return value.date()
    return datetime.strptime(str(value)[:10], '%Y-%m-%d').date()


def enrich_lead(row, account_map, phone_earliest_month):
    """Return a copy of ``row`` extended with the five derived columns.

    - ``create_time`` / ``key_from``: taken from ``account_map`` by user ID.
    - ``register_before_intake``: YES when the account was created before
      the first day of the intake month, NO otherwise, UNKNOWN when it
      cannot be computed.
    - ``days_register_to_intake``: days from account creation to the intake
      date, or '' when it cannot be computed.
    - ``prior_lead_same_phone``: YES when the same phone has an earlier
      intake month in ``phone_earliest_month``, NO otherwise, UNKNOWN when
      the phone or the intake month is empty.
    """
    uid = parse_uid(clean_field(row, '用户ID'))
    phone = clean_field(row, '手机号')
    intake_month = clean_field(row, '进线月')
    intake_date_str = clean_field(row, '进线日期')

    acct = account_map.get(uid, {})
    create_time = acct.get('created_at')
    key_from = acct.get('key_from', '')

    register_before_intake = UNKNOWN
    days_register_to_intake = ''
    if create_time and intake_month:
        try:
            intake_first_day = datetime.strptime(intake_month + '-01', '%Y-%m-%d').date()
            create_date = to_date(create_time)
            register_before_intake = YES if create_date < intake_first_day else NO
            if intake_date_str:
                intake_date = datetime.strptime(intake_date_str, '%Y-%m-%d').date()
                days_register_to_intake = (intake_date - create_date).days
        except (ValueError, TypeError):
            # Best effort: an unparseable date leaves the not-yet-computed
            # fields empty instead of aborting the whole export.
            pass

    prior_lead_same_phone = UNKNOWN
    if phone and intake_month and phone in phone_earliest_month:
        prior_lead_same_phone = YES if phone_earliest_month[phone] < intake_month else NO

    out = dict(row)
    out['create_time'] = str(create_time)[:19] if create_time else ''
    out['key_from'] = key_from
    out['register_before_intake'] = register_before_intake
    out['days_register_to_intake'] = days_register_to_intake
    out['prior_lead_same_phone'] = prior_lead_same_phone
    return out


def count_where(rows, column, value):
    """Count the rows whose ``column`` equals ``value``."""
    return sum(1 for r in rows if r[column] == value)


def main():
    """Run the export: read leads, enrich them, write the CSV, print a summary."""
    # ── 1. Read the lead detail file once ────────────────────
    fieldnames, all_leads = read_rows(LEAD_DETAIL_CSV)
    leads = [row for row in all_leads if clean_field(row, '进线月') in TARGET_MONTHS]
    print(f"进线月 3-5 月共 {len(leads)} 条线索")

    # ── 2. Collect the distinct numeric user IDs ─────────────
    user_ids = set()
    for row in leads:
        uid = parse_uid(clean_field(row, '用户ID'))
        if uid is not None:
            user_ids.add(uid)
    print(f"去重用户ID: {len(user_ids)}")

    # ── 3. Query bi_vala_app_account ─────────────────────────
    account_map = fetch_accounts(user_ids)
    print(f"bi_vala_app_account 匹配到 {len(account_map)} 个用户")

    # ── 4. Earliest intake month per phone ───────────────────
    # Uses every row of the file, not only the target months, so a lead
    # from an earlier month counts as a prior lead.
    phone_earliest_month = earliest_month_by_phone(all_leads)
    print(f"去重手机号: {len(phone_earliest_month)}")

    # ── 5. Assemble the output rows ──────────────────────────
    output_rows = [enrich_lead(row, account_map, phone_earliest_month) for row in leads]

    # ── 6. Write the CSV ─────────────────────────────────────
    # The header comes from the input file rather than from the first lead,
    # so an empty selection still produces a valid header-only file.
    with open(OUTPUT_CSV, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames + EXTRA_FIELDS)
        writer.writeheader()
        writer.writerows(output_rows)
    print(f"\n输出: output/ti_pool_split_20260608.csv, {len(output_rows)}")

    # ── 7. Summary ───────────────────────────────────────────
    reg_before = count_where(output_rows, 'register_before_intake', YES)
    reg_after = count_where(output_rows, 'register_before_intake', NO)
    reg_unknown = count_where(output_rows, 'register_before_intake', UNKNOWN)
    prior_yes = count_where(output_rows, 'prior_lead_same_phone', YES)
    prior_no = count_where(output_rows, 'prior_lead_same_phone', NO)
    prior_unknown = count_where(output_rows, 'prior_lead_same_phone', UNKNOWN)
    print("\n── 统计摘要 ──")
    print(f"注册早于进线月: 是={reg_before}, 否={reg_after}, 未知={reg_unknown}")
    print(f"同手机号早前进线: 是={prior_yes}, 否={prior_no}, 未知={prior_unknown}")


if __name__ == "__main__":
    main()