147 lines
6.2 KiB
Python
147 lines
6.2 KiB
Python
#!/usr/bin/env python3
|
||
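"""
TI pool split (previously accumulated vs. newly acquired) - lead detail enrichment.

Requirement: take the rows of xiaoxi_xhs_lead_detail.csv whose intake month is
March-May and join bi_vala_app_account to fill in the extra fields.
Output: ti_pool_split_20260608.csv
"""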
|
||
|
||
import csv
|
||
import psycopg2
|
||
from datetime import datetime, date
|
||
from collections import defaultdict
|
||
|
||
# ── 1. Load the lead details ─────────────────────────────────
# Read the lead CSV and keep only the rows whose intake month ('进线月')
# is 2026-03, 2026-04 or 2026-05.
with open('/root/.openclaw/workspace/tmp/xiaoxi_xhs_lead_detail.csv', 'r', encoding='utf-8') as f:
    leads = [
        record
        for record in csv.DictReader(f)
        if record.get('进线月', '').strip() in ('2026-03', '2026-04', '2026-05')
    ]

print(f"进线月 3-5 月共 {len(leads)} 条线索")
|
||
|
||
# ── 2. Collect every user ID ─────────────────────────────────
# Gather the distinct '用户ID' values of the filtered leads as integers;
# blank or non-numeric values are skipped.
user_ids = {
    int(raw_id)
    for raw_id in (lead.get('用户ID', '').strip() for lead in leads)
    if raw_id and raw_id.isdigit()
}

print(f"去重用户ID: {len(user_ids)} 个")
|
||
|
||
# ── 3. Query bi_vala_app_account ─────────────────────────────
# The password is deliberately NOT passed here.  libpq (the client library
# psycopg2 wraps) resolves it from the PGPASSWORD environment variable or
# from ~/.pgpass, which keeps the secret out of the source file.  The
# credential that used to be hard-coded at this spot must be treated as
# leaked and rotated.
conn = psycopg2.connect(
    host='bj-postgres-16pob4sg.sql.tencentcdb.com',
    port=28591,
    user='ai_member',
    dbname='vala_bi',
)
cur = conn.cursor()

# Batched lookup; account_map maps id -> {created_at, key_from}.
account_map = {}
batch_size = 500
uid_list = list(user_ids)
try:
    for i in range(0, len(uid_list), batch_size):
        batch = uid_list[i:i + batch_size]
        # Only '%s' markers are interpolated into the SQL text; the IDs
        # themselves are bound as query parameters by the driver.
        placeholders = ','.join(['%s'] * len(batch))
        cur.execute(f"""
            SELECT id, created_at, key_from
            FROM bi_vala_app_account
            WHERE id IN ({placeholders})
        """, batch)
        for acct_id, acct_created_at, acct_key_from in cur.fetchall():
            account_map[acct_id] = {
                'created_at': acct_created_at,
                # A NULL key_from is normalised to an empty string.
                'key_from': acct_key_from or '',
            }
except Exception:
    # Do not leak the cursor/connection when a batch fails; on success both
    # stay open and are closed by the clean-up at the end of the script.
    cur.close()
    conn.close()
    raise

print(f"bi_vala_app_account 匹配到 {len(account_map)} 个用户")
|
||
|
||
# ── 4. Build the lookup for prior_lead_same_phone ────────────
# Re-read the complete lead file (every intake month, e.g. February and June
# rows too) and remember the earliest intake month seen per phone number.
with open('/root/.openclaw/workspace/tmp/xiaoxi_xhs_lead_detail.csv', 'r', encoding='utf-8') as f:
    all_leads = list(csv.DictReader(f))

phone_earliest_month = {}
for lead in all_leads:
    phone = lead.get('手机号', '').strip()
    intake_month = lead.get('进线月', '').strip()
    # Rows missing either the phone number or the intake month are ignored.
    if not (phone and intake_month):
        continue
    known_month = phone_earliest_month.get(phone)
    # 'YYYY-MM' strings order chronologically under plain string comparison.
    if known_month is None or intake_month < known_month:
        phone_earliest_month[phone] = intake_month

print(f"去重手机号: {len(phone_earliest_month)} 个")
|
||
|
||
# ── 5. Assemble the output rows ──────────────────────────────
def _to_date(value):
    """Return *value* as a ``date``.

    Objects exposing ``.date()`` (e.g. ``datetime``) are converted directly;
    anything else is parsed from the first 10 characters of its string form,
    which must look like 'YYYY-MM-DD'.
    """
    if hasattr(value, 'date'):
        return value.date()
    return datetime.strptime(str(value)[:10], '%Y-%m-%d').date()


def _registration_fields(create_time, intake_month, intake_date_str):
    """Return ``(register_before_intake, days_register_to_intake)`` for a lead.

    register_before_intake is '是' when the account was created before the
    first day of the intake month and '否' otherwise.
    days_register_to_intake is the number of days from account creation to
    the intake date.  Either value is '' when its inputs are missing or
    cannot be parsed as dates.
    """
    if not (create_time and intake_month):
        return '', ''
    try:
        intake_first_day = datetime.strptime(intake_month + '-01', '%Y-%m-%d').date()
        create_date = _to_date(create_time)
    except (ValueError, TypeError):
        # Unparseable dates leave both fields blank; unrelated errors are no
        # longer hidden by a bare ``except``.
        return '', ''

    before_flag = '是' if create_date < intake_first_day else '否'
    if not intake_date_str:
        return before_flag, ''
    try:
        intake_date = datetime.strptime(intake_date_str, '%Y-%m-%d').date()
    except ValueError:
        # A malformed intake date only blanks the day count; the flag
        # computed above is kept.
        return before_flag, ''
    return before_flag, (intake_date - create_date).days


output_rows = []
for row in leads:
    uid_str = row.get('用户ID', '').strip()
    uid = int(uid_str) if uid_str.isdigit() else None
    phone = row.get('手机号', '').strip()
    intake_month = row.get('进线月', '').strip()
    intake_date_str = row.get('进线日期', '').strip()

    # Leads without a matching account get empty account fields.
    acct = account_map.get(uid, {})
    create_time = acct.get('created_at')
    key_from = acct.get('key_from', '')

    register_before_intake, days_register_to_intake = _registration_fields(
        create_time, intake_month, intake_date_str
    )

    # prior_lead_same_phone: did the same phone number already appear in an
    # intake month earlier than this lead's intake month?
    prior_lead_same_phone = ''
    if phone and intake_month and phone in phone_earliest_month:
        earliest = phone_earliest_month[phone]
        prior_lead_same_phone = '是' if earliest < intake_month else '否'

    out = dict(row)
    out['create_time'] = str(create_time)[:19] if create_time else ''
    out['key_from'] = key_from
    out['register_before_intake'] = register_before_intake
    out['days_register_to_intake'] = days_register_to_intake
    out['prior_lead_same_phone'] = prior_lead_same_phone
    output_rows.append(out)
|
||
|
||
# ── 6. Write the CSV ─────────────────────────────────────────
# Output columns = the input columns followed by the enrichment columns.
if leads:
    base_fields = list(leads[0].keys())
else:
    # No lead matched the month filter, so leads[0] would raise IndexError.
    # Take the column names from the input header instead and still emit a
    # header-only CSV.
    with open('/root/.openclaw/workspace/tmp/xiaoxi_xhs_lead_detail.csv', 'r', encoding='utf-8') as f:
        base_fields = next(csv.reader(f), [])

fieldnames = base_fields + ['create_time', 'key_from', 'register_before_intake', 'days_register_to_intake', 'prior_lead_same_phone']

# The file is written with the 'utf-8-sig' codec, i.e. with a leading BOM.
with open('/root/.openclaw/workspace/output/ti_pool_split_20260608.csv', 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(output_rows)

print(f"\n输出: output/ti_pool_split_20260608.csv, {len(output_rows)} 行")
|
||
|
||
# ── 7. Summary statistics ────────────────────────────────────
def _tally(column, value):
    """Count the output rows whose *column* equals *value*."""
    return len([r for r in output_rows if r[column] == value])


# '是' = yes, '否' = no, '' = could not be determined.
reg_before = _tally('register_before_intake', '是')
reg_after = _tally('register_before_intake', '否')
reg_unknown = _tally('register_before_intake', '')
prior_yes = _tally('prior_lead_same_phone', '是')
prior_no = _tally('prior_lead_same_phone', '否')
prior_unknown = _tally('prior_lead_same_phone', '')

print("\n── 统计摘要 ──")
print(f"注册早于进线月: 是={reg_before}, 否={reg_after}, 未知={reg_unknown}")
print(f"同手机号早前进线: 是={prior_yes}, 否={prior_no}, 未知={prior_unknown}")

# Release the database resources: the cursor first, then its connection.
cur.close()
conn.close()
|