# ai_member_xiaoxi/scripts/process_xiaolong.py

#!/usr/bin/env python3
"""
Process 小龙 sheet: match phones to UIDs, fill trial counts, registration dates, download channels.
"""
import json
import os
import sys
import time

import psycopg2
import requests

# The local helper module lives outside the package; extend the path before importing it.
sys.path.insert(0, '/root/.openclaw/workspace/scripts')
from phone_encrypt import encrypt_phone

# === Config ===
# NOTE(review): the literal values below are credentials/identifiers committed
# to source. They are kept only as backward-compatible defaults; prefer
# supplying the matching environment variables and rotating the secrets.
PG_CONFIG = {
    'host': os.environ.get('PG_HOST', 'bj-postgres-16pob4sg.sql.tencentcdb.com'),
    'port': int(os.environ.get('PG_PORT', '28591')),
    'user': os.environ.get('PG_USER', 'ai_member'),
    'password': os.environ.get('PG_PASSWORD', 'LdfjdjL83h3h3^$&**YGG*'),
    'database': os.environ.get('PG_DATABASE', 'vala_bi'),
}

# Cached tenant access token; filled in lazily by get_feishu_token().
FEISHU_TOKEN = None
# Identifiers of the spreadsheet and the sheet (tab) this script reads and writes.
SPREADSHEET_TOKEN = os.environ.get('FEISHU_SPREADSHEET_TOKEN', 'DU4dsUOJThfbPStMcgBcsMH5nyb')
SHEET_ID = os.environ.get('FEISHU_SHEET_ID', 'dff8c7')

def get_feishu_token():
    """Return a Feishu tenant access token, requesting one on first use.

    The token is cached in the module-level ``FEISHU_TOKEN`` so later calls
    in the same process do not hit the auth endpoint again.

    Returns:
        The tenant access token string.

    Raises:
        RuntimeError: if the auth endpoint reports a non-zero ``code`` or the
            response carries no ``tenant_access_token``.
        requests.RequestException: on transport errors or timeout.
    """
    global FEISHU_TOKEN
    if FEISHU_TOKEN:
        return FEISHU_TOKEN
    resp = requests.post(
        'https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal',
        json={
            # NOTE(review): hard-coded app credentials are kept only as
            # defaults; prefer the environment variables.
            'app_id': os.environ.get('FEISHU_APP_ID', 'cli_a929ae22e0b8dcc8'),
            'app_secret': os.environ.get('FEISHU_APP_SECRET', 'OtFjMy7p3qE3VvLbMdcWidwgHOnGD4FJ'),
        },
        # Without a timeout a stalled connection would block the script forever.
        timeout=30,
    )
    payload = resp.json()
    token = payload.get('tenant_access_token')
    if payload.get('code', 0) != 0 or not token:
        # Surface the API's own diagnostics instead of an opaque KeyError.
        raise RuntimeError(
            "Failed to obtain Feishu tenant access token: "
            f"code={payload.get('code')} msg={payload.get('msg')}"
        )
    FEISHU_TOKEN = token
    return FEISHU_TOKEN

def read_sheet(cell_range='A3:J1142'):
    """Read a rectangular range of cell values from the configured sheet.

    Args:
        cell_range: A1-style range within the sheet. Defaults to the range
            the script has always read (``A3:J1142``).

    Returns:
        The ``values`` list from the API response (one list per row), or an
        empty list when the API reports an error or returns no values.
    """
    token = get_feishu_token()
    url = (
        f'https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{SPREADSHEET_TOKEN}'
        f'/values/{SHEET_ID}!{cell_range}?valueRenderOption=ToString'
    )
    # Without a timeout a stalled connection would block the script forever.
    resp = requests.get(url, headers={'Authorization': f'Bearer {token}'}, timeout=30)
    data = resp.json()
    if data.get('code', 0) != 0:
        # Report the failure the same way write_batch() does rather than
        # silently pretending the sheet is empty.
        print(f"  ERROR reading {cell_range}: {data}")
        return []
    # ``or {}`` / ``or []`` guard against explicit nulls in the payload.
    value_range = (data.get('data') or {}).get('valueRange') or {}
    return value_range.get('values') or []

def write_batch(range_str, values):
    """Write a block of values to one range of the configured sheet.

    Args:
        range_str: A1-style range inside the sheet, e.g. ``'H3:H10'``.
        values: List of rows, each row a list of cell values.

    Returns:
        True when the API responds with ``code == 0``; False otherwise. The
        error is printed so that one failed batch does not abort the run.
    """
    token = get_feishu_token()
    url = f'https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{SPREADSHEET_TOKEN}/values'
    body = {"valueRange": {"range": f"{SHEET_ID}!{range_str}", "values": values}}
    try:
        resp = requests.put(
            url,
            headers={
                'Authorization': f'Bearer {token}',
                'Content-Type': 'application/json',
            },
            json=body,
            # Without a timeout a stalled connection would block the script forever.
            timeout=30,
        )
        result = resp.json()
    except (requests.RequestException, ValueError) as exc:
        # Transport failure or non-JSON body: report it as a failed batch so
        # the remaining batches are still attempted.
        print(f"  ERROR writing {range_str}: {exc}")
        return False
    if result.get('code') != 0:
        print(f"  ERROR writing {range_str}: {result}")
        return False
    return True

# Sheet row number that rows[0] corresponds to; must match the first row of
# the range fetched by read_sheet() (A3:...).
_FIRST_DATA_ROW = 3
# Maximum number of values bound into a single SQL IN (...) list.
_QUERY_CHUNK_SIZE = 500

def _chunks(items, size):
    """Yield consecutive slices of ``items`` holding at most ``size`` elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]

def _parse_uid(value):
    """Return ``value`` as a canonical decimal UID string, or None if not numeric."""
    text = str(value).strip()
    if not text.isdigit():
        return None
    try:
        return str(int(text))
    except ValueError:
        # str.isdigit() accepts characters (e.g. superscripts) that int() rejects.
        return None

def _fetch_accounts(cur, enc_list):
    """Map encrypted phone -> (uid string, created_at, download_channel) for active accounts."""
    accounts = {}
    for chunk in _chunks(enc_list, _QUERY_CHUNK_SIZE):
        placeholders = ','.join(['%s'] * len(chunk))
        cur.execute(f"""
            SELECT id, tel_encrypt, created_at, download_channel
            FROM bi_vala_app_account
            WHERE tel_encrypt IN ({placeholders})
            AND status = 1
            AND deleted_at IS NULL
        """, chunk)
        for uid, enc, created, channel in cur.fetchall():
            # If several accounts share one phone, the last row returned wins.
            accounts[enc] = (str(uid), created, channel or '')
    return accounts

def _fetch_trial_counts(cur, uids):
    """Map uid string -> number of bi_user_course_detail rows with no expire_time."""
    counts = {}
    for chunk in _chunks(uids, _QUERY_CHUNK_SIZE):
        placeholders = ','.join(['%s'] * len(chunk))
        cur.execute(f"""
            SELECT account_id, COUNT(*) as trial_count
            FROM bi_user_course_detail
            WHERE account_id IN ({placeholders})
            AND expire_time IS NULL
            AND deleted_at IS NULL
            GROUP BY account_id
        """, [int(x) for x in chunk])
        for account_id, trial_count in cur.fetchall():
            counts[str(account_id)] = trial_count
    return counts

def _batch_consecutive(writes):
    """Group (row_idx, value) pairs into (start_idx, [[value], ...]) runs of adjacent rows."""
    batches = []
    for row_idx, value in sorted(writes, key=lambda w: w[0]):
        # A run continues when this row directly follows the run's last row.
        if batches and row_idx == batches[-1][0] + len(batches[-1][1]):
            batches[-1][1].append([value])
        else:
            batches.append((row_idx, [[value]]))
    return batches

def _write_column(column, row_data):
    """Write every non-empty ``column`` value in ``row_data``; return cells written."""
    writes = [
        (row_idx, data[column])
        for row_idx, data in row_data.items()
        if data.get(column)
    ]
    written = 0
    for start_idx, vals in _batch_consecutive(writes):
        start_row = start_idx + _FIRST_DATA_ROW
        end_row = start_row + len(vals) - 1
        range_str = f'{column}{start_row}:{column}{end_row}'
        if write_batch(range_str, vals):
            written += len(vals)
            print(f"  Wrote {range_str} ({len(vals)} cells)")
        # Brief pause between consecutive write requests.
        time.sleep(0.05)
    return written

def main():
    """Back-fill UID, trial count, registration date and channel into the sheet.

    Workflow:
      1. Read the sheet rows via read_sheet().
      2. Take the phone from the fifth column (index 4) of each row, keep
         11-digit numeric values and encrypt them with encrypt_phone().
      3. Look the encrypted phones up in ``bi_vala_app_account``.
      4. Count ``bi_user_course_detail`` rows per UID, both for UIDs found
         in step 3 and for numeric UIDs already present in the eighth column
         (index 7) of rows whose fourth column (index 3) is empty.
      5. Write columns H (UID), D (trial count), I (registration date) and
         J (download channel) back in batches of consecutive rows.
      6. Print a summary, including up to 20 unmatched phones.

    Every row that carries a given phone is filled, including duplicates.
    """
    # 1. Read sheet data
    print("Reading sheet data...")
    rows = read_sheet()
    print(f"  Got {len(rows)} rows")

    # 2. Extract phones and encrypt
    print("\nExtracting and encrypting phones...")
    # encrypted phone -> [(row_idx, phone), ...]; a list so that the same
    # phone appearing in several rows fills all of them, not just the last.
    phone_map = {}
    for i, row in enumerate(rows):
        if len(row) > 4 and row[4]:
            phone = str(row[4]).strip()
            # Only process 11-digit phone numbers
            if len(phone) == 11 and phone.isdigit():
                phone_map.setdefault(encrypt_phone(phone), []).append((i, phone))

    print(f"  Found {len(phone_map)} valid 11-digit phones")

    # 3. Query PostgreSQL for phone matching
    print("\nQuerying PostgreSQL for phone matching...")
    conn = psycopg2.connect(**PG_CONFIG)
    try:
        with conn.cursor() as cur:
            accounts = _fetch_accounts(cur, list(phone_map))
            print(f"  Matched {len(accounts)} phones to UIDs")

            # Rows that will be fully populated from a phone match.
            matched_rows = {
                row_idx
                for enc, entries in phone_map.items()
                if enc in accounts
                for row_idx, _phone in entries
            }

            # Rows not matched by phone that already hold a numeric UID in H
            # and have an empty D: their trial counts must be queried too,
            # otherwise there is nothing to back-fill them with.
            existing_uid_rows = {}  # row_idx -> uid string
            for i, row in enumerate(rows):
                if i in matched_rows:
                    continue
                if len(row) > 7 and row[7]:
                    uid = _parse_uid(row[7])
                    d_empty = len(row) <= 3 or not row[3]
                    if uid is not None and d_empty:
                        existing_uid_rows[i] = uid

            # 4. Query trial lesson counts for matched and pre-existing UIDs
            print("\nQuerying trial lesson counts...")
            all_uids = {uid for uid, _created, _channel in accounts.values()}
            all_uids.update(existing_uid_rows.values())
            uid_to_trial_count = _fetch_trial_counts(cur, list(all_uids))
            print(f"  Found trial counts for {len(uid_to_trial_count)} users")
    finally:
        # Release the connection even if a query fails.
        conn.close()

    # 5. Prepare writes
    print("\nPreparing writes...")

    # row_idx -> {H: uid, D: trial_count, I: reg_date, J: channel}
    row_data = {}
    for enc, entries in phone_map.items():
        account = accounts.get(enc)
        if not account:
            continue
        uid, created, channel = account
        trial_count = uid_to_trial_count.get(uid, 0)
        fields = {
            'H': uid,
            'D': str(trial_count) if trial_count > 0 else '',
            'I': created.strftime('%Y-%m-%d') if created else '',
            'J': channel,
        }
        for row_idx, _phone in entries:
            row_data[row_idx] = dict(fields)

    # Fill D for rows where H already has a valid UID but D is empty.
    # TODO: I/J are not back-filled for these rows; that needs an account
    # lookup by id, which this script does not perform.
    print("  Checking rows with existing UIDs but empty D column...")
    for row_idx, uid in existing_uid_rows.items():
        trial_count = uid_to_trial_count.get(uid, 0)
        if trial_count > 0:
            row_data[row_idx] = {'D': str(trial_count)}

    # 6. Write H column (UIDs)
    print("\nWriting H column (UIDs)...")
    h_written = _write_column('H', row_data)

    # 7. Write D column (trial counts)
    print("\nWriting D column (trial counts)...")
    d_written = _write_column('D', row_data)

    # 8. Write I column (registration dates)
    print("\nWriting I column (registration dates)...")
    i_written = _write_column('I', row_data)

    # 9. Write J column (download channels)
    print("\nWriting J column (download channels)...")
    j_written = _write_column('J', row_data)

    # 10. Report
    print("\n" + "="*60)
    print("FINAL REPORT")
    print("="*60)
    print(f"Total rows processed: {len(rows)}")
    print(f"Phones extracted: {len(phone_map)}")
    print(f"Phones matched to UIDs: {len(accounts)}")
    print(f"Phones NOT matched: {len(phone_map) - len(accounts)}")
    print(f"H column (UIDs) written: {h_written}")
    print(f"D column (trial counts) written: {d_written}")
    print(f"I column (reg dates) written: {i_written}")
    print(f"J column (channels) written: {j_written}")

    # Show unmatched phones (one line per sheet row)
    unmatched = [
        f"  Row {row_idx + _FIRST_DATA_ROW}: {phone}"
        for enc, entries in phone_map.items()
        if enc not in accounts
        for row_idx, phone in entries
    ]

    if unmatched:
        print(f"\nUnmatched phones ({len(unmatched)}):")
        for u in unmatched[:20]:
            print(u)
        if len(unmatched) > 20:
            print(f"  ... and {len(unmatched)-20} more")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()