# ai_member_xiaoxi/scripts/update_xiaolong_sheet.py

#!/usr/bin/env python3
"""Update the 小龙 sheet with UID, trial count, registration date, and download channel."""

import json
import os
import re
import sys
import time

import mysql.connector
import psycopg2
import requests

# Connection settings.
#
# Every value can be overridden with the XIAOLONG_* environment variable named
# next to it; the literals are only fallbacks, so behaviour is unchanged when
# no variable is set.
# NOTE(review): the fallbacks are live-looking credentials committed to source.
# Rotate them and supply the real values through the environment.
FEISHU_TOKEN = os.environ.get(
    "XIAOLONG_FEISHU_TOKEN", "t-g10464c0UK5L67JVXSDDT3EWM4DPLSDY5C7R7NS6"
)
SPREADSHEET_TOKEN = os.environ.get(
    "XIAOLONG_SPREADSHEET_TOKEN", "DU4dsUOJThfbPStMcgBcsMH5nyb"
)
SHEET_ID = os.environ.get("XIAOLONG_SHEET_ID", "dff8c7")

# DB connections
# Keyword arguments passed straight to mysql.connector.connect().
MYSQL_CONFIG = {
    "host": os.environ.get("XIAOLONG_MYSQL_HOST", "bj-cdb-dh2fkqa0.sql.tencentcdb.com"),
    "port": int(os.environ.get("XIAOLONG_MYSQL_PORT", "27751")),
    "user": os.environ.get("XIAOLONG_MYSQL_USER", "read_only"),
    "password": os.environ.get("XIAOLONG_MYSQL_PASSWORD", "fsdo45ijfmfmuu77$%^&"),
    "database": os.environ.get("XIAOLONG_MYSQL_DATABASE", "vala_user"),
}

# Keyword arguments passed straight to psycopg2.connect().
PG_CONFIG = {
    "host": os.environ.get("XIAOLONG_PG_HOST", "bj-postgres-16pob4sg.sql.tencentcdb.com"),
    "port": int(os.environ.get("XIAOLONG_PG_PORT", "28591")),
    "user": os.environ.get("XIAOLONG_PG_USER", "ai_member"),
    "password": os.environ.get("XIAOLONG_PG_PASSWORD", "LdfjdjL83h3h3^$&**YGG*"),
    "database": os.environ.get("XIAOLONG_PG_DATABASE", "vala_bi"),
}

def read_sheet(timeout=30):
    """Read the data rows of the sheet through the Feishu Sheets API.

    Requests range A3:J1142 of SHEET_ID with ``valueRenderOption=ToString``.

    Args:
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block the script indefinitely.

    Returns:
        The ``values`` list of the response's ``valueRange`` (one list per
        sheet row), or an empty list when the response carries no values.

    Exits the process with status 1 when the request fails, the response
    body is not JSON, or the API reports a non-zero ``code``.
    """
    url = f"https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{SPREADSHEET_TOKEN}/values/{SHEET_ID}!A3:J1142?valueRenderOption=ToString"
    headers = {"Authorization": f"Bearer {FEISHU_TOKEN}"}
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        data = r.json()
    except (requests.RequestException, ValueError) as exc:
        # Transport failure or a non-JSON body (e.g. a gateway error page).
        print(f"Error reading sheet: {exc}")
        sys.exit(1)
    if data.get("code") != 0:
        print(f"Error reading sheet: {data}")
        sys.exit(1)
    # Any level of the payload may be absent or null; fall back to "no rows"
    # instead of raising KeyError/TypeError or returning None.
    value_range = (data.get("data") or {}).get("valueRange") or {}
    return value_range.get("values") or []

def write_batch(rows_data):
    """Placeholder for a batched sheet write; currently does nothing.

    rows_data is documented as a list of (row_num, col, value) tuples. The
    argument is accepted but ignored, and the function returns None.
    """
    # Not implemented yet. The intended approach is to group the updates by
    # row and write contiguous ranges column by column.
    return None

def write_range(start_row, end_row, col_letter, values, timeout=30):
    """Write values into one column range of the sheet.

    Args:
        start_row: First sheet row to write (1-based).
        end_row: Last sheet row to write (1-based, inclusive).
        col_letter: Column letter, e.g. "H".
        values: One single-item list per row, e.g. [["a"], ["b"]].
        timeout: Seconds to wait for the HTTP response.

    Returns:
        True when the API reports ``code == 0``. False when the request
        fails, the response is not JSON, or the API reports an error; the
        problem is printed in each of those cases.
    """
    # start_row and end_row are 1-based sheet row numbers
    range_str = f"{SHEET_ID}!{col_letter}{start_row}:{col_letter}{end_row}"
    url = f"https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{SPREADSHEET_TOKEN}/values"
    headers = {
        "Authorization": f"Bearer {FEISHU_TOKEN}",
        "Content-Type": "application/json",
    }
    body = {
        "valueRange": {
            "range": range_str,
            "values": values,
        }
    }
    try:
        r = requests.put(url, headers=headers, json=body, timeout=timeout)
        data = r.json()
    except (requests.RequestException, ValueError) as exc:
        # Report transport/decoding failures the same way as API errors so a
        # single bad request does not abort the remaining writes.
        print(f"Error writing {range_str}: {exc}")
        return False
    if data.get("code") != 0:
        print(f"Error writing {range_str}: {data}")
        return False
    print(f"  Wrote {range_str}: {len(values)} cells")
    return True

# Exactly 11 ASCII digits: the only phone shape that is looked up in MySQL.
_PHONE_RE = re.compile(r"[0-9]{11}")
# An H-column value made only of ASCII digits is treated as an existing UID.
# ASCII-only on purpose: str.isdigit() also accepts characters such as
# full-width digits, which PostgreSQL cannot compare against a numeric id.
_UID_RE = re.compile(r"[0-9]+")
# H-column marker meaning "no account yet"; such rows are looked up again.
_UNREGISTERED = "未注册"
# read_sheet() requests the range starting at A3, so rows[0] is sheet row 3.
_FIRST_DATA_ROW = 3
# 0-based column index -> sheet column letter for the columns that get written.
_COLUMN_LETTERS = {3: "D", 7: "H", 8: "I", 9: "J"}


def _cell_text(row, index):
    """Return the stripped text of row[index], or "" if it is missing or None."""
    if index >= len(row):
        return ""
    value = row[index]
    if value is None:
        return ""
    return str(value).strip()


def _needs_uid(h_val):
    """Return True when the H cell is empty or holds the unregistered marker."""
    return not h_val or h_val == _UNREGISTERED


def _chunks(items, size):
    """Yield consecutive slices of ``items`` holding at most ``size`` elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def _lookup_uids_by_phone(phones, chunk_size=50):
    """Map phones to account ids by matching first-3/last-4 digits in MySQL.

    The tel column is matched on its first three and last four characters
    only, so a match is a best guess: several accounts can share one pattern.
    Every phone sharing a pattern receives the same UID, which keeps the
    result independent of how phones fall into chunks, and a warning is
    printed when a pattern matched more than one account.

    Args:
        phones: Distinct 11-digit phone strings.
        chunk_size: Number of phones covered by one query.

    Returns:
        dict of phone -> UID string for the phones that matched.
    """
    phone_to_uid = {}
    if not phones:
        return phone_to_uid

    uids_per_mask = {}  # (first3, last4) -> set of UIDs matched
    mysql_conn = mysql.connector.connect(**MYSQL_CONFIG)
    try:
        mysql_cur = mysql_conn.cursor()
        try:
            for chunk in _chunks(phones, chunk_size):
                phones_by_mask = {}
                for phone in chunk:
                    phones_by_mask.setdefault((phone[:3], phone[-4:]), []).append(phone)

                # Patterns are bound as parameters, never spliced into the SQL.
                patterns = tuple(f"{first3}%{last4}" for first3, last4 in phones_by_mask)
                conditions = " OR ".join(["tel LIKE %s"] * len(patterns))
                query = f"SELECT id, tel FROM vala_app_account WHERE ({conditions}) AND deleted_at IS NULL"
                mysql_cur.execute(query, patterns)
                for uid, tel in mysql_cur.fetchall():
                    if isinstance(tel, (bytes, bytearray)):
                        tel = tel.decode("utf-8", "replace")
                    tel = str(tel or "")
                    # tel is masked like "138****3774", match by first3+last4
                    mask = (tel[:3], tel[-4:])
                    matched_phones = phones_by_mask.get(mask)
                    if not matched_phones:
                        continue
                    for phone in matched_phones:
                        phone_to_uid[phone] = str(uid)
                    uids_per_mask.setdefault(mask, set()).add(str(uid))

                time.sleep(0.05)
        finally:
            mysql_cur.close()
    finally:
        # Close the connection even when a query raises.
        mysql_conn.close()

    ambiguous = sum(1 for uids in uids_per_mask.values() if len(uids) > 1)
    if ambiguous:
        print(
            f"WARNING: {ambiguous} phone pattern(s) matched more than one account; "
            "the UID chosen for them may be wrong"
        )
    return phone_to_uid


def _fetch_account_stats(uids, chunk_size=100):
    """Fetch registration info and trial counts for ``uids`` from PostgreSQL.

    Args:
        uids: Iterable of UID strings.
        chunk_size: Number of UIDs per ``IN (...)`` query.

    Returns:
        A ``(uid_reg_info, uid_trial_count)`` tuple. ``uid_reg_info`` maps
        UID -> {"created_at": str, "download_channel": str} and
        ``uid_trial_count`` maps UID -> row count from bi_user_course_detail.
        UIDs without matching rows are absent from the respective dict.
    """
    uid_reg_info = {}  # uid -> {created_at, download_channel}
    uid_trial_count = {}  # uid -> trial_count
    uid_list = sorted(uids)
    if not uid_list:
        return uid_reg_info, uid_trial_count

    pg_conn = psycopg2.connect(**PG_CONFIG)
    try:
        pg_cur = pg_conn.cursor()
        try:
            # Query bi_vala_app_account for reg info
            for chunk in _chunks(uid_list, chunk_size):
                placeholders = ",".join(["%s"] * len(chunk))
                query = f"SELECT id, created_at::date, download_channel FROM bi_vala_app_account WHERE id IN ({placeholders}) AND status=1 AND deleted_at IS NULL"
                pg_cur.execute(query, chunk)
                for uid, created_at, download_channel in pg_cur.fetchall():
                    uid_reg_info[str(uid)] = {
                        "created_at": str(created_at) if created_at else "",
                        "download_channel": download_channel or "",
                    }
                time.sleep(0.05)

            # Query bi_user_course_detail for trial count
            for chunk in _chunks(uid_list, chunk_size):
                placeholders = ",".join(["%s"] * len(chunk))
                query = f"SELECT account_id, COUNT(*) FROM bi_user_course_detail WHERE account_id IN ({placeholders}) AND expire_time IS NULL AND deleted_at IS NULL GROUP BY account_id"
                pg_cur.execute(query, chunk)
                for uid, cnt in pg_cur.fetchall():
                    uid_trial_count[str(uid)] = cnt
                time.sleep(0.05)
        finally:
            pg_cur.close()
    finally:
        # Close the connection even when a query raises.
        pg_conn.close()

    return uid_reg_info, uid_trial_count


def _build_updates(parsed_rows, phone_to_uid, uid_reg_info, uid_trial_count):
    """Decide which cells to write for every row.

    Args:
        parsed_rows: (row_num, phone, h_val) tuples in ascending row order.
        phone_to_uid: phone -> UID mapping from the MySQL lookup.
        uid_reg_info: UID -> {"created_at", "download_channel"}.
        uid_trial_count: UID -> trial count.

    Returns:
        dict of column index (3=D, 7=H, 8=I, 9=J) -> list of
        (row_num, value), each list in ascending row order.
    """
    col_updates = {col_idx: [] for col_idx in _COLUMN_LETTERS}
    for row_num, phone, h_val in parsed_rows:
        if _UID_RE.fullmatch(h_val):
            # A UID already present in H wins over the pattern-based phone
            # guess, so D/I/J always describe the UID shown in the row.
            uid = h_val
        elif _PHONE_RE.fullmatch(phone) and phone in phone_to_uid:
            uid = phone_to_uid[phone]
            # H column: write UID if phone was matched and H was empty
            if _needs_uid(h_val):
                col_updates[7].append((row_num, uid))
        else:
            continue

        # D, I, J columns: write if we have data
        if uid in uid_trial_count:
            col_updates[3].append((row_num, str(uid_trial_count[uid])))
        info = uid_reg_info.get(uid)
        if info:
            if info["created_at"]:
                col_updates[8].append((row_num, info["created_at"]))
            if info["download_channel"]:
                col_updates[9].append((row_num, info["download_channel"]))
    return col_updates


def _contiguous_runs(items):
    """Group row-sorted (row_num, value) pairs into runs of consecutive rows.

    Yields (start_row, end_row, values), where ``values`` holds one
    single-item list per row — the shape write_range() expects.
    """
    start_row = prev_row = None
    values = []
    for row_num, value in items:
        if values and row_num == prev_row + 1:
            values.append([value])
        else:
            if values:
                yield start_row, prev_row, values
            start_row = row_num
            values = [[value]]
        prev_row = row_num
    if values:
        yield start_row, prev_row, values


def main():
    """Fill in UID, trial count, registration date and channel in the sheet.

    Steps:
      1. Read the sheet; collect UIDs already in column H and the phones of
         rows whose H cell is empty or marked unregistered.
      2. Look those phones up in MySQL to obtain UIDs.
      3. Fetch registration date, download channel and trial count for all
         UIDs from PostgreSQL.
      4. Write H (newly found UIDs), D (trial count), I (registration date)
         and J (download channel), batching consecutive rows per column.
      5. Print a summary counting the cells that were actually written.
    """
    print("Reading sheet data...")
    rows = read_sheet() or []
    print(f"Got {len(rows)} rows")

    # Step 1: Identify rows needing phone lookup
    # Columns: A=0(sheet), B=1(name), C=2(date), D=3(trial_count), E=4(phone), F=5(grade), G=6(notes), H=7(UID), I=8(reg_date), J=9(channel)
    parsed_rows = []  # (row_num, phone, h_val)
    lookup_phones = []  # one entry per row that needs a lookup
    existing_uids = set()  # UIDs already in H column

    for offset, row in enumerate(rows):
        cells = row or []
        phone = _cell_text(cells, 4)
        h_val = _cell_text(cells, 7)
        parsed_rows.append((offset + _FIRST_DATA_ROW, phone, h_val))

        if _UID_RE.fullmatch(h_val):
            existing_uids.add(h_val)
        if _PHONE_RE.fullmatch(phone) and _needs_uid(h_val):
            lookup_phones.append(phone)

    print(f"Rows with 11-digit phone and empty H: {len(lookup_phones)}")
    print(f"Existing UIDs in H column: {len(existing_uids)}")

    # Step 2: Query MySQL for phone -> UID mapping
    phone_to_uid = {}
    if lookup_phones:
        # The same phone can appear on several rows; query each one once.
        phone_to_uid = _lookup_uids_by_phone(list(dict.fromkeys(lookup_phones)))
        print(f"Phone -> UID matches found: {len(phone_to_uid)}")

    # Step 3: Collect all UIDs to query PostgreSQL
    all_uids = existing_uids | set(phone_to_uid.values())
    print(f"Total unique UIDs to query: {len(all_uids)}")

    # Step 4: Query PostgreSQL for registration info and trial count
    uid_reg_info, uid_trial_count = _fetch_account_stats(all_uids)
    print(f"UIDs with reg info: {len(uid_reg_info)}")
    print(f"UIDs with trial count: {len(uid_trial_count)}")

    # Step 5: Build the update data
    col_updates = _build_updates(parsed_rows, phone_to_uid, uid_reg_info, uid_trial_count)
    print(f"Total updates to write: {sum(len(items) for items in col_updates.values())}")

    # Step 6: Write updates in batches of consecutive rows
    written = {col_idx: 0 for col_idx in _COLUMN_LETTERS}
    failed = 0
    for col_idx in (7, 3, 8, 9):  # Write H first, then D, I, J
        col_letter = _COLUMN_LETTERS[col_idx]
        for start_row, end_row, values in _contiguous_runs(col_updates[col_idx]):
            if write_range(start_row, end_row, col_letter, values):
                written[col_idx] += len(values)
            else:
                failed += len(values)
            time.sleep(0.05)

    # Summary: report cells that were really written, not merely queued.
    print("\n=== SUMMARY ===")
    print(f"Phones matched in MySQL: {len(phone_to_uid)}")
    print(f"H column (UID) written: {written[7]}")
    print(f"D column (trial count) written: {written[3]}")
    print(f"I column (reg date) written: {written[8]}")
    print(f"J column (channel) written: {written[9]}")
    print(f"Total cells written: {sum(written.values())}")
    if failed:
        print(f"Cells that failed to write: {failed}")

# Run the sheet update only when this file is executed as a script, so that
# importing the module has no side effects beyond defining its names.
if __name__ == "__main__":
    main()