ai_member_xiaoxi/scripts/process_wudi_sheet.py

#!/usr/bin/env python3
"""
Process 吴迪 sheet: match phones via XXTEA encryption, fill H/D/I/J columns.
"""
import sys
import json
import time
import urllib.request
import urllib.error

# Import phone encryption
sys.path.insert(0, '/root/.openclaw/workspace/scripts')
from phone_encrypt import encrypt_phone

# --- Config ---
# The Feishu token is taken from the first CLI argument and is sent as the
# Bearer credential on every Feishu request; the script refuses to run without it.
FEISHU_TOKEN = sys.argv[1] if len(sys.argv) > 1 else None
if not FEISHU_TOKEN:
    print("Usage: python3 process_wudi_sheet.py <FEISHU_TOKEN>")
    sys.exit(1)

# Spreadsheet and sheet identifiers interpolated into the Feishu URLs/ranges.
SPREADSHEET_TOKEN = "NoZqsFi47hIOHEt9j8WcfRtbnug"
SHEET_ID = "f975f0"

# PostgreSQL config
# Every setting can be overridden with a VALA_BI_PG_* environment variable.
# The literals are only fallbacks, kept so existing invocations keep working.
# NOTE(review): the fallback password is a credential committed to source
# control - rotate it and remove the literal once VALA_BI_PG_PASSWORD is set
# wherever this script runs.
import os
import psycopg2
PG_CONFIG = {
    "host": os.environ.get("VALA_BI_PG_HOST", "bj-postgres-16pob4sg.sql.tencentcdb.com"),
    "port": int(os.environ.get("VALA_BI_PG_PORT", "28591")),
    "user": os.environ.get("VALA_BI_PG_USER", "ai_member"),
    "password": os.environ.get("VALA_BI_PG_PASSWORD", "LdfjdjL83h3h3^$&**YGG*"),
    "database": os.environ.get("VALA_BI_PG_DATABASE", "vala_bi"),
}

# --- Step 1: Read all sheet data ---
def feishu_get(url, timeout=60):
    """Send an authenticated GET request and return the decoded JSON body.

    Args:
        url: Full request URL.
        timeout: Socket timeout in seconds. Without one, ``urlopen`` can
            block indefinitely on a stalled connection.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        urllib.error.URLError: On connection failure (``HTTPError`` for a
            non-2xx status).
        OSError: If a socket operation exceeds ``timeout``.
        ValueError: If the response body is not valid JSON.
    """
    req = urllib.request.Request(
        url,
        headers={
            "Authorization": f"Bearer {FEISHU_TOKEN}",
            "Content-Type": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())

def feishu_put(url, body, timeout=60):
    """Send an authenticated JSON PUT request and return the decoded JSON body.

    Args:
        url: Full request URL.
        body: JSON-serialisable payload; it is encoded with ``json.dumps``.
        timeout: Socket timeout in seconds. Without one, ``urlopen`` can
            block indefinitely on a stalled connection.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        urllib.error.URLError: On connection failure (``HTTPError`` for a
            non-2xx status).
        OSError: If a socket operation exceeds ``timeout``.
        ValueError: If the response body is not valid JSON.
    """
    payload = json.dumps(body).encode()
    req = urllib.request.Request(
        url,
        data=payload,
        method="PUT",
        headers={
            "Authorization": f"Bearer {FEISHU_TOKEN}",
            "Content-Type": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())

print("Step 1: Reading sheet data...")
# The range is fixed: columns A-J, sheet rows 3 through 8016, rendered as strings.
url = f"https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{SPREADSHEET_TOKEN}/values/{SHEET_ID}!A3:J8016?valueRenderOption=ToString"
result = feishu_get(url)

# Fail fast on an API-level error (same `code == 0` convention write_batch
# checks). Otherwise a rejected read looks like an empty sheet and the script
# would finish "successfully" having done nothing.
code = result.get("code", -1)
if code != 0:
    print(f"  ERROR reading sheet: code={code} msg={result.get('msg','')}")
    sys.exit(1)

# `or` guards cover keys that are present but null.
rows = ((result.get("data") or {}).get("valueRange") or {}).get("values") or []
print(f"Read {len(rows)} rows")

# Parse rows into dicts keyed by column letter. Every cell becomes a stripped
# string; empty/falsy cells become ''. The first returned row is sheet row 3.
parsed = []
for excel_row, row in enumerate(rows, start=3):
    cells = list(row or [])
    # Rows may come back shorter than 10 cells; pad so A-J always exist.
    cells += [''] * (10 - len(cells))
    record = {"excel_row": excel_row}
    for letter, cell in zip("ABCDEFGHIJ", cells):
        record[letter] = str(cell).strip() if cell else ''
    parsed.append(record)

# --- Step 2: Identify rows needing processing ---
# Case A: column E holds an 11-digit phone and column H is not an all-digit
#         UID -> the phone has to be matched to an account.
# Case B: column H already holds an all-digit UID and column D is empty
#         -> D/I/J have to be filled from the DB.
# The two cases are mutually exclusive because they disagree on column H.
# NOTE(review): str.isdigit() also accepts characters such as superscript
# digits that int() rejects; UIDs collected here are later passed to int().

phones_to_match = []  # (excel_row, phone)
rows_need_dij = []    # (excel_row, uid)

for record in parsed:
    row_no = record["excel_row"]
    phone_cell = record["E"]
    uid_cell = record["H"]

    if uid_cell.isdigit():
        # Case B: numeric UID present; only queue it when D is still blank.
        if not record["D"]:
            rows_need_dij.append((row_no, uid_cell))
    elif len(phone_cell) == 11 and phone_cell.isdigit():
        # Case A: usable phone, no numeric UID yet.
        phones_to_match.append((row_no, phone_cell))

print(f"\nStep 2: Analysis")
print(f"  Phones to match (H empty): {len(phones_to_match)}")
print(f"  Rows with UID but D empty: {len(rows_need_dij)}")

# --- Step 3: Encrypt phones and query PostgreSQL ---
print("\nStep 3: Encrypting phones and querying DB...")

# Encrypt each distinct phone once; the same phone can appear on several rows.
phone_to_enc = {}
for excel_row, phone in phones_to_match:
    if phone not in phone_to_enc:
        phone_to_enc[phone] = encrypt_phone(phone)

# Build lookup: enc -> phone
enc_to_phone = {enc: phone for phone, enc in phone_to_enc.items()}

# The connection and cursor are left open on purpose: the D/I/J lookups that
# follow this step reuse `cur` and close both when they are done.
conn = psycopg2.connect(**PG_CONFIG)
cur = conn.cursor()

# Resolve encrypted phones to account IDs in a single pass, 500 values per
# IN (...) list. (An earlier duplicate pass ran every query twice and logged
# len(cur.fetchall()) on an already-consumed cursor, which is always 0.)
enc_list = list(enc_to_phone)
phone_matches = {}  # phone -> account_id (as str)
batch_size = 500
for batch_start in range(0, len(enc_list), batch_size):
    batch = enc_list[batch_start:batch_start + batch_size]
    # Only the "%s" markers are interpolated; values are passed as parameters.
    placeholders = ','.join(['%s'] * len(batch))
    cur.execute(
        f"SELECT id, tel_encrypt FROM bi_vala_app_account WHERE tel_encrypt IN ({placeholders}) AND status=1 AND deleted_at IS NULL",
        batch
    )
    results = cur.fetchall()
    for account_id, tel_enc in results:
        phone = enc_to_phone.get(tel_enc)
        if phone:
            phone_matches[phone] = str(account_id)
    print(f"  Batch {batch_start//batch_size + 1}: {len(results)} results from {len(batch)} phones")

print(f"  Total phone matches: {len(phone_matches)}")

# Collect every UID whose D/I/J values are needed: UIDs just resolved from
# phones first, then UIDs that were already present in column H.
all_uids = {int(matched_uid) for matched_uid in phone_matches.values()}
all_uids.update(int(existing_uid) for _, existing_uid in rows_need_dij)

print(f"\n  Fetching D/I/J for {len(all_uids)} unique UIDs...")

uid_info = {}  # uid -> {d_count, created_at, download_channel}

if all_uids:
    uid_list = list(all_uids)
    # Both lookups below walk the same 500-UID slices.
    uid_batches = [uid_list[offset:offset + 500] for offset in range(0, len(uid_list), 500)]

    # Column D: number of bi_user_course_detail rows per account that have
    # no expire_time and are not soft-deleted.
    for batch in uid_batches:
        placeholders = ','.join(['%s'] * len(batch))
        cur.execute(
            f"SELECT account_id, COUNT(*) FROM bi_user_course_detail WHERE account_id IN ({placeholders}) AND expire_time IS NULL AND deleted_at IS NULL GROUP BY account_id",
            batch
        )
        for account_id, lesson_count in cur.fetchall():
            uid_info.setdefault(account_id, {})['d_count'] = lesson_count

    # Columns I and J: registration date and download channel, limited to
    # accounts with status=1 that are not soft-deleted.
    for batch in uid_batches:
        placeholders = ','.join(['%s'] * len(batch))
        cur.execute(
            f"SELECT id, created_at, download_channel FROM bi_vala_app_account WHERE id IN ({placeholders}) AND status=1 AND deleted_at IS NULL",
            batch
        )
        for account_id, registered_at, channel in cur.fetchall():
            entry = uid_info.setdefault(account_id, {})
            # Keep only the date part; a NULL timestamp becomes ''.
            entry['created_at'] = str(registered_at.date()) if registered_at else ''
            entry['download_channel'] = channel or ''

# All DB work is finished; release the cursor and the connection.
cur.close()
conn.close()

print(f"  Fetched info for {len(uid_info)} UIDs")

# --- Step 4: Prepare writes ---
print("\nStep 4: Preparing writes...")

# excel_row -> {column letter: value} for every cell that will be written.
writes = {}

# Column H: the UID resolved for each phone that found an account.
for row_no, phone_no in phones_to_match:
    matched_uid = phone_matches.get(phone_no)
    if matched_uid:
        writes.setdefault(row_no, {})['H'] = matched_uid

# Rows that receive D/I/J: rows that already had a UID, then rows whose UID
# was just resolved from the phone.
rows_for_dij = {row_no: int(uid_text) for row_no, uid_text in rows_need_dij}
for row_no, phone_no in phones_to_match:
    matched_uid = phone_matches.get(phone_no)
    if matched_uid:
        rows_for_dij[row_no] = int(matched_uid)

for row_no, account_uid in rows_for_dij.items():
    details = uid_info.get(account_uid, {})
    cells = writes.setdefault(row_no, {})

    # D is always written; a UID with no counted rows gets '0'.
    lesson_count = details.get('d_count', 0)
    cells['D'] = '0' if not lesson_count > 0 else str(lesson_count)

    # I and J are written only when the DB returned a non-empty value.
    reg_date = details.get('created_at', '')
    if reg_date:
        cells['I'] = reg_date

    channel = details.get('download_channel', '')
    if channel:
        cells['J'] = channel

print(f"  Total rows to write: {len(writes)}")

# --- Step 5: Execute writes in batches ---
print("\nStep 5: Writing to sheet...")


def _cells_for(column):
    """Return (excel_row, value) pairs for one column, in ascending row order."""
    return [
        (row_no, writes[row_no][column])
        for row_no in sorted(writes)
        if column in writes[row_no]
    ]


# One ascending list per target column; the order lets write_batch merge
# adjacent rows into a single range.
h_writes = _cells_for('H')
d_writes = _cells_for('D')
i_writes = _cells_for('I')
j_writes = _cells_for('J')

def write_batch(col_letter, row_values, col_name, max_rows=5000):
    """Write one column's values to the sheet, one request per run of adjacent rows.

    Consecutive sheet rows are merged into a single range write. A run is
    also cut once it reaches ``max_rows`` so that a long unbroken run is not
    sent as one oversized request (Feishu documents a per-request row limit
    for this endpoint; the default follows it).

    Failures are best-effort: a non-zero API ``code`` or any exception raised
    by the request is printed and the remaining runs are still attempted.

    Args:
        col_letter: Column letter used to build the range (e.g. ``'H'``).
        row_values: ``(excel_row, value)`` pairs sorted by ascending row.
        col_name: Label used in the progress output.
        max_rows: Maximum number of rows sent in one request.

    Returns:
        The number of cells whose write request reported ``code == 0``.
    """
    if not row_values:
        print(f"  {col_name} ({col_letter}): nothing to write")
        return 0

    # Loop-invariant: every range write goes to the same endpoint.
    url = f"https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{SPREADSHEET_TOKEN}/values"

    written = 0
    total = len(row_values)
    start = 0
    while start < total:
        # Extend the run while rows stay adjacent and the cap is not reached.
        # `end` always advances past `start`, so the loop terminates even for
        # a non-positive max_rows (it then degrades to single-cell writes).
        end = start + 1
        while (
            end < total
            and end - start < max_rows
            and row_values[end][0] == row_values[end - 1][0] + 1
        ):
            end += 1

        batch = row_values[start:end]
        start_row = batch[0][0]
        end_row = batch[-1][0]
        # A single-column range expects one single-element row per cell.
        values = [[value] for _, value in batch]

        range_str = f"{SHEET_ID}!{col_letter}{start_row}:{col_letter}{end_row}"
        body = {"valueRange": {"range": range_str, "values": values}}

        try:
            result = feishu_put(url, body)
            code = result.get("code", -1)
            if code == 0:
                written += len(batch)
                print(f"  {col_name} rows {start_row}-{end_row}: OK ({len(batch)} cells)")
            else:
                print(f"  {col_name} rows {start_row}-{end_row}: ERROR code={code} msg={result.get('msg','')}")
        except Exception as e:
            # Deliberately broad: one failed range must not abort the others.
            print(f"  {col_name} rows {start_row}-{end_row}: EXCEPTION {e}")

        # Short pause between requests.
        time.sleep(0.05)
        start = end

    return written

# Push each column in turn (H, D, I, J) and keep the per-column success counts.
h_count, d_count, i_count, j_count = [
    write_batch(letter, cells, label)
    for letter, cells, label in (
        ('H', h_writes, 'H(UID)'),
        ('D', d_writes, 'D(Trial)'),
        ('I', i_writes, 'I(RegDate)'),
        ('J', j_writes, 'J(Channel)'),
    )
]

# --- Summary ---
banner = '=' * 60
print(f"\n{banner}")
print("SUMMARY")
print(banner)
for summary_line in (
    f"  Phones matched to UID: {len(phone_matches)}",
    f"  H (UID) written: {h_count}",
    f"  D (Trial count) written: {d_count}",
    f"  I (Reg date) written: {i_count}",
    f"  J (Channel) written: {j_count}",
    f"  Total rows updated: {len(writes)}",
):
    print(summary_line)

# List the phones that found no account; only the first 20 are printed.
unmatched = [pair for pair in phones_to_match if pair[1] not in phone_matches]
if unmatched:
    print(f"\n  Unmatched phones ({len(unmatched)}):")
    for row_no, phone_no in unmatched[:20]:
        print(f"    Row {row_no}: {phone_no}")
    hidden = len(unmatched) - 20
    if hidden > 0:
        print(f"    ... and {hidden} more")