ai_member_xiaoxi/scripts/update_xiaolong_sheet.py
2026-06-05 08:00:01 +08:00

284 lines
9.9 KiB
Python

#!/usr/bin/env python3
"""Update the 小龙 sheet with UID, trial count, registration date, and download channel."""
import json
import os
import re
import sys
import time

import mysql.connector
import psycopg2
import requests
# SECURITY: the access token and both database passwords used to exist only as
# literals in this file. Each can now be supplied through an environment
# variable; the literals are kept purely as fallbacks so existing runs keep
# working. They should be rotated and removed from source control.
#
# NOTE(review): a literal access token stops working once it expires or is
# revoked; taking it from the environment lets a fresh one be injected without
# editing this file.
FEISHU_TOKEN = os.environ.get(
    "FEISHU_TOKEN", "t-g10464c0UK5L67JVXSDDT3EWM4DPLSDY5C7R7NS6"
)
SPREADSHEET_TOKEN = "DU4dsUOJThfbPStMcgBcsMH5nyb"
SHEET_ID = "dff8c7"

# DB connections
# Keyword arguments passed verbatim to mysql.connector.connect().
MYSQL_CONFIG = {
    "host": "bj-cdb-dh2fkqa0.sql.tencentcdb.com",
    "port": 27751,
    "user": "read_only",
    "password": os.environ.get("VALA_MYSQL_PASSWORD", "fsdo45ijfmfmuu77$%^&"),
    "database": "vala_user",
}
# Keyword arguments passed verbatim to psycopg2.connect().
PG_CONFIG = {
    "host": "bj-postgres-16pob4sg.sql.tencentcdb.com",
    "port": 28591,
    "user": "ai_member",
    "password": os.environ.get("VALA_PG_PASSWORD", "LdfjdjL83h3h3^$&**YGG*"),
    "database": "vala_bi",
}
def read_sheet(cell_range="A3:J1142"):
    """Read a rectangular range of cell values from the sheet.

    Args:
        cell_range: A1-style range inside ``SHEET_ID`` to fetch. The default,
            ``"A3:J1142"``, is the range this script has always read; callers
            that turn list indexes into sheet row numbers rely on the range
            starting at row 3.

    Returns:
        The ``values`` list found under ``data.valueRange`` in the response,
        i.e. one list per sheet row.

    The process exits with status 1 (after printing the error) when the
    request fails, the reply is not JSON, or the API reports a non-zero
    ``code``.
    """
    url = (
        f"https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/"
        f"{SPREADSHEET_TOKEN}/values/{SHEET_ID}!{cell_range}"
        "?valueRenderOption=ToString"
    )
    headers = {"Authorization": f"Bearer {FEISHU_TOKEN}"}
    try:
        # Without a timeout a stalled connection would block the script forever.
        r = requests.get(url, headers=headers, timeout=30)
        data = r.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a non-JSON body (e.g. a gateway error page).
        print(f"Error reading sheet: {exc}")
        sys.exit(1)
    if data.get("code") != 0:
        print(f"Error reading sheet: {data}")
        sys.exit(1)
    return data["data"]["valueRange"]["values"]
def write_batch(rows_data):
    """Placeholder for batched cell writes; currently does nothing.

    ``rows_data`` is documented as a list of ``(row_num, col, value)`` tuples.
    The sketched plan was to group the tuples by row and write contiguous
    ranges column by column, but none of that is implemented here: the
    argument is ignored and ``None`` is returned.
    """
    return None
def write_range(start_row, end_row, col_letter, values):
    """Write values into a single-column range of the sheet.

    Args:
        start_row: First sheet row (1-based) of the range.
        end_row: Last sheet row (1-based) of the range, inclusive.
        col_letter: Column letter of the range, e.g. ``"H"``.
        values: The cell values for the range, sent as the ``values`` field
            of the request; the caller in this file passes one
            single-element list per row, top to bottom.

    Returns:
        True when the API answers with ``code == 0``. False when the request
        fails, the reply is not JSON, or the API reports a non-zero code; the
        error is printed in each case.
    """
    range_str = f"{SHEET_ID}!{col_letter}{start_row}:{col_letter}{end_row}"
    url = f"https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{SPREADSHEET_TOKEN}/values"
    headers = {
        "Authorization": f"Bearer {FEISHU_TOKEN}",
        "Content-Type": "application/json",
    }
    body = {
        "valueRange": {
            "range": range_str,
            "values": values,
        }
    }
    try:
        # Without a timeout a stalled connection would block the script forever.
        r = requests.put(url, headers=headers, json=body, timeout=30)
        data = r.json()
    except (requests.RequestException, ValueError) as exc:
        # Report transport / decoding failures through the same False return
        # the function already uses for API-level errors.
        print(f"Error writing {range_str}: {exc}")
        return False
    if data.get("code") != 0:
        print(f"Error writing {range_str}: {data}")
        return False
    print(f" Wrote {range_str}: {len(values)} cells")
    return True
# Sheet row number of the first row returned by read_sheet(); its default
# range starts at A3.
_FIRST_DATA_ROW = 3

# 0-based column indexes used by main():
# D = trial count, E = phone, H = UID, I = registration date, J = channel.
_COL_TRIAL = 3
_COL_PHONE = 4
_COL_UID = 7
_COL_REG_DATE = 8
_COL_CHANNEL = 9

# Columns are written in this order (H first, then D, I, J).
_WRITE_ORDER = (
    (_COL_UID, "H"),
    (_COL_TRIAL, "D"),
    (_COL_REG_DATE, "I"),
    (_COL_CHANNEL, "J"),
)

# A phone cell is only looked up when it is exactly 11 digits.
_PHONE_RE = re.compile(r'^\d{11}$')

# Marker found in column H for rows previously flagged "not registered".
_UNREGISTERED = "未注册"


def _cell_text(row, idx):
    """Return cell ``idx`` of ``row`` as stripped text ("" if absent or None)."""
    if idx >= len(row) or row[idx] is None:
        return ""
    return str(row[idx]).strip()


def _needs_uid(h_val):
    """Return True when column H is blank or holds the "not registered" marker."""
    return not h_val or h_val == _UNREGISTERED


def _scan_rows(rows):
    """Collect phones that still need a UID and the UIDs already in column H.

    Returns:
        ``(pending_phones, existing_uids)``: a list with one phone per
        qualifying row (duplicates kept) and a set of all-digit H values.
    """
    pending_phones = []
    existing_uids = set()
    for row in rows:
        phone = _cell_text(row, _COL_PHONE)
        h_val = _cell_text(row, _COL_UID)
        if h_val.isdigit():
            existing_uids.add(h_val)
        if _PHONE_RE.match(phone) and _needs_uid(h_val):
            pending_phones.append(phone)
    return pending_phones, existing_uids


def _lookup_uids_by_phone(phones, chunk_size=50):
    """Map phones to account ids using the masked ``tel`` column in MySQL.

    ``tel`` is stored masked (e.g. ``138****3774``), so a phone is matched on
    its first three and last four digits only.

    NOTE(review): that key is not unique. Within a chunk only the first phone
    sharing a mask receives the id, and when several accounts share a mask the
    last row returned wins. This mirrors the long-standing behaviour.
    """
    phone_to_uid = {}
    if not phones:
        return phone_to_uid
    conn = mysql.connector.connect(**MYSQL_CONFIG)
    try:
        cur = conn.cursor()
        try:
            for start in range(0, len(phones), chunk_size):
                chunk = phones[start:start + chunk_size]
                # (first3, last4) -> first phone in the chunk with that mask.
                mask_to_phone = {}
                for phone in chunk:
                    mask_to_phone.setdefault((phone[:3], phone[-4:]), phone)
                patterns = [f"{first3}%{last4}" for first3, last4 in mask_to_phone]
                where = " OR ".join(["tel LIKE %s"] * len(patterns))
                query = (
                    f"SELECT id, tel FROM vala_app_account WHERE ({where})"
                    " AND deleted_at IS NULL"
                )
                # Patterns travel as bound parameters, not as SQL text.
                cur.execute(query, patterns)
                for uid, tel in cur.fetchall():
                    phone = mask_to_phone.get((tel[:3], tel[-4:]))
                    if phone is not None:
                        phone_to_uid[phone] = str(uid)
                time.sleep(0.05)
        finally:
            cur.close()
    finally:
        # Close the connection even when a query raises.
        conn.close()
    return phone_to_uid


def _fetch_account_info(uids, chunk_size=100):
    """Fetch registration info and course-row counts for ``uids`` from PostgreSQL.

    Returns:
        ``(uid_reg_info, uid_trial_count)`` where ``uid_reg_info`` maps a UID
        string to ``{"created_at": ..., "download_channel": ...}`` (both
        strings, "" when the column is empty) and ``uid_trial_count`` maps a
        UID string to the ``COUNT(*)`` returned for it.
    """
    uid_reg_info = {}
    uid_trial_count = {}
    if not uids:
        return uid_reg_info, uid_trial_count
    uid_list = list(uids)
    conn = psycopg2.connect(**PG_CONFIG)
    try:
        cur = conn.cursor()
        try:
            # Query bi_vala_app_account for reg info
            for start in range(0, len(uid_list), chunk_size):
                chunk = uid_list[start:start + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                query = f"SELECT id, created_at::date, download_channel FROM bi_vala_app_account WHERE id IN ({placeholders}) AND status=1 AND deleted_at IS NULL"
                cur.execute(query, chunk)
                for uid, created_at, download_channel in cur.fetchall():
                    uid_reg_info[str(uid)] = {
                        "created_at": str(created_at) if created_at else "",
                        "download_channel": download_channel or "",
                    }
                time.sleep(0.05)
            # Query bi_user_course_detail for trial count
            for start in range(0, len(uid_list), chunk_size):
                chunk = uid_list[start:start + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                query = f"SELECT account_id, COUNT(*) FROM bi_user_course_detail WHERE account_id IN ({placeholders}) AND expire_time IS NULL AND deleted_at IS NULL GROUP BY account_id"
                cur.execute(query, chunk)
                for uid, cnt in cur.fetchall():
                    uid_trial_count[str(uid)] = cnt
                time.sleep(0.05)
        finally:
            cur.close()
    finally:
        # Close the connection even when a query raises.
        conn.close()
    return uid_reg_info, uid_trial_count


def _build_updates(rows, phone_to_uid, uid_reg_info, uid_trial_count):
    """Return ``{col_index: [(row_num, value), ...]}`` for columns D, H, I, J.

    Each list is in ascending ``row_num`` order because rows are visited top
    to bottom and contribute at most one entry per column.
    """
    col_updates = {col_idx: [] for col_idx, _ in _WRITE_ORDER}
    for offset, row in enumerate(rows):
        row_num = offset + _FIRST_DATA_ROW
        phone = _cell_text(row, _COL_PHONE)
        h_val = _cell_text(row, _COL_UID)

        # NOTE(review): a UID found via the phone takes precedence over an
        # all-digit value already in H — confirm that is intended.
        matched_uid = phone_to_uid.get(phone)
        if matched_uid is not None:
            uid = matched_uid
            # H column: write UID only when H was empty / "not registered".
            if _needs_uid(h_val):
                col_updates[_COL_UID].append((row_num, uid))
        elif h_val.isdigit():
            uid = h_val
        else:
            continue

        # D, I, J columns: write only the pieces of data that were found.
        if uid in uid_trial_count:
            col_updates[_COL_TRIAL].append((row_num, str(uid_trial_count[uid])))
        info = uid_reg_info.get(uid)
        if info is not None:
            if info["created_at"]:
                col_updates[_COL_REG_DATE].append((row_num, info["created_at"]))
            if info["download_channel"]:
                col_updates[_COL_CHANNEL].append((row_num, info["download_channel"]))
    return col_updates


def _contiguous_runs(items):
    """Yield ``(start_row, end_row, values)`` for each run of consecutive rows.

    ``items`` is a list of ``(row_num, value)`` sorted by ``row_num``;
    ``values`` is the matching list of single-element lists.
    """
    run_start = prev_row = None
    values = []
    for row_num, value in items:
        if values and row_num != prev_row + 1:
            yield run_start, prev_row, values
            values = []
        if not values:
            run_start = row_num
        values.append([value])
        prev_row = row_num
    if values:
        yield run_start, prev_row, values


def main():
    """Fill in UID, trial count, registration date and channel on the sheet.

    Steps: read the sheet; resolve phones without a UID through MySQL; fetch
    registration info and counts for every known UID from PostgreSQL; write
    the results back column by column in contiguous ranges; print a summary.
    The summary counts only cells whose write request succeeded.
    """
    print("Reading sheet data...")
    rows = read_sheet()
    print(f"Got {len(rows)} rows")

    # Step 1: Identify rows needing phone lookup
    pending_phones, existing_uids = _scan_rows(rows)
    print(f"Rows with 11-digit phone and empty H: {len(pending_phones)}")
    print(f"Existing UIDs in H column: {len(existing_uids)}")

    # Step 2: Query MySQL for phone -> UID mapping
    phone_to_uid = _lookup_uids_by_phone(pending_phones)
    print(f"Phone -> UID matches found: {len(phone_to_uid)}")

    # Step 3: Collect all UIDs to query PostgreSQL
    all_uids = existing_uids | set(phone_to_uid.values())
    print(f"Total unique UIDs to query: {len(all_uids)}")

    # Step 4: Query PostgreSQL for registration info and trial count
    uid_reg_info, uid_trial_count = _fetch_account_info(all_uids)
    print(f"UIDs with reg info: {len(uid_reg_info)}")
    print(f"UIDs with trial count: {len(uid_trial_count)}")

    # Step 5: Build the update data
    col_updates = _build_updates(rows, phone_to_uid, uid_reg_info, uid_trial_count)
    print(f"Total updates to write: {sum(len(v) for v in col_updates.values())}")

    # Step 6: Write updates in contiguous batches, one column at a time
    written = {col_idx: 0 for col_idx, _ in _WRITE_ORDER}
    for col_idx, col_letter in _WRITE_ORDER:
        for start_row, end_row, values in _contiguous_runs(col_updates[col_idx]):
            if write_range(start_row, end_row, col_letter, values):
                written[col_idx] += len(values)
            time.sleep(0.05)

    # Summary
    print("\n=== SUMMARY ===")
    print(f"Phones matched in MySQL: {len(phone_to_uid)}")
    print(f"H column (UID) written: {written[_COL_UID]}")
    print(f"D column (trial count) written: {written[_COL_TRIAL]}")
    print(f"I column (reg date) written: {written[_COL_REG_DATE]}")
    print(f"J column (channel) written: {written[_COL_CHANNEL]}")
    print(f"Total cells written: {sum(written.values())}")
# Run the update only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()