From b234be7bba4e4e342eb053b42b693e514e19087f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=8F=E6=BA=AA?= Date: Tue, 19 May 2026 08:00:01 +0800 Subject: [PATCH] =?UTF-8?q?=F0=9F=A4=96=20=E6=AF=8F=E6=97=A5=E8=87=AA?= =?UTF-8?q?=E5=8A=A8=E5=A4=87=E4=BB=BD=20-=202026-05-19=2008:00:01?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- memory/2026-05-18-0958.md | 15 + memory/2026-05-18-1804.md | 5 + memory/2026-05-18.md | 28 ++ scripts/build_xhs_learning_report.py | 413 ++++++++++++++++++ .../scripts/phone_chapter_query.py | 49 ++- 5 files changed, 491 insertions(+), 19 deletions(-) create mode 100644 memory/2026-05-18-0958.md create mode 100644 memory/2026-05-18-1804.md create mode 100644 memory/2026-05-18.md create mode 100644 scripts/build_xhs_learning_report.py diff --git a/memory/2026-05-18-0958.md b/memory/2026-05-18-0958.md new file mode 100644 index 0000000..719ca8b --- /dev/null +++ b/memory/2026-05-18-0958.md @@ -0,0 +1,15 @@ +# Session: 2026-05-18 09:58:46 GMT+8 + +- **Session Key**: agent:main:feishu:direct:ou_e63ce6b760ad39382852472f28fbe2a2 +- **Session ID**: e42c0253-2c2c-41da-9802-4c980f0fad8d +- **Source**: feishu + +## Conversation Summary + +assistant: 承龙,`/eset` 不是我能识别的命令。你是想输入以下哪个吗? + +- `/exec` — 执行命令 +- `/set` — 设置(具体是什么设置?) +- 或者其他? 
#!/usr/bin/env python3
"""Build the Xiaohongshu leads -> user learning-data matching report.

Reads the leads CSV, looks up the referenced accounts, their courses,
chapter play records (8 shards) and orders in PostgreSQL, and writes a
two-sheet Excel workbook: one row per lead plus a summary sheet.

The database password is read from the ``PGPASSWORD`` environment
variable. It used to be a literal in this file; that credential is in
version-control history and should be rotated.
"""
import csv
import os
import re
from collections import defaultdict
from contextlib import closing
from datetime import datetime, timezone

import psycopg2
import psycopg2.extras
from openpyxl import Workbook
from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
from openpyxl.utils import get_column_letter

# Never commit the password; export PGPASSWORD before running.
PG_PASS = os.environ.get("PGPASSWORD")
PG_CONFIG = {
    "host": "bj-postgres-16pob4sg.sql.tencentcdb.com",
    "port": 28591,
    "user": "ai_member",
    "password": PG_PASS,
    "dbname": "vala_bi",
}

CSV_PATH = os.path.join(os.path.dirname(__file__), "../tmp/may2026_xhs_leads_user_sales.csv")
OUTPUT = os.path.join(os.path.dirname(__file__), "../output/xhs_leads_learning_report.xlsx")

# Number of bi_user_chapter_play_record_<i> shard tables.
SHARD_COUNT = 8
# Order statuses counted as a purchase: 3 = completed, 4 = refunded.
ORDER_COMPLETED = 3
ORDER_REFUNDED = 4
PAID_STATUSES = (ORDER_COMPLETED, ORDER_REFUNDED)

COL_USER_ID = "用户ID"
COL_SALES = "销售归属"
# CSV columns copied verbatim into the sheet after the user ID.
CSV_PASSTHROUGH_COLUMNS = [
    "销售归属", "匹配方式", "微伴昵称", "销售表昵称", "手机号",
    "进线日期", "微伴添加时间", "企业标签",
]
HEADERS = [
    "用户ID", "销售归属", "匹配方式", "微伴昵称", "销售表昵称", "手机号",
    "进线日期", "微伴添加时间", "企业标签",
    "账号ID(DB)", "注册时间", "下载渠道", "课程类型", "课程等级",
    "完成课时数", "最后学习时间", "距今日数", "是否付费", "GMV", "GSV", "订单摘要",
]
COL_WIDTHS = [10, 10, 22, 22, 18, 14, 12, 20, 18, 12, 18, 14, 10, 8, 10, 18, 8, 8, 10, 10, 30]
# Sales owners listed first (in this order) in the breakdown table.
KNOWN_SALES = ["成都", "小龙", "吴迪"]

HEADER_FONT = Font(name="微软雅黑", bold=True, size=11, color="FFFFFF")
HEADER_FILL = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
HEADER_ALIGN = Alignment(horizontal="center", vertical="center", wrap_text=True)
CELL_BORDER = Border(
    left=Side(style="thin"), right=Side(style="thin"),
    top=Side(style="thin"), bottom=Side(style="thin"),
)

# Lookup tables filled by main(); get_user_summary() reads them.
account_map = {}                # account_id -> {tel, status, created_at, download_channel}
course_map = defaultdict(list)  # account_id -> [course dicts]
chapter_stats = {}              # user_id -> (distinct chapters played, last play time)
order_map = defaultdict(list)   # account_id -> [order dicts]
phone_matches = {}              # 11-digit phone -> account_id


def _field(row, key):
    """Return the stripped CSV field, or "" when the row is short (None)."""
    value = row[key]
    return value.strip() if value else ""


def _is_account_id(uid):
    """True for a numeric ID of at most 10 digits (a database account id)."""
    return uid.isdecimal() and len(uid) <= 10


def _is_phone(uid):
    """True for an 11-digit number, which the CSV uses for phone numbers."""
    return uid.isdecimal() and len(uid) == 11


def _resolve_account_id(uid):
    """Map a CSV user ID to an account id, or None if it cannot be resolved."""
    if _is_account_id(uid):
        return int(uid)
    if _is_phone(uid):
        return phone_matches.get(uid)
    return None


def _load_accounts(cur, account_ids):
    """Fetch account rows for the given ids, keyed by account id."""
    # A list parameter is adapted to a PostgreSQL array, so an empty list is
    # valid SQL here (unlike the string-built "IN ()").
    cur.execute(
        """
        SELECT id, tel, status, created_at, download_channel
        FROM bi_vala_app_account
        WHERE id = ANY(%s)
        """,
        (account_ids,),
    )
    return {
        row[0]: {
            "tel": row[1], "status": row[2],
            "created_at": row[3], "download_channel": row[4],
        }
        for row in cur.fetchall()
    }


def _load_courses(cur, account_ids):
    """Fetch course rows for the given accounts, grouped by account id."""
    cur.execute(
        """
        SELECT cd.account_id, cd.user_id, cd.course_level, cd.expire_time, cd.deleted_at, cd.created_at
        FROM bi_user_course_detail cd
        WHERE cd.account_id = ANY(%s)
        ORDER BY cd.account_id, cd.course_level, cd.created_at
        """,
        (account_ids,),
    )
    grouped = defaultdict(list)
    for account_id, user_id, level, expire_time, deleted_at, created_at in cur.fetchall():
        grouped[account_id].append({
            "user_id": user_id,
            "course_level": level,
            "expire_time": expire_time,
            "is_deleted": deleted_at is not None,
            "created_at": created_at,
        })
    return grouped


def _load_chapter_stats(cur, user_ids):
    """Return {user_id: (distinct chapter count, latest created_at)} across all shards."""
    if not user_ids:
        return {}
    # Only the shard index (a trusted int) is interpolated; ids are bound.
    union_query = " UNION ALL ".join(
        f"""
        SELECT user_id, chapter_id, created_at
        FROM bi_user_chapter_play_record_{i}
        WHERE user_id = ANY(%(uids)s)
        """
        for i in range(SHARD_COUNT)
    )
    cur.execute(
        f"""
        SELECT user_id, COUNT(DISTINCT chapter_id), MAX(created_at)
        FROM ({union_query}) t
        GROUP BY user_id
        """,
        {"uids": user_ids},
    )
    return {row[0]: (row[1], row[2]) for row in cur.fetchall()}


def _load_orders(cur, account_ids):
    """Fetch orders for the given accounts, grouped by account id."""
    cur.execute(
        """
        SELECT o.account_id, o.order_status, o.pay_amount_int, o.goods_name, o.pay_success_date, o.key_from
        FROM bi_vala_order o
        WHERE o.account_id = ANY(%s)
        ORDER BY o.account_id, o.pay_success_date
        """,
        (account_ids,),
    )
    grouped = defaultdict(list)
    for account_id, status, amount_int, goods_name, pay_time, key_from in cur.fetchall():
        grouped[account_id].append({
            "order_status": status,
            # pay_amount_int is divided by 100 for display; NULL counts as 0.
            "pay_amount": (amount_int or 0) / 100,
            "goods_name": goods_name,
            "pay_time": pay_time,
            "key_from": key_from,
        })
    return grouped


def _days_since(moment):
    """Return whole days elapsed since *moment*, as a string."""
    # NOTE(review): aware timestamps are compared in UTC; a naive timestamp is
    # assumed to be in this machine's local time - confirm the column type.
    now = datetime.now(timezone.utc) if moment.tzinfo is not None else datetime.now()
    return str((now - moment).days)


def get_user_summary(account_id):
    """Summarise courses, learning progress and orders for one account.

    Returns a 12-tuple: (match status, registration time, download channel,
    course type, course levels, chapters completed, last study time,
    days since last study, has purchased, GMV, GSV, order summary).
    For an account missing from ``account_map`` the first item is "未匹配"
    and the remaining eleven are empty strings.
    """
    if account_id not in account_map:
        return ("未匹配",) + ("",) * 11

    acc = account_map[account_id]
    valid_courses = [c for c in course_map.get(account_id, []) if not c["is_deleted"]]

    if any(c["expire_time"] is not None for c in valid_courses):
        course_type = "正式课"
    elif valid_courses:
        course_type = "体验课"
    else:
        course_type = "无课程"

    levels_present = {c["course_level"] for c in valid_courses}
    levels = [label for code, label in (("A1", "L1"), ("A2", "L2")) if code in levels_present]
    level_str = "+".join(levels) if levels else "-"

    # Several course rows can share one user_id; count each user once.
    total_chapters = 0
    latest_study = None
    seen_user_ids = set()
    for course in valid_courses:
        user_id = course["user_id"]
        if user_id in seen_user_ids:
            continue
        seen_user_ids.add(user_id)
        count, last = chapter_stats.get(user_id, (0, None))
        total_chapters += count
        if last and (latest_study is None or last > latest_study):
            latest_study = last

    days_since = _days_since(latest_study) if latest_study else ""

    valid_orders = [o for o in order_map.get(account_id, []) if o["order_status"] in PAID_STATUSES]
    completed = [o for o in valid_orders if o["order_status"] == ORDER_COMPLETED]

    has_purchased = "是" if valid_orders else "否"
    order_summary = "; ".join(f"{o['goods_name']}(¥{o['pay_amount']})" for o in valid_orders[:3])
    if len(valid_orders) > 3:
        order_summary += f" ...共{len(valid_orders)}单"

    # GMV includes refunded orders; GSV counts completed orders only.
    gmv = sum(o["pay_amount"] for o in valid_orders)
    gsv = sum(o["pay_amount"] for o in completed)

    return (
        "已匹配",
        acc["created_at"].strftime("%Y-%m-%d %H:%M") if acc["created_at"] else "",
        acc["download_channel"] or "未设置",
        course_type,
        level_str,
        str(total_chapters),
        latest_study.strftime("%Y-%m-%d %H:%M") if latest_study else "未学习",
        days_since,
        has_purchased,
        gmv,
        gsv,
        order_summary,
    )


def _first_account_for_masked_tel(cur, pattern):
    """Return the lowest active account id whose masked tel equals *pattern*."""
    cur.execute(
        "SELECT id FROM bi_vala_app_account WHERE tel = %s AND status = 1 ORDER BY id LIMIT 1",
        (pattern,),
    )
    result = cur.fetchone()
    return result[0] if result else None


def match_by_phone(phone, cur=None):
    """Look up an account id for an 11-digit phone number.

    The ``tel`` column is masked (e.g. ``137****3958``), so the match uses
    only the first 3 and last 4 digits.
    NOTE(review): different phones can share one mask, so a hit is a
    candidate, not a verified identity.

    Args:
        phone: Phone number string; anything that is not 11 characters
            long returns None.
        cur: Optional open cursor to reuse. When omitted, a connection is
            opened and closed for this single lookup.

    Returns:
        The matching account id, or None.
    """
    if not phone or len(phone) != 11:
        return None
    pattern = f"{phone[:3]}****{phone[-4:]}"

    if cur is not None:
        return _first_account_for_masked_tel(cur, pattern)
    # psycopg2's own "with conn" only ends the transaction; closing() closes it.
    with closing(psycopg2.connect(**PG_CONFIG)) as conn, conn.cursor() as own_cur:
        return _first_account_for_masked_tel(own_cur, pattern)


def _format_money(amount):
    """Format a positive amount as ¥x.xx; anything else becomes "-"."""
    if isinstance(amount, (int, float)) and amount > 0:
        return f"¥{amount:.2f}"
    return "-"


def _lead_row(row):
    """Build the 21 cell values for one CSV lead."""
    uid = _field(row, COL_USER_ID)
    vals = [uid] + [_field(row, col) for col in CSV_PASSTHROUGH_COLUMNS]
    account_id = _resolve_account_id(uid)

    if account_id is not None and account_id in account_map:
        summary = get_user_summary(account_id)
        vals.extend([
            str(account_id), *summary[1:9],
            _format_money(summary[9]), _format_money(summary[10]),
            summary[11],
        ])
    elif account_id is not None and _is_phone(uid):
        # Matched via masked phone only; account details were not loaded.
        vals.extend([str(account_id), "手机号匹配(需进一步验证)"] + [""] * 10)
    else:
        # Includes numeric IDs that do not exist in the database.
        vals.extend(["未匹配"] + [""] * 11)
    return vals


def _write_leads_sheet(ws, rows):
    """Write the header and one bordered row per lead, then filter/freeze."""
    for col, title in enumerate(HEADERS, 1):
        cell = ws.cell(row=1, column=col, value=title)
        cell.font = HEADER_FONT
        cell.fill = HEADER_FILL
        cell.alignment = HEADER_ALIGN
        cell.border = CELL_BORDER

    for row_num, row in enumerate(rows, 2):
        for col, value in enumerate(_lead_row(row), 1):
            cell = ws.cell(row=row_num, column=col, value=value)
            cell.border = CELL_BORDER
            cell.alignment = Alignment(vertical="center")

    ws.freeze_panes = "A2"
    ws.auto_filter.ref = f"A1:{get_column_letter(len(HEADERS))}{len(rows) + 1}"
    for i, width in enumerate(COL_WIDTHS, 1):
        ws.column_dimensions[get_column_letter(i)].width = width


def _account_has_learning(account_id):
    """True if any non-deleted course user of the account played a chapter."""
    return any(
        not c["is_deleted"] and chapter_stats.get(c["user_id"], (0, None))[0] > 0
        for c in course_map.get(account_id, [])
    )


def _account_has_paid(account_id):
    """True if the account has a completed or refunded order."""
    return any(o["order_status"] in PAID_STATUSES for o in order_map.get(account_id, []))


def _collect_stats(rows):
    """Compute the summary-sheet rows and the totals printed at the end.

    Returns:
        (stats_data, totals) where stats_data is a list of row lists for the
        summary sheet and totals is a dict with keys ``matched_accounts``,
        ``with_courses``, ``has_learning`` and ``purchased``.
    """
    uids = [_field(r, COL_USER_ID) for r in rows]

    matched_count = sum(1 for uid in uids if _is_account_id(uid) and int(uid) in account_map)
    phone_matched_count = sum(1 for uid in uids if _is_phone(uid) and uid in phone_matches)
    unmatched = len(rows) - matched_count - phone_matched_count

    matched_accounts = set()
    for uid in uids:
        aid = _resolve_account_id(uid)
        if aid is not None and (_is_phone(uid) or aid in account_map):
            matched_accounts.add(aid)

    accounts_with_courses = [aid for aid in matched_accounts if aid in course_map]
    # Counted per account, like the purchase count and the sales table.
    has_learning = sum(1 for aid in matched_accounts if _account_has_learning(aid))
    purchased = sum(1 for aid in matched_accounts if _account_has_paid(aid))

    # Count each user_id once even when it appears on several course rows.
    total_chapters = 0
    counted_users = set()
    for aid in matched_accounts:
        for course in course_map.get(aid, []):
            user_id = course["user_id"]
            if course["is_deleted"] or user_id in counted_users:
                continue
            counted_users.add(user_id)
            total_chapters += chapter_stats.get(user_id, (0, None))[0]

    stats_data = [
        ["指标", "数值"],
        ["CSV总行数", str(len(rows))],
        ["有用户ID行数", str(sum(1 for uid in uids if uid))],
        ["已匹配账号(by ID)", str(matched_count)],
        ["已匹配账号(by手机号)", str(phone_matched_count)],
        ["未匹配", str(unmatched)],
        ["匹配账号有课程", str(len(accounts_with_courses))],
        ["完成过课时学习的用户", str(has_learning)],
        ["有付费订单的用户", str(purchased)],
        ["总完成课时数", str(total_chapters)],
    ]

    # Per-sales breakdown; only leads matched directly by account id count
    # as matched here.
    sales_stats = defaultdict(
        lambda: {"total": 0, "matched": 0, "with_course": 0, "with_learning": 0, "purchased": 0}
    )
    for row, uid in zip(rows, uids):
        entry = sales_stats[_field(row, COL_SALES)]
        entry["total"] += 1
        if not _is_account_id(uid):
            continue
        aid = int(uid)
        if aid not in account_map:
            continue
        entry["matched"] += 1
        if aid in course_map:
            entry["with_course"] += 1
            if _account_has_learning(aid):
                entry["with_learning"] += 1
        if _account_has_paid(aid):
            entry["purchased"] += 1

    # Known owners first, then any other owner found in the CSV so that no
    # lead is silently left out of the table.
    other_sales = sorted(name for name in sales_stats if name not in KNOWN_SALES)
    stats_data.append([])
    stats_data.append(["销售归属", "线索总数", "匹配账号", "有课程", "有学习记录", "有付费"])
    for sales in KNOWN_SALES + other_sales:
        s = sales_stats[sales]
        stats_data.append(
            [sales, s["total"], s["matched"], s["with_course"], s["with_learning"], s["purchased"]]
        )

    totals = {
        "matched_accounts": len(matched_accounts),
        "with_courses": len(accounts_with_courses),
        "has_learning": has_learning,
        "purchased": purchased,
    }
    return stats_data, totals


def _write_stats_sheet(ws, stats_data):
    """Write the summary rows; only the first row gets the header style."""
    for row_idx, row_data in enumerate(stats_data, 1):
        for col_idx, val in enumerate(row_data, 1):
            cell = ws.cell(row=row_idx, column=col_idx, value=val)
            if row_idx == 1:
                cell.font = HEADER_FONT
                cell.fill = HEADER_FILL
                cell.alignment = HEADER_ALIGN
            cell.border = CELL_BORDER

    for letter, width in zip("ABCDEF", (25, 15, 12, 10, 12, 10)):
        ws.column_dimensions[letter].width = width


def main():
    """Read the leads CSV, query the database and write the Excel report."""
    if not PG_PASS:
        raise SystemExit("PGPASSWORD is not set; export the database password before running.")

    with open(CSV_PATH, "r", encoding="utf-8-sig", newline="") as f:
        rows = list(csv.DictReader(f))
    print(f"Total rows: {len(rows)}")

    all_ids = {uid for uid in (_field(r, COL_USER_ID) for r in rows) if uid}
    numeric_ids = sorted({int(uid) for uid in all_ids if _is_account_id(uid)})
    phone_ids = sorted(uid for uid in all_ids if _is_phone(uid))
    print(f"Numeric IDs (<=10 digits): {len(numeric_ids)}")
    print(f"Phone numbers (11 digits): {len(phone_ids)}")
    print(f"Total unique IDs: {len(all_ids)}")

    # One connection for every query, closed even if a query raises.
    with closing(psycopg2.connect(**PG_CONFIG)) as conn, conn.cursor() as cur:
        account_map.update(_load_accounts(cur, numeric_ids))
        print(f"Accounts matched (by id): {len(account_map)}")

        matched_account_ids = sorted(account_map)
        course_map.update(_load_courses(cur, matched_account_ids))

        user_ids = sorted({
            c["user_id"]
            for courses in course_map.values()
            for c in courses
            if c["user_id"] is not None
        })
        chapter_stats.update(_load_chapter_stats(cur, user_ids))
        order_map.update(_load_orders(cur, matched_account_ids))

        for phone in phone_ids:
            aid = match_by_phone(phone, cur)
            if aid is not None:
                phone_matches[phone] = aid
    print(f"Phone matches: {len(phone_matches)}")

    wb = Workbook()
    ws = wb.active
    ws.title = "小红书线索学习数据"
    _write_leads_sheet(ws, rows)

    stats_data, totals = _collect_stats(rows)
    _write_stats_sheet(wb.create_sheet("汇总统计"), stats_data)

    os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
    wb.save(OUTPUT)
    print(f"\nReport saved to: {OUTPUT}")
    print(f"Total matched accounts: {totals['matched_accounts']}")
    print(f"With courses: {totals['with_courses']}")
    print(f"With learning: {totals['has_learning']}")
    print(f"With purchases: {totals['purchased']}")


if __name__ == "__main__":
    main()
chapter_play AS ( {chapter_union} ), +filtered_play AS ( + SELECT cp.user_id, cp.chapter_id, cp.chapter_unique_id, cp.finish_date, + ROW_NUMBER() OVER (PARTITION BY cp.user_id, cp.chapter_id ORDER BY cp.finish_date) AS rn + FROM chapter_play cp + JOIN bi_vala_app_character c ON cp.user_id = c.id + JOIN bi_vala_app_account a ON c.account_id = a.id + WHERE a.id IN ({aid_list}) +), +cp_unique_ids AS ( + SELECT DISTINCT chapter_unique_id FROM filtered_play WHERE rn = 1 +), comp_time AS ( - SELECT chapter_unique_id, SUM(interval_time) AS total_interval - FROM ({comp_union}) t + SELECT chapter_unique_id, SUM(total_interval) AS total_interval + FROM ({comp_filtered_union}) t GROUP BY chapter_unique_id ), course_detail AS ( SELECT - cp.user_id, - cp.chapter_id, + fp.user_id, + fp.chapter_id, FORMAT('%s-%s-%s-%s', l.course_level, l.course_season, l.course_unit, l.course_lesson) AS course_id, - cp.finish_date, + fp.finish_date, FORMAT('%s:%s', FLOOR(ct.total_interval / 1000 / 60), LPAD(CAST(MOD(ct.total_interval / 1000, 60) AS TEXT), 2, '0') ) AS finish_time - FROM ( - SELECT user_id, chapter_id, chapter_unique_id, finish_date, - ROW_NUMBER() OVER (PARTITION BY user_id, chapter_id ORDER BY finish_date) AS rn - FROM chapter_play - ) cp - LEFT JOIN bi_level_unit_lesson l ON cp.chapter_id = l.id - LEFT JOIN comp_time ct ON cp.chapter_unique_id = ct.chapter_unique_id - WHERE cp.rn = 1 + FROM filtered_play fp + LEFT JOIN bi_level_unit_lesson l ON fp.chapter_id = l.id + LEFT JOIN comp_time ct ON fp.chapter_unique_id = ct.chapter_unique_id + WHERE fp.rn = 1 ) SELECT