ai_member_xiaoyan/skills/interactive-component-json/scripts/pipeline.py

#!/usr/bin/env python3
"""
端到端流水线
飞书wiki URL → 读取文档 → 解析sheet → 类型匹配 → LLM生成jsonData/kpInfo → 写入SQLite
"""

import os
import sys
import json
import time
import logging
import re
import threading
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed

CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, CURRENT_PATH)
PROJECT_ROOT = os.path.dirname(CURRENT_PATH)
CONFIG_PATH = os.path.join(PROJECT_ROOT, 'config.json')

from feishu_client import read_wiki_doc_with_sheet
from parse_script import parse_script_from_sheet
from match_component import match_component_type
from generate_json import generate_component
from llm_client import get_client
from db_manager import get_connection, init_db, insert_component, update_component_field
from html_report import generate_html_report

logger = logging.getLogger("pipeline")
if not logger.handlers:
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(
        "%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s"
    ))
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)


def setup_file_logging(title=""):
    """
    设置文件日志：将 pipeline 和 llm_client 的日志写入 outputs 目录下的日志文件。
    成功时记录摘要，失败时记录完整 prompt + LLM 返回内容。

    Returns:
        str: 日志文件路径
    """
    from datetime import datetime
    outputs_dir = os.path.join(PROJECT_ROOT, "outputs")
    os.makedirs(outputs_dir, exist_ok=True)

    # 生成日志文件名
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_title = re.sub(r'[^\w\u4e00-\u9fff-]', '_', title)[:40] if title else "pipeline"
    log_filename = f"{safe_title}_{timestamp}.log"
    log_path = os.path.join(outputs_dir, log_filename)

    # 创建文件 handler
    file_handler = logging.FileHandler(log_path, encoding="utf-8")
    file_handler.setFormatter(logging.Formatter(
        "%(asctime)s - %(levelname)s - %(name)s - %(filename)s:%(lineno)d - %(message)s"
    ))
    file_handler.setLevel(logging.DEBUG)

    # 添加到 pipeline 和 llm_client logger
    logger.addHandler(file_handler)
    llm_logger = logging.getLogger("llm_client")
    llm_logger.addHandler(file_handler)

    logger.info(f"日志文件: {log_path}")
    return log_path


def _load_proxy_config():
    """读取 config.json 中的 proxy 配置"""
    if not os.path.exists(CONFIG_PATH):
        return None
    try:
        with open(CONFIG_PATH, 'r') as f:
            return json.load(f).get('proxy')
    except Exception:
        return None


def _ensure_proxy_running():
    """检测代理服务是否运行，未运行则自动启动。返回代理 URL 或 None。"""
    import requests as _req

    proxy_cfg = _load_proxy_config()
    if not proxy_cfg:
        logger.warning("未找到 config.json 或 proxy 配置，跳过代理")
        return None

    port = proxy_cfg['port']
    external_ip = proxy_cfg.get('external_ip', '127.0.0.1')
    health_url = f'http://127.0.0.1:{port}/health'

    # 健康检查
    try:
        resp = _req.get(health_url, timeout=2)
        if resp.status_code == 200:
            logger.info(f"代理服务已运行 (port {port})")
            return f'http://{external_ip}:{port}/api/push'
    except Exception:
        pass

    # 尝试启动代理
    logger.info("代理服务未运行，正在启动...")
    proxy_script = os.path.join(CURRENT_PATH, 'proxy_server.py')
    subprocess.Popen(
        [sys.executable, proxy_script],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True,
    )

    # 等待启动
    for _ in range(5):
        time.sleep(1)
        try:
            resp = _req.get(health_url, timeout=2)
            if resp.status_code == 200:
                logger.info(f"代理服务启动成功 (port {port})")
                return f'http://{external_ip}:{port}/api/push'
        except Exception:
            continue

    logger.warning("代理服务启动失败，HTML报告将使用直连地址")
    return None


def process_script(wiki_url_or_token, db_path=None, dry_run=False):
    """
    端到端处理一个剧本文档

    Args:
        wiki_url_or_token: 飞书 wiki URL 或 wiki_token
        db_path: SQLite 数据库路径（默认使用 db/components.db）
        dry_run: 如果为 True，只生成不写入DB

    Returns:
        dict: 处理报告
    """
    report = {
        "wiki_url": wiki_url_or_token,
        "title": "",
        "total_components": 0,
        "success": 0,
        "failed": 0,
        "skipped": 0,
        "results": [],
        "errors": [],
    }

    # 设置文件日志（早期设置，后续用标题重命名）
    log_path = setup_file_logging()

    # Step 1: 读取飞书文档
    logger.info(f"=== Step 1: 读取飞书文档 ===")
    try:
        doc_data = read_wiki_doc_with_sheet(wiki_url_or_token)
        report["title"] = doc_data["title"]
        logger.info(f"文档: {doc_data['title']}, obj_token={doc_data['obj_token']}")
    except Exception as e:
        report["errors"].append(f"读取文档失败: {e}")
        logger.error(f"读取文档失败: {e}")
        return report

    if not doc_data["sheet_rows"] and not doc_data.get("all_sheets"):
        report["errors"].append("文档中未找到内嵌sheet数据")
        logger.error("文档中未找到内嵌sheet数据")
        return report

    # Step 2: 解析剧本（尝试所有sheet，取组件数最多的结果）
    logger.info(f"=== Step 2: 解析剧本 ===")
    llm_client = get_client()

    # MySQL 连通性检测
    try:
        from kp_matcher import _get_connection
        _get_connection()
        logger.info("MySQL 连接成功（kpId 匹配可用）")
    except Exception as e:
        logger.error(f"MySQL 连接失败: {e} — kpId 匹配将全部为空")

    # 收集所有sheet的数据，逐一尝试解析
    all_sheets = doc_data.get("all_sheets", [])
    if not all_sheets and doc_data["sheet_rows"]:
        all_sheets = [doc_data["sheet_rows"]]

    best_parsed = None
    best_sheet_rows = None
    best_count = 0
    for idx, sheet_rows in enumerate(all_sheets):
        if not sheet_rows or len(sheet_rows) < 2:
            continue
        try:
            parsed = parse_script_from_sheet(sheet_rows, doc_data["markdown"], llm_client=llm_client)
            count = len(parsed.get("components", []))
            logger.info(f"  Sheet[{idx}]: 识别到 {count} 个组件")
            if count > best_count:
                best_count = count
                best_parsed = parsed
                best_sheet_rows = sheet_rows
        except Exception as e:
            logger.warning(f"  Sheet[{idx}] 解析失败: {e}")

    if best_parsed is None or best_count == 0:
        report["errors"].append("未识别到任何组件行（已尝试所有sheet）")
        return report

    parsed = best_parsed
    character_map = parsed["character_map"]
    section_char_map = parsed.get("section_char_map", [])
    components = parsed["components"]
    # 更新 doc_data 中的 sheet_rows 为匹配到的那个
    if best_sheet_rows is not None:
        doc_data["sheet_rows"] = best_sheet_rows
    report["total_components"] = len(components)
    logger.info(f"最终识别到 {len(components)} 个组件, 角色映射: {character_map}, section映射: {len(section_char_map)}条")

    # Step 3: 初始化DB
    if not dry_run:
        init_db()
        logger.info(f"数据库已初始化")

    # Step 4: 并行组件生成
    logger.info(f"=== Step 3: 并行组件生成 jsonData/kpInfo (workers=4) ===")

    # 提取元数据
    metadata = parsed.get("metadata", {})
    script_id = doc_data["obj_token"]
    script_title = doc_data["title"]

    # 从标题提取 level 和 unit
    level = _extract_level(script_title)
    unit_id = _extract_unit(script_title)

    # 预加载 examples cache（线程安全：主线程加载一次后只读）
    try:
        from generate_json import _load_examples
        _load_examples()
    except Exception:
        pass

    # 进度锁
    _progress_lock = threading.Lock()
    _progress = {"success": 0, "failed": 0, "skipped": 0}

    def _process_one(i, comp):
        """处理单个组件（线程 worker）"""
        from llm_client import LLMClient
        from kp_matcher import _close_connection

        # 每个 worker 独立 LLM client
        worker_llm = LLMClient()

        cId = comp["cId"]
        type_name = comp["type_name"]

        try:
            # 类型匹配
            type_info = match_component_type(type_name)
            cType = type_info["cType"]
        except ValueError as e:
            with _progress_lock:
                _progress["skipped"] += 1
            logger.warning(f"跳过: {e}")
            return {"index": i, "cId": cId, "type_name": type_name,
                    "status": "skipped", "reason": str(e),
                    "raw_config": comp.get("teaching_config", "")}

        if cType is None:
            with _progress_lock:
                _progress["skipped"] += 1
            logger.warning(f"跳过: {type_name} 尚未实现")
            return {"index": i, "cId": cId, "type_name": type_name,
                    "status": "skipped", "reason": f"类型 '{type_name}' 尚未实现生成器",
                    "raw_config": comp.get("teaching_config", "")}

        try:
            # LLM 生成（使用 worker 独立的 llm_client）
            result = generate_component(comp, character_map, section_char_map, worker_llm, level=level)
            category = result.get("category", "mid")

            entry = {
                "index": i,
                "cId": cId,
                "cType": result["cType"],
                "type_name": type_name,
                "category": category,
                "result": result,
            }

            if dry_run:
                entry["status"] = "generated"
            else:
                # 写入数据库
                component_id = insert_component(
                    script_id=script_id,
                    cType=result["cType"],
                    type_name=type_name,
                    category=category,
                    has_image=result["has_image"],
                    level=level,
                    unit_id=unit_id,
                    knowledge_points_raw=comp.get("knowledge_text", ""),
                    raw_config=comp["teaching_config"],
                    component_index=i,
                    script_title=script_title,
                    bitable_token=result.get("bitable"),
                    db_table=result.get("db_table"),
                )
                update_component_field(component_id, "final_config_json",
                    json.dumps(result["jsonData"], ensure_ascii=False))
                if result.get("kpInfo"):
                    update_component_field(component_id, "kp_info_json",
                        json.dumps(result["kpInfo"], ensure_ascii=False))

                if category == "core":
                    _core_db_map = [
                        ("task_info_json", "taskInfo"),
                        ("material_info_json", "materialInfo"),
                        ("flow_info_json", "flowInfo"),
                        ("study_info_json", "studyInfo"),
                        ("question_group_json", "questionGroup"),
                        ("dialog_setting_json", "dialogSetting"),
                        ("dialog_config_json", "dialogConfig"),
                        ("image_info_json", "imageInfo"),
                        ("option_list_json", "optionList"),
                        ("question_list_json", "questionList"),
                        ("pre_dialog_json", "preDialog"),
                        ("dialog_list_json", "dialogList"),
                        ("text_info_json", "textInfo"),
                        ("eval_info_json", "evalInfo"),
                    ]
                    for field, key in _core_db_map:
                        data = result.get(key)
                        if data:
                            update_component_field(component_id, field,
                                json.dumps(data, ensure_ascii=False))
                    if result.get("intermediate"):
                        update_component_field(component_id, "intermediate_json",
                            json.dumps(result["intermediate"], ensure_ascii=False))

                update_component_field(component_id, "status", "generated")
                entry["status"] = "saved"
                entry["component_id"] = component_id

            with _progress_lock:
                _progress["success"] += 1
                done = _progress["success"] + _progress["failed"] + _progress["skipped"]
            logger.info(f"成功: {cType} ({cId})")
            print(f"  ✓ [{done}/{len(components)}] {type_name} ({cType})", flush=True)
            return entry

        except Exception as e:
            import traceback as _tb
            with _progress_lock:
                _progress["failed"] += 1
                done = _progress["success"] + _progress["failed"] + _progress["skipped"]
            # 详细日志：包含 teaching_config 和完整堆栈
            logger.error(
                f"组件生成失败!\n"
                f"── 组件信息 ──\n"
                f"  index={i}, cId={cId}, type_name={type_name}\n"
                f"── teaching_config ({len(comp.get('teaching_config', ''))} chars) ──\n"
                f"{comp.get('teaching_config', '')[:2000]}\n"
                f"── 错误堆栈 ──\n{_tb.format_exc()}"
            )
            print(f"  ✗ [{done}/{len(components)}] {type_name}: {str(e)[:60]}", flush=True)
            return {"index": i, "cId": cId, "type_name": type_name,
                    "status": "failed", "error": str(e),
                    "raw_config": comp.get("teaching_config", "")}
        finally:
            _close_connection()

    # 执行并行处理
    results_map = {}
    max_workers = min(4, len(components))
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(_process_one, i, comp): i
            for i, comp in enumerate(components, 1)
        }
        for future in as_completed(futures):
            res = future.result()
            results_map[res["index"]] = res

    # 按原始顺序组装 report
    for i in sorted(results_map.keys()):
        res = results_map[i]
        status = res["status"]

        if status in ("generated", "saved"):
            result_data = res.get("result", {})
            category = res.get("category", "mid")
            entry = {
                "cId": res["cId"],
                "cType": res.get("cType", ""),
                "type_name": res["type_name"],
                "category": category,
                "status": status,
                "jsonData": result_data.get("jsonData"),
                "kpInfo": result_data.get("kpInfo"),
            }
            if status == "saved":
                entry["component_id"] = res.get("component_id")
            if category == "core":
                for k in ["taskInfo", "materialInfo", "flowInfo", "studyInfo",
                          "questionGroup", "dialogSetting", "dialogConfig",
                          "imageInfo", "optionList", "questionList",
                          "preDialog", "dialogList", "textInfo", "evalInfo",
                          "intermediate"]:
                    if result_data.get(k) is not None:
                        entry[k] = result_data[k]
            report["results"].append(entry)
        elif status == "skipped":
            report["results"].append({
                "cId": res["cId"], "type_name": res["type_name"],
                "status": "skipped", "reason": res.get("reason", ""),
                "raw_config": res.get("raw_config", ""),
            })
        elif status == "failed":
            report["results"].append({
                "cId": res["cId"], "type_name": res["type_name"],
                "status": "failed", "error": res.get("error", ""),
                "raw_config": res.get("raw_config", ""),
            })

    report["success"] = _progress["success"]
    report["failed"] = _progress["failed"]
    report["skipped"] = _progress["skipped"]

    # 记录日志路径到报告
    report["log_path"] = log_path

    # 最终进度
    print(f"\n[完成] 总计:{len(components)} 成功:{report['success']} 失败:{report['failed']} 跳过:{report['skipped']}", flush=True)

    # 打印报告 + 生成交互式HTML
    _print_report(report)

    # 确保代理服务运行
    proxy_url = _ensure_proxy_running()

    # 始终生成交互式 HTML 报告
    if report["success"] > 0:
        try:
            from llm_client import DEFAULT_MODEL
            wiki_token = wiki_url_or_token
            if "/" in wiki_token:
                wiki_token = wiki_token.rstrip("/").split("/")[-1]
            html_path = generate_html_report(report, model_name=DEFAULT_MODEL, wiki_token=wiki_token, api_url=proxy_url)
            report["html_path"] = html_path
            logger.info(f"HTML报告已生成: {html_path}")
        except Exception as e:
            logger.error(f"HTML报告生成失败: {e}")

    return report


def _extract_level(title):
    """从标题提取级别（如 L1, L2）"""
    m = re.search(r'\b(L\d+)\b', title, re.IGNORECASE)
    return m.group(1).upper() if m else "L1"


def _extract_unit(title):
    """从标题提取单元（如 U14）"""
    m = re.search(r'\b(U\d+)\b', title, re.IGNORECASE)
    return m.group(1).upper() if m else None


def _print_report(report):
    """打印处理报告"""
    print("\n" + "=" * 60)
    print(f"处理报告: {report['title']}")
    print("=" * 60)
    print(f"总组件数: {report['total_components']}")
    print(f"成功: {report['success']}")
    print(f"失败: {report['failed']}")
    print(f"跳过: {report['skipped']}")

    if report["errors"]:
        print(f"\n全局错误:")
        for err in report["errors"]:
            print(f"  - {err}")

    print(f"\n详细结果:")
    for r in report["results"]:
        status = r["status"]
        if status in ("generated", "saved"):
            print(f"  OK  [{r['cId']}] {r['type_name']} → {r['cType']}")
        elif status == "skipped":
            print(f"  SKIP [{r['cId']}] {r['type_name']}: {r.get('reason', '')}")
        elif status == "failed":
            print(f"  FAIL [{r['cId']}] {r['type_name']}: {r.get('error', '')[:80]}")
    print("=" * 60)


def send_report_via_bot(html_path, receive_id, receive_id_type="user_id", agent_name="xiaoyan"):
    """
    通过飞书 Bot 身份发送 HTML 报告文件

    Args:
        html_path: HTML 报告文件的本地路径
        receive_id: 接收者 ID (user_id 或 chat_id)
        receive_id_type: ID 类型 ("user_id" 或 "chat_id")
        agent_name: Bot agent 名称

    Returns:
        dict: {"success": bool, "message_id": str or None, "error": str or None}
    """
    import subprocess

    cred_path = f"/root/.openclaw/credentials/{agent_name}/config.json"
    if not os.path.exists(cred_path):
        return {"success": False, "message_id": None, "error": f"凭证文件不存在: {cred_path}"}

    if not os.path.exists(html_path):
        return {"success": False, "message_id": None, "error": f"HTML文件不存在: {html_path}"}

    # 读取凭证
    with open(cred_path, 'r') as f:
        cred = json.load(f)
    app_id = cred["apps"][0]["appId"]
    app_secret = cred["apps"][0]["appSecret"]

    # 获取 tenant_access_token
    import requests
    token_resp = requests.post(
        "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal",
        json={"app_id": app_id, "app_secret": app_secret},
        timeout=10,
    )
    token_data = token_resp.json()
    if token_data.get("code") != 0:
        return {"success": False, "message_id": None, "error": f"获取token失败: {token_data}"}
    token = token_data["tenant_access_token"]

    # 上传文件
    file_name = os.path.basename(html_path)
    with open(html_path, 'rb') as f:
        upload_resp = requests.post(
            "https://open.feishu.cn/open-apis/im/v1/files",
            headers={"Authorization": f"Bearer {token}"},
            data={"file_type": "stream", "file_name": file_name},
            files={"file": (file_name, f, "text/html")},
            timeout=30,
        )
    upload_data = upload_resp.json()
    if upload_data.get("code") != 0:
        return {"success": False, "message_id": None, "error": f"文件上传失败: {upload_data}"}
    file_key = upload_data["data"]["file_key"]
    logger.info(f"文件上传成功: file_key={file_key}")

    # 发送文件消息
    send_resp = requests.post(
        f"https://open.feishu.cn/open-apis/im/v1/messages?receive_id_type={receive_id_type}",
        headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"},
        json={
            "receive_id": receive_id,
            "msg_type": "file",
            "content": json.dumps({"file_key": file_key}),
        },
        timeout=10,
    )
    send_data = send_resp.json()
    if send_data.get("code") != 0:
        return {"success": False, "message_id": None, "error": f"消息发送失败: {send_data}"}

    message_id = send_data.get("data", {}).get("message_id")
    logger.info(f"HTML报告已发送: message_id={message_id}, receiver={receive_id}")
    return {"success": True, "message_id": message_id, "error": None}


# ============ CLI ============
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("用法:")
        print("  python3 pipeline.py <wiki_url_or_token>           # 正常运行，写入DB + 生成HTML")
        print("  python3 pipeline.py <wiki_url_or_token> --dry-run  # 仅生成，不写入DB")
        print("  python3 pipeline.py <wiki_url_or_token> --dry-run --limit 3  # 只处理前3个")
        print("  python3 pipeline.py <wiki_url_or_token> --dry-run --send-to <user_id>  # 生成后发送")
        print("  python3 pipeline.py <wiki_url_or_token> --dry-run --send-to-chat <chat_id>  # 发到群")
        sys.exit(1)

    wiki_input = sys.argv[1]
    dry_run = "--dry-run" in sys.argv

    # 支持 --limit N 参数
    limit = None
    if "--limit" in sys.argv:
        idx = sys.argv.index("--limit")
        if idx + 1 < len(sys.argv):
            limit = int(sys.argv[idx + 1])

    # 支持 --send-to <user_id> 参数
    send_to_user = None
    if "--send-to" in sys.argv:
        idx = sys.argv.index("--send-to")
        if idx + 1 < len(sys.argv):
            send_to_user = sys.argv[idx + 1]

    # 支持 --send-to-chat <chat_id> 参数
    send_to_chat = None
    if "--send-to-chat" in sys.argv:
        idx = sys.argv.index("--send-to-chat")
        if idx + 1 < len(sys.argv):
            send_to_chat = sys.argv[idx + 1]

    if limit:
        original_parse = parse_script_from_sheet
        def limited_parse(sheet_rows, markdown="", llm_client=None):
            result = original_parse(sheet_rows, markdown, llm_client=llm_client)
            result["components"] = result["components"][:limit]
            return result
        import parse_script
        parse_script.parse_script_from_sheet = limited_parse
        globals()["parse_script_from_sheet"] = limited_parse

    report = process_script(wiki_input, dry_run=dry_run)

    # 输出日志和HTML路径
    if report.get("log_path"):
        print(f"\n详细日志: {report['log_path']}")
    if report.get("html_path"):
        print(f"HTML报告: {report['html_path']}")

    # 发送 HTML 报告文件
    if report.get("html_path") and (send_to_user or send_to_chat):
        if send_to_user:
            result = send_report_via_bot(report["html_path"], send_to_user, "user_id")
        else:
            result = send_report_via_bot(report["html_path"], send_to_chat, "chat_id")

        if result["success"]:
            print(f"\n✅ HTML报告已发送: {result['message_id']}")
        else:
            print(f"\n❌ 发送失败: {result['error']}")
            sys.exit(1)

    # 退出码
    if report["failed"] > 0:
        sys.exit(1)