#!/usr/bin/env python3 """ 端到端流水线 飞书wiki URL → 读取文档 → 解析sheet → 类型匹配 → LLM生成jsonData/kpInfo → 写入SQLite """ import os import sys import json import time import logging import re import threading import subprocess from concurrent.futures import ThreadPoolExecutor, as_completed CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, CURRENT_PATH) PROJECT_ROOT = os.path.dirname(CURRENT_PATH) CONFIG_PATH = os.path.join(PROJECT_ROOT, 'config.json') from feishu_client import read_wiki_doc_with_sheet from parse_script import parse_script_from_sheet from match_component import match_component_type from generate_json import generate_component from llm_client import get_client from db_manager import get_connection, init_db, insert_component, update_component_field from html_report import generate_html_report logger = logging.getLogger("pipeline") if not logger.handlers: handler = logging.StreamHandler() handler.setFormatter(logging.Formatter( "%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s" )) logger.addHandler(handler) logger.setLevel(logging.INFO) def setup_file_logging(title=""): """ 设置文件日志:将 pipeline 和 llm_client 的日志写入 outputs 目录下的日志文件。 成功时记录摘要,失败时记录完整 prompt + LLM 返回内容。 Returns: str: 日志文件路径 """ from datetime import datetime outputs_dir = os.path.join(PROJECT_ROOT, "outputs") os.makedirs(outputs_dir, exist_ok=True) # 生成日志文件名 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") safe_title = re.sub(r'[^\w\u4e00-\u9fff-]', '_', title)[:40] if title else "pipeline" log_filename = f"{safe_title}_{timestamp}.log" log_path = os.path.join(outputs_dir, log_filename) # 创建文件 handler file_handler = logging.FileHandler(log_path, encoding="utf-8") file_handler.setFormatter(logging.Formatter( "%(asctime)s - %(levelname)s - %(name)s - %(filename)s:%(lineno)d - %(message)s" )) file_handler.setLevel(logging.DEBUG) # 添加到 pipeline 和 llm_client logger logger.addHandler(file_handler) llm_logger = logging.getLogger("llm_client") llm_logger.addHandler(file_handler) logger.info(f"日志文件: {log_path}") return log_path def _load_proxy_config(): """读取 config.json 中的 proxy 配置""" if not os.path.exists(CONFIG_PATH): return None try: with open(CONFIG_PATH, 'r') as f: return json.load(f).get('proxy') except Exception: return None def _find_pid_on_port(port): """查找占用指定端口的进程PID,返回PID或None。""" import signal try: result = subprocess.run( ['ss', '-tlnp', f'sport = :{port}'], capture_output=True, text=True, timeout=5 ) # 解析 ss 输出中的 pid=XXXX import re as _re m = _re.search(r'pid=(\d+)', result.stdout) if m: return int(m.group(1)) except Exception: pass return None def _kill_stale_process(port): """杀掉占用端口的僵死进程,返回是否成功释放。""" import signal pid = _find_pid_on_port(port) if not pid: return False logger.warning(f"⚠️ 检测到端口 {port} 被僵死进程占用 (PID {pid}),正在终止...") try: os.kill(pid, signal.SIGTERM) # 等待进程退出 for _ in range(5): time.sleep(0.5) try: os.kill(pid, 0) # 检查进程是否还在 except OSError: logger.info(f"✓ 僵死进程 (PID {pid}) 已终止") return True # SIGTERM 没杀掉,强制 SIGKILL os.kill(pid, signal.SIGKILL) time.sleep(0.5) logger.info(f"✓ 僵死进程 (PID {pid}) 已强制终止 (SIGKILL)") return True except OSError as e: logger.error(f"终止进程 PID {pid} 失败: {e}") return False def _start_proxy_process(): """启动代理服务子进程。""" proxy_script = os.path.join(CURRENT_PATH, 'proxy_server.py') subprocess.Popen( [sys.executable, proxy_script], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, start_new_session=True, ) def _ensure_proxy_running(): """检测代理服务是否运行,未运行则自动启动。返回代理 URL 或 None。""" import requests as _req proxy_cfg = _load_proxy_config() if not proxy_cfg: logger.warning("⚠️ 未找到 config.json 或 proxy 配置,跳过代理") print("[代理] ⚠️ 未找到代理配置,HTML报告将使用直连地址") return None port = proxy_cfg['port'] external_ip = proxy_cfg.get('external_ip', '127.0.0.1') health_url = f'http://127.0.0.1:{port}/health' proxy_url = f'http://{external_ip}:{port}/api/push' # 健康检查 try: resp = _req.get(health_url, timeout=3) if resp.status_code == 200: logger.info(f"✓ 代理服务正常运行 (port {port})") print(f"[代理] ✓ 中转服务正常 → {proxy_url}") return proxy_url except Exception: pass # 健康检查失败 — 检测端口是否被僵死进程占用 stale_pid = _find_pid_on_port(port) if stale_pid: logger.warning(f"代理服务健康检查失败,但端口 {port} 被进程 PID {stale_pid} 占用(疑似僵死)") print(f"[代理] ⚠️ 端口 {port} 被僵死进程 (PID {stale_pid}) 占用,正在清理...") if not _kill_stale_process(port): logger.error(f"无法终止僵死进程 PID {stale_pid},HTML报告将使用直连地址") print(f"[代理] ❌ 无法清理僵死进程,HTML报告将使用直连地址") return None time.sleep(1) # 等待端口释放 else: logger.info("代理服务未运行,正在启动...") print(f"[代理] 正在启动中转服务 (port {port})...") # 启动代理 _start_proxy_process() # 等待启动(最多8秒) for i in range(8): time.sleep(1) try: resp = _req.get(health_url, timeout=2) if resp.status_code == 200: logger.info(f"✓ 代理服务启动成功 (port {port}, 耗时 {i+1}s)") print(f"[代理] ✓ 中转服务启动成功 → {proxy_url}") return proxy_url except Exception: continue logger.error("❌ 代理服务启动失败,HTML报告将使用直连地址") print("[代理] ❌ 中转服务启动失败!HTML报告将使用直连HTTPS地址(可能受浏览器CORS限制)") return None def process_script(wiki_url_or_token, db_path=None, dry_run=False): """ 端到端处理一个剧本文档 Args: wiki_url_or_token: 飞书 wiki URL 或 wiki_token db_path: SQLite 数据库路径(默认使用 db/components.db) dry_run: 如果为 True,只生成不写入DB Returns: dict: 处理报告 """ report = { "wiki_url": wiki_url_or_token, "title": "", "total_components": 0, "success": 0, "failed": 0, "skipped": 0, "results": [], "errors": [], } # 设置文件日志(早期设置,后续用标题重命名) log_path = setup_file_logging() # Step 1: 读取飞书文档 logger.info(f"=== Step 1: 读取飞书文档 ===") try: doc_data = read_wiki_doc_with_sheet(wiki_url_or_token) report["title"] = doc_data["title"] logger.info(f"文档: {doc_data['title']}, obj_token={doc_data['obj_token']}") except Exception as e: report["errors"].append(f"读取文档失败: {e}") logger.error(f"读取文档失败: {e}") return report if not doc_data["sheet_rows"] and not doc_data.get("all_sheets"): report["errors"].append("文档中未找到内嵌sheet数据") logger.error("文档中未找到内嵌sheet数据") return report # Step 2: 解析剧本(尝试所有sheet,取组件数最多的结果) logger.info(f"=== Step 2: 解析剧本 ===") llm_client = get_client() # MySQL 连通性检测 try: from kp_matcher import _get_connection _get_connection() logger.info("MySQL 连接成功(kpId 匹配可用)") except Exception as e: logger.error(f"MySQL 连接失败: {e} — kpId 匹配将全部为空") # 收集所有sheet的数据,逐一尝试解析 all_sheets = doc_data.get("all_sheets", []) if not all_sheets and doc_data["sheet_rows"]: all_sheets = [doc_data["sheet_rows"]] best_parsed = None best_sheet_rows = None best_count = 0 for idx, sheet_rows in enumerate(all_sheets): if not sheet_rows or len(sheet_rows) < 2: continue try: parsed = parse_script_from_sheet(sheet_rows, doc_data["markdown"], llm_client=llm_client) count = len(parsed.get("components", [])) logger.info(f" Sheet[{idx}]: 识别到 {count} 个组件") if count > best_count: best_count = count best_parsed = parsed best_sheet_rows = sheet_rows except Exception as e: logger.warning(f" Sheet[{idx}] 解析失败: {e}") if best_parsed is None or best_count == 0: report["errors"].append("未识别到任何组件行(已尝试所有sheet)") return report parsed = best_parsed character_map = parsed["character_map"] section_char_map = parsed.get("section_char_map", []) components = parsed["components"] # 更新 doc_data 中的 sheet_rows 为匹配到的那个 if best_sheet_rows is not None: doc_data["sheet_rows"] = best_sheet_rows report["total_components"] = len(components) logger.info(f"最终识别到 {len(components)} 个组件, 角色映射: {character_map}, section映射: {len(section_char_map)}条") # Step 3: 初始化DB if not dry_run: init_db() logger.info(f"数据库已初始化") # Step 4: 并行组件生成 logger.info(f"=== Step 3: 并行组件生成 jsonData/kpInfo (workers=4) ===") # 提取元数据 metadata = parsed.get("metadata", {}) script_id = doc_data["obj_token"] script_title = doc_data["title"] # 从标题提取 level 和 unit level = _extract_level(script_title) unit_id = _extract_unit(script_title) # 预加载 examples cache(线程安全:主线程加载一次后只读) try: from generate_json import _load_examples _load_examples() except Exception: pass # 进度锁 _progress_lock = threading.Lock() _progress = {"success": 0, "failed": 0, "skipped": 0} def _process_one(i, comp): """处理单个组件(线程 worker)""" from llm_client import LLMClient from kp_matcher import _close_connection # 每个 worker 独立 LLM client worker_llm = LLMClient() cId = comp["cId"] type_name = comp["type_name"] try: # 类型匹配 type_info = match_component_type(type_name) cType = type_info["cType"] except ValueError as e: with _progress_lock: _progress["skipped"] += 1 logger.warning(f"跳过: {e}") return {"index": i, "cId": cId, "type_name": type_name, "status": "skipped", "reason": str(e), "raw_config": comp.get("teaching_config", "")} if cType is None: with _progress_lock: _progress["skipped"] += 1 logger.warning(f"跳过: {type_name} 尚未实现") return {"index": i, "cId": cId, "type_name": type_name, "status": "skipped", "reason": f"类型 '{type_name}' 尚未实现生成器", "raw_config": comp.get("teaching_config", "")} try: # LLM 生成(使用 worker 独立的 llm_client) result = generate_component(comp, character_map, section_char_map, worker_llm, level=level) category = result.get("category", "mid") entry = { "index": i, "cId": cId, "cType": result["cType"], "type_name": type_name, "category": category, "result": result, "raw_config": comp.get("teaching_config", ""), } if dry_run: entry["status"] = "generated" else: # 写入数据库 component_id = insert_component( script_id=script_id, cType=result["cType"], type_name=type_name, category=category, has_image=result["has_image"], level=level, unit_id=unit_id, knowledge_points_raw=comp.get("knowledge_text", ""), raw_config=comp["teaching_config"], component_index=i, script_title=script_title, bitable_token=result.get("bitable"), db_table=result.get("db_table"), ) update_component_field(component_id, "final_config_json", json.dumps(result["jsonData"], ensure_ascii=False)) if result.get("kpInfo"): update_component_field(component_id, "kp_info_json", json.dumps(result["kpInfo"], ensure_ascii=False)) if category == "core": _core_db_map = [ ("task_info_json", "taskInfo"), ("material_info_json", "materialInfo"), ("flow_info_json", "flowInfo"), ("study_info_json", "studyInfo"), ("question_group_json", "questionGroup"), ("dialog_setting_json", "dialogSetting"), ("dialog_config_json", "dialogConfig"), ("image_info_json", "imageInfo"), ("option_list_json", "optionList"), ("question_list_json", "questionList"), ("pre_dialog_json", "preDialog"), ("dialog_list_json", "dialogList"), ("text_info_json", "textInfo"), ("eval_info_json", "evalInfo"), ] for field, key in _core_db_map: data = result.get(key) if data: update_component_field(component_id, field, json.dumps(data, ensure_ascii=False)) if result.get("intermediate"): update_component_field(component_id, "intermediate_json", json.dumps(result["intermediate"], ensure_ascii=False)) update_component_field(component_id, "status", "generated") entry["status"] = "saved" entry["component_id"] = component_id with _progress_lock: _progress["success"] += 1 done = _progress["success"] + _progress["failed"] + _progress["skipped"] logger.info(f"成功: {cType} ({cId})") print(f" ✓ [{done}/{len(components)}] {type_name} ({cType})", flush=True) return entry except Exception as e: import traceback as _tb with _progress_lock: _progress["failed"] += 1 done = _progress["success"] + _progress["failed"] + _progress["skipped"] # 详细日志:包含 teaching_config 和完整堆栈 logger.error( f"组件生成失败!\n" f"── 组件信息 ──\n" f" index={i}, cId={cId}, type_name={type_name}\n" f"── teaching_config ({len(comp.get('teaching_config', ''))} chars) ──\n" f"{comp.get('teaching_config', '')[:2000]}\n" f"── 错误堆栈 ──\n{_tb.format_exc()}" ) print(f" ✗ [{done}/{len(components)}] {type_name}: {str(e)[:60]}", flush=True) return {"index": i, "cId": cId, "type_name": type_name, "status": "failed", "error": str(e), "raw_config": comp.get("teaching_config", "")} finally: _close_connection() # 执行并行处理 results_map = {} max_workers = min(4, len(components)) with ThreadPoolExecutor(max_workers=max_workers) as executor: futures = { executor.submit(_process_one, i, comp): i for i, comp in enumerate(components, 1) } for future in as_completed(futures): res = future.result() results_map[res["index"]] = res # 按原始顺序组装 report for i in sorted(results_map.keys()): res = results_map[i] status = res["status"] if status in ("generated", "saved"): result_data = res.get("result", {}) category = res.get("category", "mid") entry = { "cId": res["cId"], "cType": res.get("cType", ""), "type_name": res["type_name"], "category": category, "status": status, "jsonData": result_data.get("jsonData"), "kpInfo": result_data.get("kpInfo"), "raw_config": res.get("raw_config", ""), } if status == "saved": entry["component_id"] = res.get("component_id") if category == "core": for k in ["taskInfo", "materialInfo", "flowInfo", "studyInfo", "questionGroup", "dialogSetting", "dialogConfig", "imageInfo", "optionList", "questionList", "preDialog", "dialogList", "textInfo", "evalInfo", "intermediate"]: if result_data.get(k) is not None: entry[k] = result_data[k] report["results"].append(entry) elif status == "skipped": report["results"].append({ "cId": res["cId"], "type_name": res["type_name"], "status": "skipped", "reason": res.get("reason", ""), "raw_config": res.get("raw_config", ""), }) elif status == "failed": report["results"].append({ "cId": res["cId"], "type_name": res["type_name"], "status": "failed", "error": res.get("error", ""), "raw_config": res.get("raw_config", ""), }) report["success"] = _progress["success"] report["failed"] = _progress["failed"] report["skipped"] = _progress["skipped"] report["character_map"] = character_map report["level"] = level # 记录日志路径到报告 report["log_path"] = log_path # 最终进度 print(f"\n[完成] 总计:{len(components)} 成功:{report['success']} 失败:{report['failed']} 跳过:{report['skipped']}", flush=True) # 打印报告 + 生成交互式HTML _print_report(report) # 确保代理服务运行 proxy_url = _ensure_proxy_running() # 始终生成交互式 HTML 报告 if report["success"] > 0: try: from llm_client import DEFAULT_MODEL wiki_token = wiki_url_or_token if "/" in wiki_token: wiki_token = wiki_token.rstrip("/").split("/")[-1] html_path = generate_html_report(report, model_name=DEFAULT_MODEL, wiki_token=wiki_token, api_url=proxy_url) report["html_path"] = html_path logger.info(f"HTML报告已生成: {html_path}") except Exception as e: logger.error(f"HTML报告生成失败: {e}") return report def _extract_level(title): """从标题提取级别(如 L1, L2)""" m = re.search(r'\b(L\d+)\b', title, re.IGNORECASE) return m.group(1).upper() if m else "L1" def _extract_unit(title): """从标题提取单元(如 U14)""" m = re.search(r'\b(U\d+)\b', title, re.IGNORECASE) return m.group(1).upper() if m else None def _print_report(report): """打印处理报告""" print("\n" + "=" * 60) print(f"处理报告: {report['title']}") print("=" * 60) print(f"总组件数: {report['total_components']}") print(f"成功: {report['success']}") print(f"失败: {report['failed']}") print(f"跳过: {report['skipped']}") if report["errors"]: print(f"\n全局错误:") for err in report["errors"]: print(f" - {err}") print(f"\n详细结果:") for r in report["results"]: status = r["status"] if status in ("generated", "saved"): print(f" OK [{r['cId']}] {r['type_name']} → {r['cType']}") elif status == "skipped": print(f" SKIP [{r['cId']}] {r['type_name']}: {r.get('reason', '')}") elif status == "failed": print(f" FAIL [{r['cId']}] {r['type_name']}: {r.get('error', '')[:80]}") print("=" * 60) def send_report_via_bot(html_path, receive_id, receive_id_type="user_id", agent_name="xiaoyan"): """ 通过飞书 Bot 身份发送 HTML 报告文件 Args: html_path: HTML 报告文件的本地路径 receive_id: 接收者 ID (user_id 或 chat_id) receive_id_type: ID 类型 ("user_id" 或 "chat_id") agent_name: Bot agent 名称 Returns: dict: {"success": bool, "message_id": str or None, "error": str or None} """ import subprocess cred_path = f"/root/.openclaw/credentials/{agent_name}/config.json" if not os.path.exists(cred_path): return {"success": False, "message_id": None, "error": f"凭证文件不存在: {cred_path}"} if not os.path.exists(html_path): return {"success": False, "message_id": None, "error": f"HTML文件不存在: {html_path}"} # 读取凭证 with open(cred_path, 'r') as f: cred = json.load(f) app_id = cred["apps"][0]["appId"] app_secret = cred["apps"][0]["appSecret"] # 获取 tenant_access_token import requests token_resp = requests.post( "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal", json={"app_id": app_id, "app_secret": app_secret}, timeout=10, ) token_data = token_resp.json() if token_data.get("code") != 0: return {"success": False, "message_id": None, "error": f"获取token失败: {token_data}"} token = token_data["tenant_access_token"] # 上传文件 file_name = os.path.basename(html_path) with open(html_path, 'rb') as f: upload_resp = requests.post( "https://open.feishu.cn/open-apis/im/v1/files", headers={"Authorization": f"Bearer {token}"}, data={"file_type": "stream", "file_name": file_name}, files={"file": (file_name, f, "text/html")}, timeout=30, ) upload_data = upload_resp.json() if upload_data.get("code") != 0: return {"success": False, "message_id": None, "error": f"文件上传失败: {upload_data}"} file_key = upload_data["data"]["file_key"] logger.info(f"文件上传成功: file_key={file_key}") # 发送文件消息 send_resp = requests.post( f"https://open.feishu.cn/open-apis/im/v1/messages?receive_id_type={receive_id_type}", headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"}, json={ "receive_id": receive_id, "msg_type": "file", "content": json.dumps({"file_key": file_key}), }, timeout=10, ) send_data = send_resp.json() if send_data.get("code") != 0: return {"success": False, "message_id": None, "error": f"消息发送失败: {send_data}"} message_id = send_data.get("data", {}).get("message_id") logger.info(f"HTML报告已发送: message_id={message_id}, receiver={receive_id}") return {"success": True, "message_id": message_id, "error": None} # ============ CLI ============ if __name__ == "__main__": if len(sys.argv) < 2: print("用法:") print(" python3 pipeline.py # 正常运行,写入DB + 生成HTML") print(" python3 pipeline.py --dry-run # 仅生成,不写入DB") print(" python3 pipeline.py --dry-run --limit 3 # 只处理前3个") print(" python3 pipeline.py --dry-run --send-to # 生成后发送") print(" python3 pipeline.py --dry-run --send-to-chat # 发到群") sys.exit(1) wiki_input = sys.argv[1] dry_run = "--dry-run" in sys.argv # 支持 --limit N 参数 limit = None if "--limit" in sys.argv: idx = sys.argv.index("--limit") if idx + 1 < len(sys.argv): limit = int(sys.argv[idx + 1]) # 支持 --send-to 参数 send_to_user = None if "--send-to" in sys.argv: idx = sys.argv.index("--send-to") if idx + 1 < len(sys.argv): send_to_user = sys.argv[idx + 1] # 支持 --send-to-chat 参数 send_to_chat = None if "--send-to-chat" in sys.argv: idx = sys.argv.index("--send-to-chat") if idx + 1 < len(sys.argv): send_to_chat = sys.argv[idx + 1] if limit: original_parse = parse_script_from_sheet def limited_parse(sheet_rows, markdown="", llm_client=None): result = original_parse(sheet_rows, markdown, llm_client=llm_client) result["components"] = result["components"][:limit] return result import parse_script parse_script.parse_script_from_sheet = limited_parse globals()["parse_script_from_sheet"] = limited_parse report = process_script(wiki_input, dry_run=dry_run) # 输出日志和HTML路径 if report.get("log_path"): print(f"\n详细日志: {report['log_path']}") if report.get("html_path"): print(f"HTML报告: {report['html_path']}") # 发送 HTML 报告文件 if report.get("html_path") and (send_to_user or send_to_chat): if send_to_user: result = send_report_via_bot(report["html_path"], send_to_user, "user_id") else: result = send_report_via_bot(report["html_path"], send_to_chat, "chat_id") if result["success"]: print(f"\n✅ HTML报告已发送: {result['message_id']}") else: print(f"\n❌ 发送失败: {result['error']}") sys.exit(1) # 退出码 if report["failed"] > 0: sys.exit(1)