# ai_member_xiaoyan/output/S3_U30_L4_知识点统计.py

#!/usr/bin/env python3
"""L2剧本审校 - S3-U30-L4 幽灵宝藏 知识点统计（修正版）"""
import re, json
from collections import Counter

# Markdown export of the reviewed document that this script analyses.
REVIEW_PATH = "/tmp/l2_l4_review.md"

# The script table is the third <lark-table> in the export and has 7 columns:
# type | plot | Chinese dialogue | translation | knowledge points | component | config
SCRIPT_TABLE_INDEX = 2
MIN_COLUMNS = 7
TYPE_COLUMN = 0
KP_COLUMN = 4
COMPONENT_COLUMN = 5
# Columns inspected for "~~" strikethrough markers.
STRIKE_COLUMNS = (0, 1, 2, 3)

_TABLE_RE = re.compile(r'<lark-table[^>]*>(.*?)</lark-table>', re.DOTALL)
_TR_RE = re.compile(r'<lark-tr>(.*?)</lark-tr>', re.DOTALL)
_TD_RE = re.compile(r'<lark-td[^>]*>(.*?)</lark-td>', re.DOTALL)

# Applied one after another, in this order: removing a tag can bring two "*"
# together, so merging these into a single alternation would change results.
_CLEANUP_PATTERNS = (
    re.compile(r'<[^>]+>'),
    re.compile(r'\*\*'),
    re.compile(r'\{align="[^"]*"\}'),
    re.compile(r'\{color="[^"]*"\}'),
)


def clean_cell(raw: str) -> str:
    """Return the cell text with tags, bold markers and align/color attributes removed."""
    text = raw
    for pattern in _CLEANUP_PATTERNS:
        text = pattern.sub('', text)
    return text.strip()


def find_script_table(content: str, index: int = SCRIPT_TABLE_INDEX) -> str:
    """Return the full markup of the ``index``-th (zero-based) <lark-table> in *content*.

    Raises:
        IndexError: if *content* does not contain that many tables.
    """
    tables = [match.group(0) for match in _TABLE_RE.finditer(content)]
    try:
        return tables[index]
    except IndexError:
        raise IndexError(
            f"script table index {index} is out of range: "
            f"found {len(tables)} <lark-table> block(s)"
        ) from None


def split_table_rows(table_html: str) -> list:
    """Return the inner markup of every <lark-tr> in *table_html*, in document order."""
    return _TR_RE.findall(table_html)


def parse_rows(tr_blocks, min_columns: int = MIN_COLUMNS) -> list:
    """Turn raw <lark-tr> bodies into ``(row_number, cells)`` pairs.

    ``row_number`` is the 1-based position of the row inside the table, so it
    stays correct even when rows with fewer than *min_columns* cells are
    dropped. ``cells`` is the list of cleaned cell strings.
    """
    rows = []
    for number, tr in enumerate(tr_blocks, start=1):
        cells = _TD_RE.findall(tr)
        if len(cells) >= min_columns:
            rows.append((number, [clean_cell(cell) for cell in cells]))
    return rows


def _cell(cells, index: int) -> str:
    """Return ``cells[index]``, or an empty string when the row is too short."""
    return cells[index] if index < len(cells) else ""


def count_knowledge_points(rows):
    """Count knowledge points listed in the knowledge-point column.

    A cell may hold several knowledge points, one per line.

    Returns:
        ``(counter, occurrences)`` where ``counter`` is a ``Counter`` of
        knowledge point -> number of mentions and ``occurrences`` maps each
        knowledge point to a list of ``(row_number, row_type)`` tuples.
    """
    counter = Counter()
    occurrences = {}
    for number, cells in rows:
        row_type = _cell(cells, TYPE_COLUMN).strip()
        for line in _cell(cells, KP_COLUMN).split('\n'):
            kp = line.strip()
            if not kp:
                continue
            counter[kp] += 1
            occurrences.setdefault(kp, []).append((number, row_type))
    return counter, occurrences


def frequency_status(count: int, low: int = 2, high: int = 3) -> str:
    """Return the marker for a knowledge point mentioned *count* times.

    Counts within ``low..high`` are fine, above is flagged as over the limit,
    below as insufficient.
    """
    if count > high:
        return "⚠️超标"
    if count >= low:
        return "✅"
    return "⚠️不足"


def find_strikethrough_rows(rows) -> list:
    """Return the row numbers whose first four columns contain a ``~~`` marker."""
    return [
        number
        for number, cells in rows
        if any('~~' in _cell(cells, col) for col in STRIKE_COLUMNS)
    ]


def find_empty_component_rows(rows) -> list:
    """Return ``(row_number, row_type)`` for typed, non-'TL' rows with an empty component cell."""
    missing = []
    for number, cells in rows:
        row_type = _cell(cells, TYPE_COLUMN).strip()
        if not row_type or 'TL' in row_type:
            continue
        if not _cell(cells, COMPONENT_COLUMN).strip():
            missing.append((number, row_type))
    return missing


def main(path: str = REVIEW_PATH) -> None:
    """Read the review export at *path* and print the knowledge-point report."""
    # Explicit encoding: the export contains Chinese text and must not depend
    # on the platform's default locale. (Assumes the export is UTF-8.)
    with open(path, "r", encoding="utf-8") as f:
        content = f.read()

    tr_blocks = split_table_rows(find_script_table(content))
    print(f"剧本表总行数(含表头): {len(tr_blocks)}")

    rows = parse_rows(tr_blocks)
    if not rows:
        raise SystemExit(
            f"no table row with at least {MIN_COLUMNS} cells found; nothing to analyse"
        )

    # The first usable row is the header; everything after it is data.
    header = rows[0][1]
    data = rows[1:]
    print(f"列结构: {header}")
    print(f"数据行: {len(data)}")

    counter, occurrences = count_knowledge_points(data)
    ranked = counter.most_common()

    print("\n=== 知识点出现次数统计（以【知识点】列为准）===\n")
    print(f"{'知识点':<32} {'次数':>4}  {'规范':>10}  {'出现行(类型)'}")
    print("-" * 100)
    for kp, count in ranked:
        status = frequency_status(count)
        row_info = "; ".join(f"行{r}({t})" for r, t in occurrences[kp])
        print(f"{kp:<32} {count:>4}  {status:>10}  {row_info}")

    # Summary
    print("\n=== 汇总 ===")
    print(f"知识点总数: {len(counter)}")
    for kp, count in ranked:
        print(f"  {kp}: {count}次 {frequency_status(count)}")

    # Rows containing strikethrough text
    print("\n=== 删除线行 ===")
    print(f"含删除线: {find_strikethrough_rows(data)}")

    # Interactive rows without a component
    print("\n=== 组件配置为空 ===")
    empty = find_empty_component_rows(data)
    print(f"组件配置为空的互动行: {len(empty)}个")
    for r, t in empty:
        print(f"  行{r}: {t}")


if __name__ == "__main__":
    main()