ai_member_xiaoyan/output/S3_U30_L4_知识点统计.py

92 lines
3.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""L2剧本审校 - S3-U30-L4 幽灵宝藏 知识点统计(修正版)"""
import re, json
from collections import Counter
# Read the whole review document into memory.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default; this script's own labels are Chinese, so the
# document is presumably non-ASCII as well (assumes the export is UTF-8 --
# TODO confirm against the tool that writes this file).
with open("/tmp/l2_l4_review.md", "r", encoding="utf-8") as f:
    content = f.read()
# Find the script table (7 columns: type | plot content | Chinese dialogue |
# translation | knowledge point | component | configuration info).
tables = list(re.finditer(r'<lark-table[^>]*>(.*?)</lark-table>', content, re.DOTALL))
if len(tables) < 3:
    # Indexing tables[2] directly would raise a bare "list index out of range".
    # Keep the same exception type but say what was expected and what was found.
    raise IndexError(
        f"expected at least 3 <lark-table> blocks in the document, found {len(tables)}"
    )
script_table = tables[2].group(0)  # the third table is the script table
# NOTE(review): unlike the table/td patterns, this one does not allow
# attributes on the opening <lark-tr> tag -- confirm rows never carry any.
tr_blocks = re.findall(r'<lark-tr>(.*?)</lark-tr>', script_table, re.DOTALL)
print(f"剧本表总行数(含表头): {len(tr_blocks)}")
# Markup removed from every cell, applied in this order: any remaining tag,
# literal "**" markers, then {align="..."} and {color="..."} annotations.
_MARKUP_PATTERNS = (
    r'<[^>]+>',
    r'\*\*',
    r'\{align="[^"]*"\}',
    r'\{color="[^"]*"\}',
)

# Build one list of cleaned cell texts per table row; rows with fewer than
# seven cells are skipped.
data_rows = []
for tr_html in tr_blocks:
    raw_cells = re.findall(r'<lark-td[^>]*>(.*?)</lark-td>', tr_html, re.DOTALL)
    if len(raw_cells) < 7:
        continue
    cleaned_cells = []
    for raw in raw_cells:
        text = raw
        for pattern in _MARKUP_PATTERNS:
            text = re.sub(pattern, '', text)
        cleaned_cells.append(text.strip())
    data_rows.append(cleaned_cells)
# The first collected row is treated as the header; the rest are data rows.
if not data_rows:
    # data_rows[0] would fail with a bare "list index out of range"; raise the
    # same exception type with a message that points at the actual cause.
    raise IndexError(
        "no table row with at least 7 cells was found; cannot read the header row"
    )
header = data_rows[0]
print(f"列结构: {header}")
print(f"数据行: {len(data_rows)-1}")
# Knowledge-point column = index 4.
# kp_counter: knowledge point -> number of occurrences.
# kp_rows_map: knowledge point -> list of (row number, row type) occurrences.
kp_counter = Counter()
kp_rows_map = {}
# Data rows are numbered from 2 (presumably so the header counts as row 1).
for i, row in enumerate(data_rows[1:], start=2):
    row_type = row[0].strip() if row else ""
    kp_text = row[4].strip() if len(row) > 4 else ""
    if not kp_text:
        continue
    # A cell may list several knowledge points, one per non-blank line.
    for line in kp_text.split('\n'):
        kp = line.strip()
        if not kp:
            continue
        kp_counter[kp] += 1
        kp_rows_map.setdefault(kp, []).append((i, row_type))
# Detailed table: one line per knowledge point, most frequent first.
# A count of 2 or 3 gets an empty status; above 3 and below 2 are flagged.
print("\n=== 知识点出现次数统计(以【知识点】列为准)===\n")
print(f"{'知识点':<32} {'次数':>4} {'规范':>10} {'出现行(类型)'}")
print("-" * 100)
for kp, count in kp_counter.most_common():
    if count > 3:
        status = "⚠️超标"
    elif count < 2:
        status = "⚠️不足"
    else:
        status = ""
    occurrences = [f"{r}({t})" for r, t in kp_rows_map[kp]]
    row_info = "; ".join(occurrences)
    print(f"{kp:<32} {count:>4} {status:>10} {row_info}")
# Summary: number of distinct knowledge points, then a compact per-point list
# using the same 2-to-3 occurrence rule for the status suffix.
print("\n=== 汇总 ===")
total_kps = len(kp_counter)
print(f"知识点总数: {total_kps}")
for kp, count in kp_counter.most_common():
    if count > 3:
        status = "⚠️超标"
    elif count < 2:
        status = "⚠️不足"
    else:
        status = ""
    print(f" {kp}: {count}{status}")
# Strikethrough check: collect the row number of every data row that has a
# "~~" marker in any of its first four columns (each row is listed once).
print("\n=== 删除线行 ===")
deleted = [
    i
    for i, row in enumerate(data_rows[1:], start=2)
    # Slicing tolerates short rows, matching a bounds-checked column lookup.
    if any('~~' in cell for cell in row[:4])
]
print(f"含删除线: {deleted}")
# Empty component check: among rows whose type is non-empty and does not
# contain "TL", report those whose column 5 (component) is blank.
print("\n=== 组件配置为空 ===")
empty = []
for i, row in enumerate(data_rows[1:], start=2):
    row_type = row[0].strip() if row else ""
    if not row_type or 'TL' in row_type:
        continue
    comp = row[5].strip() if len(row) > 5 else ""
    if comp:
        continue
    empty.append((i, row_type))
print(f"组件配置为空的互动行: {len(empty)}")
for r, t in empty:
    print(f"{r}: {t}")