# ai_member_xiaoyan/scripts/fix_json_and_p5p6.py
#
# 333 lines
# 12 KiB
# Python

"""
Repair 5 records whose JSON fails to parse, correct the P5/P6 ability tags, and write the results back.
"""
import json, urllib.request, re, sys
# Bitable app token; interpolated into the `/apps/{APP_TOKEN}/` segment of
# every record URL built in this file.
# NOTE(review): identifier is hard-coded in source — confirm that is acceptable
# for this repository.
APP_TOKEN = "CMHSbUUjka3TrUsaxxEc297ongf"
# JSON credentials file read by get_token(); it must provide
# apps[0].appId and apps[0].appSecret.
CRED_FILE = "/root/.openclaw/credentials/xiaoyan/config.json"
def get_token():
    """Fetch a tenant access token for the first app listed in CRED_FILE.

    Reads ``apps[0].appId`` and ``apps[0].appSecret`` from the JSON
    credentials file and POSTs them to the
    ``tenant_access_token/internal`` endpoint.

    Returns:
        The ``tenant_access_token`` value of the JSON response.

    Raises:
        OSError: the credentials file cannot be opened.
        KeyError / IndexError: the credentials file or the response lacks
            an expected key.
        urllib.error.URLError: the HTTP request fails.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(CRED_FILE, encoding="utf-8") as f:
        cfg = json.load(f)
    app = cfg['apps'][0]
    payload = json.dumps(
        {"app_id": app['appId'], "app_secret": app['appSecret']}
    ).encode()
    req = urllib.request.Request(
        "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())['tenant_access_token']
def api_call(url, method='GET', body=None):
    """Send an authenticated JSON request and return the decoded response.

    A fresh token is obtained from get_token() for every call.

    Args:
        url: Full request URL.
        method: HTTP method name.
        body: JSON-serialisable payload, or None to send no request body.

    Returns:
        The response body parsed as JSON.

    Raises:
        urllib.error.URLError: the HTTP request fails (urlopen raises
            HTTPError for error status codes).
    """
    headers = {"Authorization": f"Bearer {get_token()}"}
    data = None
    # Compare against None so that an intentionally empty payload such as {}
    # is still transmitted instead of being silently dropped.
    if body is not None:
        data = json.dumps(body).encode()
        headers["Content-Type"] = "application/json"
    req = urllib.request.Request(url, data=data, method=method, headers=headers)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())
def update_record(table_id, record_id, fields):
    """PUT new field values onto one Bitable record.

    Args:
        table_id: Table that owns the record.
        record_id: Record to overwrite.
        fields: Mapping sent as the ``fields`` member of the request body.

    Returns:
        Whatever api_call() returns for the request.
    """
    endpoint = (
        "https://open.feishu.cn/open-apis/bitable/v1"
        f"/apps/{APP_TOKEN}/tables/{table_id}/records/{record_id}"
    )
    payload = {"fields": fields}
    return api_call(endpoint, method='PUT', body=payload)
# Opening of an "explanation" string value, tolerating whitespace around ':'.
_EXPLANATION_KEY_RE = re.compile(r'"explanation"\s*:\s*"')
# Whitespace that may sit between a closing quote and the next JSON token.
_JSON_WS = ' \t\r\n'
# Typographic / full-width characters mapped to their ASCII counterparts.
_CHAR_REPLACEMENTS = str.maketrans({
    '\u201c': "'", '\u201d': "'",  # smart double quotes
    '\uff0c': ',', '\uff1a': ':',  # full-width comma / colon
    '\u2019': "'",                 # right single quote
})
# Sentinel: json "null" parses to None, so None cannot signal failure.
_UNPARSED = object()


def _try_loads(text):
    """Return the parsed JSON value of *text*, or _UNPARSED when invalid."""
    try:
        return json.loads(text)
    except ValueError:
        return _UNPARSED


def _is_escaped(s, pos):
    """Return True when s[pos] is preceded by an odd run of backslashes."""
    backslashes = 0
    k = pos - 1
    while k >= 0 and s[k] == '\\':
        backslashes += 1
        k -= 1
    return backslashes % 2 == 1


def _repair_string_value(s, start):
    """Repair one string value whose body starts at index *start*.

    A bare double quote counts as the closing quote only when the next
    non-whitespace character is ',', '}' or ']'; any other bare double quote
    is replaced with a single quote.

    Returns (repaired_text, resume_index); repaired_text includes the closing
    quote when one was found.
    """
    pieces = []
    seg_start = start
    i = start
    n = len(s)
    while i < n:
        if s[i] == '"' and not _is_escaped(s, i):
            j = i + 1
            while j < n and s[j] in _JSON_WS:
                j += 1
            pieces.append(s[seg_start:i])
            if j < n and s[j] in ',}]':
                pieces.append('"')
                return ''.join(pieces), i + 1
            pieces.append("'")
            seg_start = i + 1
        i += 1
    # Ran off the end without a closing quote: keep the remainder untouched.
    pieces.append(s[seg_start:])
    return ''.join(pieces), n


def _fix_explanation_quotes(s):
    """Replace unescaped inner double quotes of every "explanation" value."""
    out = []
    pos = 0
    while True:
        m = _EXPLANATION_KEY_RE.search(s, pos)
        if m is None:
            break
        out.append(s[pos:m.end()])
        repaired, pos = _repair_string_value(s, m.end())
        out.append(repaired)
    out.append(s[pos:])
    return ''.join(out)


def fix_json_str(jd_str):
    """Attempt to repair a broken JSON string.

    Each strategy is applied to the original text independently, in order:

    1. parse as-is;
    2. close unbalanced '[' / '{' (counts are textual, so brackets inside
       string values are counted too — this is a heuristic);
    3. turn unescaped double quotes inside "explanation" values into single
       quotes;
    4. replace smart quotes and full-width punctuation with ASCII (applies to
       the whole text, string values included).

    Args:
        jd_str: JSON text to parse or repair.

    Returns:
        ``(value, strategy)`` where strategy is one of "already_valid",
        "bracket_balance_fix", "quote_fix", "char_replace"; or
        ``(None, "unfixable: ...")`` when nothing worked.
    """
    try:
        return json.loads(jd_str), "already_valid"
    except TypeError:
        # None, list, ... — nothing textual to repair.
        return None, "unfixable: input is not a JSON string"
    except ValueError:
        pass
    if not isinstance(jd_str, str):
        # bytes that failed to parse: the text heuristics below need str.
        return None, "unfixable: input is not a JSON string"

    # Strategy: append missing closing brackets / braces.
    open_brackets = jd_str.count('[') - jd_str.count(']')
    open_braces = jd_str.count('{') - jd_str.count('}')
    if open_brackets > 0 or open_braces > 0:
        trimmed = jd_str.rstrip()
        core = trimmed.rstrip('}')
        dropped = len(trimmed) - len(core)
        closers = ']' * open_brackets
        candidates = (
            # Historical behaviour: strip trailing braces, add the deficit.
            core + closers + '}' * open_braces,
            # Same, but also restore the braces that were stripped so a
            # bracket can be inserted before them, e.g. '{"a":[1,2}'.
            core + closers + '}' * (open_braces + dropped),
            # Plain truncation: keep the text and only append closers.
            trimmed + closers + '}' * open_braces,
        )
        for candidate in candidates:
            parsed = _try_loads(candidate)
            if parsed is not _UNPARSED:
                return parsed, "bracket_balance_fix"

    # Strategy: unescaped quotes inside explanation values.
    parsed = _try_loads(_fix_explanation_quotes(jd_str))
    if parsed is not _UNPARSED:
        return parsed, "quote_fix"

    # Strategy: replace problematic typographic characters.
    parsed = _try_loads(jd_str.translate(_CHAR_REPLACEMENTS))
    if parsed is not _UNPARSED:
        return parsed, "char_replace"

    return None, "unfixable: all strategies exhausted"
# Legacy ability tag -> replacement tag.
_ABILITY_TAG_RENAMES = {
    '听觉抓取关键信息': '显性事实理解|关键词识别',
    '多特征整合': '多句保持|信息整合',
}


def _remap_ability_list(holder):
    """Rewrite holder['ability'] in place; return True when it changed.

    Anything that is not a dict carrying a list under 'ability' is left
    untouched.
    """
    if not isinstance(holder, dict):
        return False
    tags = holder.get('ability')
    if not isinstance(tags, list):
        return False
    remapped = [
        _ABILITY_TAG_RENAMES.get(tag, tag) if isinstance(tag, str) else tag
        for tag in tags
    ]
    if remapped == tags:
        return False
    holder['ability'] = remapped
    return True


def fix_ability_tags(jd):
    """Replace legacy ability tags at every level of a jsonData object.

    Three places are checked: ``first``/``second`` section-level ``ability``
    lists, the ``ability`` list of each entry in those sections'
    ``questionSet``, and the ``ability`` list of each entry in the root
    ``questionSet``.

    Args:
        jd: Parsed jsonData; modified in place.

    Returns:
        ``(jd, changed)`` where changed is True when any tag was replaced.
    """
    if not isinstance(jd, dict):
        return jd, False

    changed = False
    for section in ('first', 'second'):
        sect = jd.get(section)
        if not sect or not isinstance(sect, dict):
            continue
        # Section-level ability (P5 style: first.ability).
        if _remap_ability_list(sect):
            changed = True
        # Question-level ability (first.questionSet[i].ability).
        for q in sect.get('questionSet') or []:
            if _remap_ability_list(q):
                changed = True

    # Root questionSet ability (P6 style).
    for q in jd.get('questionSet') or []:
        if _remap_ability_list(q):
            changed = True

    return jd, changed
# ===== STEP 1: Fix 5 broken JSON records =====
# For each known-broken record: locate it by its '题目集合 ID' field, repair
# the jsonData text, normalise its ability tags, write it back, then re-read
# the record to confirm that the stored value parses.
print("=" * 60)
print("STEP 1: Fixing 5 broken JSON records")
print("=" * 60)
# 题目集合 ID -> (table id, label used in log lines)
broken = {
    "021901": ("tblzTLNH7f13uWQN", "P2"),
    "022301": ("tblzTLNH7f13uWQN", "P2"),
    "021301": ("tblVmeDtBDKsAEfz", "P4"),
    "021601": ("tblVmeDtBDKsAEfz", "P4"),
    "021801": ("tblVmeDtBDKsAEfz", "P4"),
}
# table_id -> listed records, so every table is listed once rather than once
# per broken ID.
_step1_items = {}
for sid, (table_id, label) in broken.items():
    if table_id not in _step1_items:
        url = f"https://open.feishu.cn/open-apis/bitable/v1/apps/{APP_TOKEN}/tables/{table_id}/records?page_size=50"
        resp = api_call(url)
        listing = resp.get('data') or {}
        # Only the first page is read; make that visible instead of silently
        # missing records that live on later pages.
        if listing.get('has_more'):
            print(f" ⚠️ {label}: table {table_id} has more than 50 records; only the first page is searched")
        _step1_items[table_id] = listing.get('items') or []

    found = False
    for item in _step1_items[table_id]:
        if item['fields'].get('题目集合 ID', '') != sid:
            continue
        found = True
        rid = item['record_id']
        jd_str = item['fields'].get('jsonData')
        if not isinstance(jd_str, str):
            print(f"{label} {sid}: unfixable: jsonData is missing or not text")
            continue
        jd, msg = fix_json_str(jd_str)
        if jd is None:
            print(f"{label} {sid}: {msg}")
            continue
        # Also fix ability tags
        jd, ab_changed = fix_ability_tags(jd)
        new_jd = json.dumps(jd, ensure_ascii=False)
        result = update_record(table_id, rid, {"jsonData": new_jd})
        if result.get('code') != 0:
            # Do not report success for a write the API rejected.
            print(f" ❌ {label} {sid}: update failed - {result.get('msg')}")
            break
        # Verify: the single-record GET returns the record under
        # data.record (not data.items, which is the list endpoint's shape).
        url2 = f"https://open.feishu.cn/open-apis/bitable/v1/apps/{APP_TOKEN}/tables/{table_id}/records/{rid}"
        v_resp = api_call(url2)
        v_record = (v_resp.get('data') or {}).get('record') or {}
        v_jd = (v_record.get('fields') or {}).get('jsonData', '')
        try:
            json.loads(v_jd)
        except (TypeError, ValueError) as e:
            print(f" ⚠️ {label} {sid}: written but re-verify failed: {e}")
        else:
            detail = f"json={msg}, ab={'fixed' if ab_changed else 'ok'}"
            print(f"{label} {sid}: {detail}")
        break
    if not found:
        print(f" ⚠️ {label} {sid}: record not found in table {table_id}")
# ===== STEP 2: Fix P5 032801 =====
# Locate the single P5 record with 题目集合 ID '032801', normalise its ability
# tags and write it back only when something changed.
print(f"\n{'='*60}")
print("STEP 2: Fix P5 032801")
print("=" * 60)
url = f"https://open.feishu.cn/open-apis/bitable/v1/apps/{APP_TOKEN}/tables/tblDssVmhGzc3UKd/records?page_size=50"
resp = api_call(url)
# NOTE: only the first page (50 records) of the table is searched.
for item in (resp.get('data') or {}).get('items') or []:
    sid = item['fields'].get('题目集合 ID', '')
    if sid != '032801':
        continue
    rid = item['record_id']
    jd_str = item['fields'].get('jsonData')
    jd = None
    if isinstance(jd_str, str):
        # fix_json_str() tries a plain json.loads first, so no separate
        # pre-parse is needed here.
        jd, _ = fix_json_str(jd_str)
    if jd is None:
        print(f" ❌ P5 {sid}: JSON parse failed")
        break
    jd, ab_changed = fix_ability_tags(jd)
    if not ab_changed:
        print(f" ⏭️ P5 {sid}: no ability tag changes needed")
    else:
        new_jd = json.dumps(jd, ensure_ascii=False)
        result = update_record("tblDssVmhGzc3UKd", rid, {"jsonData": new_jd})
        # Distinguish success from failure; previously both branches printed
        # the same (empty) marker.
        if result.get('code') == 0:
            print(f" ✅ P5 {sid}: ability tags updated")
        else:
            print(f" ❌ P5 {sid}: update failed - {result.get('msg')}")
    break
# ===== STEP 3: Fix P6 records =====
# Ordered keyword rules used to guess an ability tag for a question that has
# none; the first rule with a keyword contained in the lower-cased question
# wins. Matching is by substring, so e.g. 'like' also matches "look like".
_P6_TAG_RULES = (
    (('where', 'location', 'place'), '显性细节理解|数字/时间/地点'),
    (('how many', 'how much', 'number'), '显性细节理解|数字/时间/地点'),
    (('what color', 'which one'), '显性事实理解|关键词识别'),
    (('why', 'because'), '情绪/态度理解'),
    (('like', 'love', 'want'), '目的/偏好识别|显性 to/for/like'),
)
_P6_DEFAULT_TAG = '显性事实理解|关键词识别'


def _guess_p6_ability_tag(question):
    """Return the ability tag chosen by the keyword rules for *question*."""
    text = str(question or '').lower()
    for keywords, tag in _P6_TAG_RULES:
        if any(word in text for word in keywords):
            return tag
    return _P6_DEFAULT_TAG


print(f"\n{'='*60}")
print("STEP 3: Fix P6 records (add ability tags)")
print("=" * 60)
url = f"https://open.feishu.cn/open-apis/bitable/v1/apps/{APP_TOKEN}/tables/tbloiMcD0sBtGSTq/records?page_size=50"
resp = api_call(url)
# NOTE: only the first page (50 records) of the table is processed.
for item in (resp.get('data') or {}).get('items') or []:
    sid = item['fields'].get('题目集合 ID', '')
    # Records whose ID contains '010199' are deliberately left alone.
    if '010199' in str(sid):
        continue
    jd_str = item['fields'].get('jsonData', '{}')
    if not jd_str or jd_str == 'None':
        continue
    jd = None
    if isinstance(jd_str, str):
        # fix_json_str() tries a plain json.loads before any repair.
        jd, _ = fix_json_str(jd_str)
    if jd is None:
        print(f" ❌ P6 {sid}: JSON parse failed")
        continue
    if not isinstance(jd, dict):
        print(f" ❌ P6 {sid}: jsonData is not a JSON object")
        continue
    # P6 has root-level questionSet
    changed = False
    for q in jd.get('questionSet') or []:
        if not isinstance(q, dict) or q.get('ability'):
            continue
        # Missing or empty ability: assign a default based on the question.
        q['ability'] = [_guess_p6_ability_tag(q.get('question'))]
        changed = True
    if changed:
        new_jd = json.dumps(jd, ensure_ascii=False)
        result = update_record("tbloiMcD0sBtGSTq", item['record_id'], {"jsonData": new_jd})
        if result.get('code') == 0:
            tags_assigned = [
                q.get('ability', [])
                for q in jd.get('questionSet', [])
                if isinstance(q, dict)
            ]
            print(f" ✅ P6 {sid}: ability filled ({[t[0] for t in tags_assigned if t]})")
        else:
            print(f" ❌ P6 {sid}: update failed - {result.get('msg')}")
    else:
        print(f" ⏭️ P6 {sid}: no changes needed")
print(f"\n{'='*60}")
print("All fixes complete!")
print("=" * 60)