# ai_member_xiaoyan/scripts/fix_json_and_p5p6.py

"""
修复 5 条 JSON 解析失败记录 + P5/P6 能力标签修正 + 回填
"""
import json, urllib.request, re, sys

APP_TOKEN = "CMHSbUUjka3TrUsaxxEc297ongf"
CRED_FILE = "/root/.openclaw/credentials/xiaoyan/config.json"

def get_token():
    """Fetch a Feishu tenant access token for the first configured app.

    Reads the JSON credentials file at ``CRED_FILE``, takes ``appId`` and
    ``appSecret`` from ``apps[0]`` and POSTs them to the
    ``tenant_access_token/internal`` endpoint.

    Returns:
        The ``tenant_access_token`` value of the decoded JSON response.

    Raises:
        OSError: if the credentials file cannot be opened.
        KeyError, IndexError: if the credentials file or the response body
            lacks the expected keys.
        urllib.error.URLError: if the HTTP request fails.
    """
    # JSON is UTF-8 by specification; do not depend on the process locale.
    with open(CRED_FILE, encoding="utf-8") as f:
        cfg = json.load(f)
    app = cfg['apps'][0]
    payload = {"app_id": app['appId'], "app_secret": app['appSecret']}
    req = urllib.request.Request(
        "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"})
    # Use the response as a context manager so the connection is closed
    # deterministically instead of being left to the garbage collector.
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())['tenant_access_token']

def api_call(url, method='GET', body=None):
    """Send an authenticated JSON request and return the decoded JSON response.

    A fresh token is obtained through ``get_token()`` on every call.

    Args:
        url: Full request URL.
        method: HTTP method name (default ``'GET'``).
        body: JSON-serialisable payload, or ``None`` for a request without a
            body. Any value other than ``None`` is sent, including empty
            containers such as ``{}``.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        urllib.error.URLError: if the HTTP request fails.
        ValueError: if the response body is not valid JSON.
    """
    headers = {"Authorization": f"Bearer {get_token()}"}
    data = None
    # Test against None rather than truthiness so that an explicitly supplied
    # empty payload is still transmitted as a JSON body.
    if body is not None:
        data = json.dumps(body).encode()
        headers["Content-Type"] = "application/json"
    req = urllib.request.Request(url, data=data, method=method, headers=headers)
    # Close the HTTP response deterministically.
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())

def update_record(table_id, record_id, fields):
    """PUT new field values onto one Bitable record.

    Args:
        table_id: Id of the table inside the ``APP_TOKEN`` app.
        record_id: Id of the record to update.
        fields: Mapping sent under the ``"fields"`` key of the request body.

    Returns:
        Whatever ``api_call`` returns for the request (the decoded response).
    """
    endpoint = f"https://open.feishu.cn/open-apis/bitable/v1/apps/{APP_TOKEN}/tables/{table_id}/records/{record_id}"
    payload = {"fields": fields}
    return api_call(endpoint, 'PUT', payload)

_EXPL_KEY = '"explanation":"'


def _try_parse(text):
    """Return ``(True, value)`` if *text* parses as JSON, else ``(False, None)``."""
    try:
        return True, json.loads(text)
    except (ValueError, TypeError, RecursionError):
        # ValueError covers json.JSONDecodeError; TypeError covers non-text input.
        return False, None


def _is_escaped(s, pos):
    """Return True if ``s[pos]`` is preceded by an odd number of backslashes."""
    backslashes = 0
    k = pos - 1
    while k >= 0 and s[k] == '\\':
        backslashes += 1
        k -= 1
    return backslashes % 2 == 1


def _fix_expl_quotes(s):
    """Replace stray double quotes inside ``"explanation":"..."`` values with ``'``.

    A double quote inside an explanation is considered the closing quote only
    when the next non-whitespace character is ``,``, ``}`` or ``]``; every
    other unescaped double quote is rewritten to a single quote.
    """
    out = []
    i = 0
    n = len(s)
    in_expl = False
    seg_start = -1  # start of the not-yet-copied part of the current explanation
    while i < n:
        if not in_expl:
            if s.startswith(_EXPL_KEY, i):
                out.append(_EXPL_KEY)
                i += len(_EXPL_KEY)
                seg_start = i
                in_expl = True
            else:
                out.append(s[i])
                i += 1
            continue

        if s[i] == '"' and not _is_escaped(s, i):
            # Look past whitespace (including line breaks) to see what
            # follows this quote.
            j = i + 1
            while j < n and s[j] in ' \t\r\n':
                j += 1
            out.append(s[seg_start:i])
            if j < n and s[j] in ',}]':
                out.append('"')
                in_expl = False
                seg_start = -1
            else:
                out.append("'")
                seg_start = i + 1
        i += 1

    if in_expl:
        # Input ended inside an explanation: keep the remainder unchanged.
        out.append(s[seg_start:])
    return ''.join(out)


def fix_json_str(jd_str):
    """Try to turn a broken JSON string into a parsed value.

    The strategies are tried in order, each one on the original input:

    1. parse as-is;
    2. append missing closing ``]`` / ``}`` characters;
    3. replace unescaped double quotes inside ``"explanation"`` values;
    4. replace smart quotes and full-width comma/colon with ASCII characters.

    Args:
        jd_str: The JSON text to repair.

    Returns:
        ``(value, tag)`` where *tag* is ``"already_valid"``,
        ``"bracket_balance_fix"``, ``"quote_fix"`` or ``"char_replace"``, or
        ``(None, "unfixable: ...")`` when nothing worked. Non-string input
        that does not parse is reported as unfixable rather than raising.
    """
    ok, jd = _try_parse(jd_str)
    if ok:
        return jd, "already_valid"
    if not isinstance(jd_str, str):
        return None, "unfixable: input is not a string"

    # Strategy: close unbalanced brackets/braces. The counts ignore string
    # contents, so this is a heuristic.
    open_brackets = jd_str.count('[') - jd_str.count(']')
    open_braces = jd_str.count('{') - jd_str.count('}')
    if open_brackets > 0 or open_braces > 0:
        stripped = jd_str.rstrip()
        body = stripped.rstrip('}')
        removed = len(stripped) - len(body)
        closers = ']' * open_brackets
        candidates = (
            # Historical form: only the braces that were missing up front.
            body + closers + '}' * open_braces,
            # Corrected form: also restore the trailing braces stripped above,
            # e.g. '{"a":[1,2}' -> '{"a":[1,2]}'.
            body + closers + '}' * (open_braces + removed),
        )
        for candidate in dict.fromkeys(candidates):
            ok, jd = _try_parse(candidate)
            if ok:
                return jd, "bracket_balance_fix"

    # Strategy: unescaped quotes inside explanation values.
    ok, jd = _try_parse(_fix_expl_quotes(jd_str))
    if ok:
        return jd, "quote_fix"

    # Strategy: replace problematic characters in a single pass.
    table = str.maketrans({
        '\u201c': "'", '\u201d': "'",  # smart double quotes
        '\uff0c': ',', '\uff1a': ':',  # full-width comma / colon
        '\u2019': "'",                 # right single quote
    })
    ok, jd = _try_parse(jd_str.translate(table))
    if ok:
        return jd, "char_replace"

    return None, "unfixable: all strategies exhausted"

_ABILITY_RENAMES = {
    '听觉抓取关键信息': '显性事实理解｜关键词识别',
    '多特征整合': '多句保持｜信息整合',
}


def _remap_ability(holder):
    """Rename legacy tags in ``holder['ability']`` in place.

    Returns True only when *holder* is a dict whose ``ability`` value is a
    list and at least one entry was renamed.
    """
    if not isinstance(holder, dict):
        return False
    tags = holder.get('ability')
    if not isinstance(tags, list):
        return False
    renamed = [
        _ABILITY_RENAMES.get(tag, tag) if isinstance(tag, str) else tag
        for tag in tags
    ]
    if renamed == tags:
        return False
    holder['ability'] = renamed
    return True


def _question_list(container):
    """Return ``container['questionSet']`` if it is a list, else an empty list."""
    questions = container.get('questionSet')
    return questions if isinstance(questions, list) else []


def fix_ability_tags(jd):
    """Rename legacy ability tags at every level of a jsonData object.

    Three places are checked: the ``ability`` list of the ``first`` and
    ``second`` sections, the ``ability`` list of each entry in those sections'
    ``questionSet``, and the ``ability`` list of each entry in the root-level
    ``questionSet``. *jd* is modified in place.

    Values of an unexpected type (non-dict root or section, null or non-list
    ``questionSet``, non-dict question, non-list ``ability``) are left
    untouched instead of raising.

    Args:
        jd: Parsed jsonData.

    Returns:
        ``(jd, changed)`` where *changed* is True if any tag was renamed.
    """
    changed = False
    if not isinstance(jd, dict):
        return jd, changed

    for section in ('first', 'second'):
        sect = jd.get(section)
        if not isinstance(sect, dict) or not sect:
            continue
        # Section-level ability (P5 style: first.ability)
        if _remap_ability(sect):
            changed = True
        # Question-level ability (first.questionSet[i].ability)
        for question in _question_list(sect):
            if _remap_ability(question):
                changed = True

    # Root questionSet ability (P6 style)
    for question in _question_list(jd):
        if _remap_ability(question):
            changed = True

    return jd, changed


# ===== STEP 1: Fix 5 broken JSON records =====
# For each listed question-set id: find the record in its table, repair the
# jsonData string, rename ability tags, write it back and read it back once
# to confirm that the stored value parses.
print("=" * 60)
print("STEP 1: Fixing 5 broken JSON records")
print("=" * 60)

# question-set id -> (Bitable table id, label used in the log output)
broken = {
    "021901": ("tblzTLNH7f13uWQN", "P2"),
    "022301": ("tblzTLNH7f13uWQN", "P2"),
    "021301": ("tblVmeDtBDKsAEfz", "P4"),
    "021601": ("tblVmeDtBDKsAEfz", "P4"),
    "021801": ("tblVmeDtBDKsAEfz", "P4"),
}

for sid, (table_id, label) in broken.items():
    # Only the first page (up to 50 records) of the table is scanned.
    url = f"https://open.feishu.cn/open-apis/bitable/v1/apps/{APP_TOKEN}/tables/{table_id}/records?page_size=50"
    resp = api_call(url)
    matched = False
    # `items` may be absent or null; treat that as an empty page.
    for item in resp['data'].get('items') or []:
        fields = item.get('fields') or {}
        if fields.get('题目集合 ID', '') != sid:
            continue
        matched = True
        rid = item['record_id']
        jd_str = fields.get('jsonData')
        if not isinstance(jd_str, str):
            print(f"  ❌ {label} {sid}: jsonData is missing or not a string")
            continue

        jd, msg = fix_json_str(jd_str)
        if jd is None:
            print(f"  ❌ {label} {sid}: {msg}")
            continue

        # Also fix ability tags
        jd, ab_changed = fix_ability_tags(jd)

        new_jd = json.dumps(jd, ensure_ascii=False)
        result = update_record(table_id, rid, {"jsonData": new_jd})
        if result.get('code') != 0:
            print(f"  ❌ {label} {sid}: update failed - {result.get('msg')}")
            break

        # Verify by reading the record back.
        url2 = f"https://open.feishu.cn/open-apis/bitable/v1/apps/{APP_TOKEN}/tables/{table_id}/records/{rid}"
        v_resp = api_call(url2)
        v_data = v_resp.get('data') or {}
        # NOTE(review): the single-record endpoint is documented to return the
        # record under data.record; data.items[0] is kept as a fallback for
        # the shape the previous code expected - confirm against the API docs.
        v_record = v_data.get('record') or next(iter(v_data.get('items') or []), {})
        v_jd = (v_record.get('fields') or {}).get('jsonData', '')
        try:
            json.loads(v_jd)
        except (ValueError, TypeError) as e:
            print(f"  ⚠️ {label} {sid}: written but re-verify failed: {e}")
        else:
            detail = f"json={msg}, ab={'fixed' if ab_changed else 'ok'}"
            print(f"  ✅ {label} {sid}: {detail}")
        break

    if not matched:
        print(f"  ❌ {label} {sid}: record not found in the first page of {table_id}")

# ===== STEP 2: Fix P5 032801 =====
# Locate question set 032801 in the P5 table and write back its jsonData only
# when at least one ability tag was renamed.
print(f"\n{'='*60}")
print("STEP 2: Fix P5 032801")
print("=" * 60)

# Only the first page (up to 50 records) of the table is scanned.
url = f"https://open.feishu.cn/open-apis/bitable/v1/apps/{APP_TOKEN}/tables/tblDssVmhGzc3UKd/records?page_size=50"
resp = api_call(url)
# `items` may be absent or null; treat that as an empty page.
for item in resp['data'].get('items') or []:
    sid = item['fields'].get('题目集合 ID', '')
    if sid != '032801':
        continue

    rid = item['record_id']
    jd_str = item['fields'].get('jsonData')

    # fix_json_str attempts a plain json.loads first, so a separate parse
    # attempt is not needed. Non-string values are treated as unparseable.
    jd = None
    if isinstance(jd_str, str):
        jd, _ = fix_json_str(jd_str)

    if jd is None:
        print(f"  ❌ P5 {sid}: JSON parse failed")
        break

    jd, ab_changed = fix_ability_tags(jd)
    if not ab_changed:
        print(f"  ⏭️ P5 {sid}: no ability tag changes needed")
    else:
        new_jd = json.dumps(jd, ensure_ascii=False)
        result = update_record("tblDssVmhGzc3UKd", rid, {"jsonData": new_jd})
        if result.get('code') == 0:
            print(f"  ✅ P5 {sid}: ability tags updated")
        else:
            # Report the failure explicitly instead of claiming an update.
            print(f"  ❌ P5 {sid}: update failed - {result.get('msg')}")
    break
else:
    # The loop ended without hitting a break, i.e. no record matched.
    print("  ❌ P5 032801: record not found in the first page")

# ===== STEP 3: Fix P6 records =====
# For every P6 record (except ids containing 010199), give each question in
# the root-level questionSet that has no ability tag a tag chosen by keyword.
print(f"\n{'='*60}")
print("STEP 3: Fix P6 records (add ability tags)")
print("=" * 60)

# Ordered rules: the first rule with a keyword contained in the lower-cased
# question text wins. Matching is by substring, not by whole word.
_P6_TAG_RULES = (
    (('where', 'location', 'place'), '显性细节理解｜数字/时间/地点'),
    (('how many', 'how much', 'number'), '显性细节理解｜数字/时间/地点'),
    (('what color', 'which one'), '显性事实理解｜关键词识别'),
    (('why', 'because'), '情绪/态度理解'),
    (('like', 'love', 'want'), '目的/偏好识别｜显性 to/for/like'),
)
_P6_DEFAULT_TAG = '显性事实理解｜关键词识别'

# Only the first page (up to 50 records) of the table is scanned.
url = f"https://open.feishu.cn/open-apis/bitable/v1/apps/{APP_TOKEN}/tables/tbloiMcD0sBtGSTq/records?page_size=50"
resp = api_call(url)
# `items` may be absent or null; treat that as an empty page.
for item in resp['data'].get('items') or []:
    sid = item['fields'].get('题目集合 ID', '')
    if '010199' in str(sid):
        continue

    jd_str = item['fields'].get('jsonData', '{}')
    if not jd_str or jd_str == 'None':
        continue

    # fix_json_str attempts a plain json.loads first, so a separate parse
    # attempt is not needed. Non-string values are treated as unparseable.
    jd = None
    if isinstance(jd_str, str):
        jd, _ = fix_json_str(jd_str)

    if jd is None:
        print(f"  ❌ P6 {sid}: JSON parse failed")
        continue
    if not isinstance(jd, dict):
        print(f"  ❌ P6 {sid}: jsonData is not a JSON object")
        continue

    # P6 has root-level questionSet
    qs = jd.get('questionSet') or []
    changed = False
    for q in qs:
        # Skip malformed entries and questions that already carry a tag.
        if not isinstance(q, dict) or q.get('ability'):
            continue
        question = str(q.get('question') or '').lower()
        tag = next(
            (candidate for words, candidate in _P6_TAG_RULES
             if any(w in question for w in words)),
            _P6_DEFAULT_TAG,
        )
        q['ability'] = [tag]
        changed = True

    if changed:
        new_jd = json.dumps(jd, ensure_ascii=False)
        result = update_record("tbloiMcD0sBtGSTq", item['record_id'], {"jsonData": new_jd})
        if result.get('code') == 0:
            tags_assigned = [q.get('ability', []) for q in qs if isinstance(q, dict)]
            print(f"  ✅ P6 {sid}: ability filled ({[t[0] for t in tags_assigned if t]})")
        else:
            print(f"  ❌ P6 {sid}: update failed - {result.get('msg')}")
    else:
        print(f"  ⏭️ P6 {sid}: no changes needed")

print(f"\n{'='*60}")
print("All fixes complete!")
print("=" * 60)