ai_member_xiaobian/scripts/extract_lessons.py
2026-06-23 08:10:01 +08:00

371 lines
21 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Extract lesson info from lark-cli docs +fetch JSON output."""
import json, sys, re, os, glob
def extract_markdown(filepath):
    """Read a saved ``lark-cli docs +fetch`` dump and return its markdown and title.

    Any text in front of the first ``{`` is skipped, then the rest of the
    file is parsed as a single JSON object.

    Args:
        filepath: Path of the file holding the command output.

    Returns:
        A 2-tuple:

        * ``(markdown, title)`` when the payload's ``ok`` field is truthy
          (each value defaults to ``''`` when its key is absent);
        * ``(None, message)`` when ``ok`` is falsy, where ``message`` is
          ``error.message`` or ``'unknown error'``;
        * ``(None, None)`` when the file contains no ``{`` or the text from
          the first ``{`` onward is not valid JSON.
    """
    # The documents contain Chinese text, so decode explicitly as UTF-8
    # rather than depending on the platform's default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    start = content.find('{')
    if start == -1:
        return None, None

    try:
        data = json.loads(content[start:])
    except json.JSONDecodeError:
        # Only parse failures are treated as "no payload"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return None, None

    if not data.get('ok'):
        error = data.get('error')
        # ``error`` may be missing, null, or a bare string in the payload.
        if isinstance(error, dict):
            return None, error.get('message', 'unknown error')
        return None, 'unknown error'

    payload = data.get('data')
    if not isinstance(payload, dict):
        # A null/non-object ``data`` is treated like an empty one.
        payload = {}
    return payload.get('markdown', ''), payload.get('title', '')
def extract_knowledge_points(md):
    """Pull sheet tokens and plain-text lines out of the ``# 知识点`` section.

    The section runs from the ``# 知识点`` heading to the next line that
    starts with ``# `` (or to the end of the text).

    Args:
        md: Markdown text of one lesson document.

    Returns:
        A 2-tuple ``(sheet_tokens, examples)``:

        * ``sheet_tokens`` - the ``token`` attribute of every ``<sheet ...>``
          tag in the section;
        * ``examples`` - every non-empty, stripped line of the section that
          does not start with ``<`` or ``#``.

        ``([], [])`` is returned when the heading is not present.
    """
    heading_match = re.search(r'# 知识点\s*\n(.*?)(?=\n# |\Z)', md, re.DOTALL)
    if heading_match is None:
        return [], []

    body = heading_match.group(1)
    tokens = re.findall(r'<sheet token="([^"]+)"', body)

    # Anything that is not markup ('<...') or a sub-heading ('#...') is
    # kept as an example sentence.
    stripped_lines = (raw.strip() for raw in body.split('\n'))
    sentences = [
        text for text in stripped_lines
        if text and not text.startswith(('<', '#'))
    ]
    return tokens, sentences
def extract_story_summary(md):
    """Collect the text cells of the table under the ``# 剧情梗概`` heading.

    The section runs from the heading to the next line starting with ``# ``
    (or to the end of the text). Every ``<lark-td>`` cell inside it is
    inspected; label cells, bare numbers, image cells and full-screen-image
    placeholders are dropped, and remaining cells have their tags removed.

    Args:
        md: Markdown text of one lesson document.

    Returns:
        list[str]: Cleaned, non-empty cell texts in document order. An empty
        list is returned when the heading is missing (previously this case
        returned ``""``, which made the return type inconsistent).
    """
    summary_section = re.search(r'# 剧情梗概\s*\n(.*?)(?=\n# |\Z)', md, re.DOTALL)
    if not summary_section:
        return []
    section = summary_section.group(1)

    # Cells whose whole content is one of these labels carry no story text.
    # NOTE(review): the last label is listed twice, exactly as in the
    # original. The hosting page flagged confusable Unicode in this file, so
    # the two spellings may not have been identical upstream - TODO confirm
    # before de-duplicating.
    skip_labels = ('开场', '结尾', 'AI动画', 'AI动画')
    # Cells that begin with an image tag or the full-screen-image marker.
    skip_prefixes = ('<image', '(全屏插入图)')

    content_cells = []
    for cell in re.findall(r'<lark-td[^>]*>\s*(.*?)\s*</lark-td>', section, re.DOTALL):
        cell = cell.strip()
        if not cell or cell in skip_labels:
            continue
        if re.fullmatch(r'\d+', cell):  # row/scene number only
            continue
        if cell.startswith(skip_prefixes):
            continue
        # Remove self-closing mentions first, then any remaining tag. The
        # two passes are kept separate because a stray '<' earlier in the
        # cell would otherwise make the generic pattern swallow extra text.
        cell = re.sub(r'<mention[^>]+/>', '', cell)
        cell = re.sub(r'<[^>]+>', '', cell)
        cell = cell.strip()
        if cell:
            content_cells.append(cell)
    return content_cells
# Names that are reported whenever they occur in the text.
_KNOWN_CHARACTERS = (
    'User', 'Ben', 'May', 'Vicky', 'BoBo', 'DiDi', 'Sally',
    'Mr. Stone', 'Bingo', 'Momo', 'Luna', 'Grandpa', 'Grandma',
    'Dad', 'Mom', 'Uncle', 'Aunt', 'Tom', 'Lily', 'Max',
    'Leo', 'Mia', 'Zoe', 'Sam', 'Ella', 'Jack', 'Ruby',
    'Oliver', 'Emma', 'Noah', 'Ava', 'Liam', 'Sophia',
    'Coco', 'Lulu', 'Nana', 'Pipi', 'Toto', 'Kiki',
    'Mr.', 'Ms.', 'Mrs.', 'Dr.', 'Captain', 'Mayor',
)

# Capitalised words that are NOT character names. Built once as a frozenset
# so each membership test is O(1); the entries are copied verbatim from the
# original inline list (duplicates are harmless in a set).
_COMMON_WORDS = frozenset({
    'User', 'What', 'Where', 'This', 'That', 'They', 'Their',
    'First', 'Second', 'Third', 'Sunny', 'Rain', 'Windy',
    'Hide', 'It', 'We', 'She', 'He', 'You', 'Are', 'Is',
    'The', 'And', 'But', 'For', 'Not', 'Now', 'All',
    'Then', 'When', 'Here', 'There', 'Just', 'Like',
    'Very', 'Come', 'Look', 'Find', 'Help', 'Stop',
    'Wait', 'Play', 'Hide', 'Seek', 'Rock', 'Grass',
    'Place', 'Way', 'Day', 'Back', 'One', 'Two',
    'From', 'With', 'Into', 'Over', 'Down', 'Out',
    'Still', 'Again', 'Only', 'Much', 'More', 'Some',
    'Each', 'Both', 'Most', 'Such', 'Even', 'Also',
    'Many', 'Been', 'Will', 'Have', 'Were', 'Does',
    'Going', 'Being', 'Right', 'Left', 'Next', 'Last',
    'Good', 'Great', 'Well', 'Sure', 'Okay', 'Yes',
    'Soon', 'Ever', 'Never', 'Always', 'Maybe',
    'Could', 'Would', 'Should', 'Must', 'Need',
    'Want', 'Know', 'Think', 'Mean', 'Feel', 'See',
    'Make', 'Take', 'Give', 'Tell', 'Call', 'Keep',
    'Move', 'Turn', 'Walk', 'Run', 'Push', 'Pull',
    'Open', 'Close', 'Pick', 'Drop', 'Hold', 'Let',
    'Put', 'Get', 'Try', 'Use', 'Ask', 'Say',
    'Different', 'Strange', 'Wrong', 'Lost', 'Dark',
    'Fast', 'Slow', 'Small', 'Big', 'New', 'Old',
    'Long', 'Short', 'High', 'Low', 'Wide', 'Deep',
    'Hot', 'Cold', 'Wet', 'Dry', 'Hard', 'Soft',
    'Loud', 'Quiet', 'Happy', 'Sad', 'Brave', 'Scared',
    'Tired', 'Hungry', 'Thirsty', 'Sleepy', 'Busy',
    'Ready', 'Sorry', 'Thank', 'Please', 'Hello',
    'Morning', 'Night', 'Today', 'Tomorrow', 'Yesterday',
    'Inside', 'Outside', 'Behind', 'Under', 'Above',
    'Below', 'Between', 'Through', 'Around', 'Along',
    'Across', 'Near', 'Far', 'Away', 'Home',
    'Door', 'Wall', 'Floor', 'Window', 'Light',
    'Water', 'Fire', 'Wind', 'Rain', 'Snow', 'Sun',
    'Tree', 'Flower', 'River', 'Road', 'Bridge',
    'House', 'Room', 'Table', 'Chair', 'Bed',
    'Book', 'Box', 'Bag', 'Hat', 'Shoe',
    'Food', 'Bread', 'Milk', 'Egg', 'Fish',
    'Dog', 'Cat', 'Bird', 'Fish', 'Horse',
    'Color', 'Red', 'Blue', 'Green', 'Yellow',
    'White', 'Black', 'Brown', 'Pink', 'Purple',
    'Number', 'Letter', 'Word', 'Name', 'Game',
    'Time', 'Year', 'Month', 'Week', 'Hour',
    'World', 'Life', 'Hand', 'Eye', 'Head',
    'Friend', 'Family', 'People', 'Child', 'Baby',
    'School', 'Teacher', 'Class', 'Story', 'Song',
    'Party', 'Present', 'Card', 'Cake', 'Ball',
    'Map', 'Key', 'Lock', 'Bell', 'Flag',
    'Star', 'Moon', 'Sky', 'Sea', 'Land',
    'Top', 'Bottom', 'Side', 'Front', 'End',
    'Maze', 'Cave', 'Tower', 'Castle', 'Garden',
    'Desert', 'Town', 'City', 'Village', 'Island',
    'Robot', 'Machine', 'Button', 'Lever', 'Switch',
    'Screen', 'Camera', 'Phone', 'Radio', 'Computer',
    'Dance', 'Music', 'Picture', 'Movie', 'Show',
    'Team', 'Race', 'Match', 'Score', 'Win',
    'Lose', 'Start', 'Finish', 'Begin', 'Stop',
    'Question', 'Answer', 'Idea', 'Plan', 'Secret',
    'Magic', 'Power', 'Dream', 'Wish', 'Hope',
    'Love', 'Like', 'Hate', 'Fear', 'Anger',
    'Fun', 'Cool', 'Nice', 'Fine', 'Real',
    'True', 'False', 'Easy', 'Hard', 'Simple',
    'Safe', 'Danger', 'Trouble', 'Problem', 'Mistake',
    'Change', 'Stay', 'Leave', 'Return', 'Follow',
    'Lead', 'Guide', 'Show', 'Teach', 'Learn',
    'Read', 'Write', 'Draw', 'Paint', 'Build',
    'Fix', 'Break', 'Cut', 'Tie', 'Wash',
    'Clean', 'Dirty', 'Full', 'Empty', 'Heavy',
    'Light', 'Thick', 'Thin', 'Round', 'Flat',
    'Sweet', 'Sour', 'Salty', 'Spicy', 'Fresh',
    'Alive', 'Dead', 'Asleep', 'Awake', 'Alone',
    'Together', 'Apart', 'Same', 'Different', 'Special',
    'Normal', 'Usual', 'Strange', 'Weird', 'Funny',
    'Silly', 'Smart', 'Kind', 'Mean', 'Brave',
    'Shy', 'Loud', 'Quiet', 'Fast', 'Slow',
    'Strong', 'Weak', 'Rich', 'Poor', 'Lucky',
    'Ready', 'Sure', 'Done', 'Gone', 'Back',
    'Wait', 'Hurry', 'Slow', 'Careful', 'Gentle',
    'Suddenly', 'Finally', 'Maybe', 'Perhaps', 'Almost',
    'Really', 'Very', 'Too', 'So', 'How',
    'Why', 'Who', 'Whose', 'Which', 'How Many',
    'How Much', 'How Long', 'How Far', 'How Old',
    'First', 'Second', 'Third', 'Fourth', 'Fifth',
    'Once', 'Twice', 'Thrice', 'Again', 'More',
    'Less', 'Few', 'Many', 'Much', 'Little',
    'All', 'None', 'Some', 'Any', 'Every',
    'Another', 'Other', 'Else', 'Enough', 'Several',
    'Whole', 'Half', 'Part', 'Piece', 'Bit',
    'Lot', 'Kind', 'Sort', 'Type', 'Way',
    'Thing', 'Stuff', 'Place', 'Time', 'Person',
    'Man', 'Woman', 'Boy', 'Girl', 'Kid',
    'Dad', 'Mom', 'Son', 'Daughter', 'Brother',
    'Sister', 'Uncle', 'Aunt', 'Cousin', 'Friend',
    'Neighbor', 'Guest', 'Visitor', 'Stranger', 'Hero',
    'King', 'Queen', 'Prince', 'Princess', 'Knight',
    'Dragon', 'Monster', 'Ghost', 'Witch', 'Wizard',
    'Fairy', 'Giant', 'Dwarf', 'Elf', 'Troll',
    'Pirate', 'Cowboy', 'Astronaut', 'Doctor', 'Nurse',
    'Police', 'Fireman', 'Chef', 'Artist', 'Singer',
    'Dancer', 'Player', 'Runner', 'Swimmer', 'Climber',
    'Jumper', 'Thrower', 'Catcher', 'Kicker', 'Hitter',
    'Walker', 'Talker', 'Listener', 'Reader', 'Writer',
    'Thinker', 'Maker', 'Builder', 'Helper', 'Finder',
    'Keeper', 'Giver', 'Taker', 'Buyer', 'Seller',
    'Eater', 'Drinker', 'Sleeper', 'Dreamer', 'Lover',
    'Hater', 'Fighter', 'Winner', 'Loser', 'Leader',
    'Follower', 'Teacher', 'Student', 'Worker', 'Player',
    'Traveler', 'Explorer', 'Discoverer', 'Inventor', 'Creator',
    'Morning', 'Afternoon', 'Evening', 'Midnight', 'Noon',
    'Breakfast', 'Lunch', 'Dinner', 'Snack', 'Meal',
    'Spring', 'Summer', 'Autumn', 'Winter', 'Season',
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
    'Saturday', 'Sunday', 'Weekend', 'Holiday', 'Birthday',
    'Christmas', 'Easter', 'Halloween', 'Thanksgiving', 'New Year',
    'January', 'February', 'March', 'April', 'May',
    'June', 'July', 'August', 'September', 'October',
    'November', 'December', 'Minute', 'Second', 'Moment',
    'Past', 'Present', 'Future', 'Early', 'Late',
    'Soon', 'Later', 'Before', 'After', 'During',
    'While', 'Since', 'Until', 'Already', 'Yet',
    'North', 'South', 'East', 'West', 'Center',
    'Middle', 'Corner', 'Edge', 'Line', 'Point',
    'Circle', 'Square', 'Triangle', 'Rectangle', 'Oval',
    'Diamond', 'Heart', 'Cross', 'Arrow', 'Dot',
    'Body', 'Arm', 'Leg', 'Foot', 'Toe',
    'Finger', 'Thumb', 'Nose', 'Mouth', 'Ear',
    'Hair', 'Neck', 'Back', 'Belly', 'Knee',
    'Shoulder', 'Elbow', 'Wrist', 'Ankle', 'Hip',
    'Skin', 'Bone', 'Blood', 'Heart', 'Brain',
    'Lung', 'Stomach', 'Muscle', 'Nerve', 'Cell',
    'Plant', 'Animal', 'Insect', 'Spider', 'Snake',
    'Lizard', 'Frog', 'Turtle', 'Fish', 'Whale',
    'Shark', 'Dolphin', 'Seal', 'Penguin', 'Eagle',
    'Owl', 'Parrot', 'Duck', 'Goose', 'Chicken',
    'Turkey', 'Pig', 'Cow', 'Sheep', 'Goat',
    'Deer', 'Bear', 'Wolf', 'Fox', 'Rabbit',
    'Squirrel', 'Mouse', 'Rat', 'Bat', 'Monkey',
    'Ape', 'Gorilla', 'Elephant', 'Lion', 'Tiger',
    'Zebra', 'Giraffe', 'Kangaroo', 'Koala', 'Panda',
    'Camel', 'Donkey', 'Mule', 'Ox', 'Buffalo',
    'Crocodile', 'Alligator', 'Dinosaur', 'Fossil', 'Egg',
    'Nest', 'Cave', 'Hole', 'Burrow', 'Den',
    'Forest', 'Jungle', 'Desert', 'Mountain', 'Valley',
    'Hill', 'Cliff', 'Beach', 'Ocean', 'Lake',
    'Pond', 'Stream', 'Waterfall', 'Volcano', 'Earthquake',
    'Storm', 'Thunder', 'Lightning', 'Rainbow', 'Cloud',
    'Fog', 'Mist', 'Dew', 'Frost', 'Ice',
    'Snowman', 'Snowball', 'Sled', 'Skate', 'Ski',
    'Surf', 'Swim', 'Dive', 'Float', 'Sink',
    'Rise', 'Fall', 'Grow', 'Shrink', 'Change',
    'Appear', 'Disappear', 'Vanish', 'Return', 'Arrive',
    'Enter', 'Exit', 'Escape', 'Rescue', 'Save',
    'Protect', 'Guard', 'Defend', 'Attack', 'Fight',
    'Battle', 'War', 'Peace', 'Win', 'Lose',
    'Victory', 'Defeat', 'Success', 'Failure', 'Try',
    'Attempt', 'Effort', 'Work', 'Rest', 'Play',
    'Fun', 'Joy', 'Sadness', 'Anger', 'Fear',
    'Surprise', 'Shock', 'Wonder', 'Curiosity', 'Interest',
    'Boredom', 'Excitement', 'Calm', 'Peace', 'Quiet',
    'Noise', 'Sound', 'Voice', 'Whisper', 'Shout',
    'Scream', 'Laugh', 'Cry', 'Smile', 'Frown',
    'Wink', 'Blink', 'Stare', 'Glance', 'Look',
    'Watch', 'Observe', 'Notice', 'Spot', 'Discover',
    'Find', 'Search', 'Seek', 'Hunt', 'Chase',
    'Catch', 'Grab', 'Hold', 'Release', 'Drop',
    'Throw', 'Catch', 'Bounce', 'Roll', 'Slide',
    'Spin', 'Turn', 'Twist', 'Bend', 'Stretch',
    'Reach', 'Touch', 'Feel', 'Taste', 'Smell',
    'Hear', 'Listen', 'Speak', 'Talk', 'Say',
    'Tell', 'Ask', 'Answer', 'Reply', 'Respond',
    'Explain', 'Describe', 'Report', 'Announce', 'Declare',
    'Promise', 'Agree', 'Disagree', 'Argue', 'Discuss',
    'Chat', 'Conversation', 'Talk', 'Speech', 'Story',
    'Tale', 'Legend', 'Myth', 'Fable', 'Joke',
    'Riddle', 'Puzzle', 'Mystery', 'Secret', 'Clue',
    'Hint', 'Tip', 'Advice', 'Suggestion', 'Recommendation',
    'Opinion', 'Belief', 'Fact', 'Truth', 'Lie',
    'Honest', 'Dishonest', 'Fair', 'Unfair', 'Right',
    'Wrong', 'Correct', 'Incorrect', 'Accurate', 'Inaccurate',
    'Possible', 'Impossible', 'Probable', 'Improbable', 'Certain',
    'Uncertain', 'Sure', 'Unsure', 'Confident', 'Doubtful',
    'Clear', 'Unclear', 'Obvious', 'Hidden', 'Visible',
    'Invisible', 'Present', 'Absent', 'Missing', 'Found',
    'Lost', 'Stolen', 'Broken', 'Fixed', 'Repaired',
    'Damaged', 'Destroyed', 'Ruined', 'Perfect', 'Flawless',
    'Beautiful', 'Ugly', 'Pretty', 'Handsome', 'Cute',
    'Attractive', 'Unattractive', 'Lovely', 'Horrible', 'Terrible',
    'Wonderful', 'Amazing', 'Fantastic', 'Excellent', 'Great',
    'Good', 'Bad', 'Okay', 'Alright', 'Fine',
    'Well', 'Poor', 'Rich', 'Wealthy', 'Broke',
    'Expensive', 'Cheap', 'Free', 'Costly', 'Priceless',
    'Valuable', 'Worthless', 'Useful', 'Useless', 'Helpful',
    'Harmful', 'Dangerous', 'Safe', 'Secure', 'Risky',
    'Brave', 'Cowardly', 'Bold', 'Timid', 'Shy',
    'Confident', 'Nervous', 'Anxious', 'Worried', 'Relaxed',
    'Calm', 'Angry', 'Furious', 'Annoyed', 'Irritated',
    'Frustrated', 'Disappointed', 'Sad', 'Unhappy', 'Miserable',
    'Depressed', 'Lonely', 'Alone', 'Together', 'United',
    'Divided', 'Separated', 'Connected', 'Attached', 'Detached',
    'Linked', 'Related', 'Unrelated', 'Similar', 'Different',
    'Unique', 'Common', 'Rare', 'Special', 'Ordinary',
    'Normal', 'Abnormal', 'Typical', 'Atypical', 'Usual',
    'Unusual', 'Regular', 'Irregular', 'Frequent', 'Infrequent',
    'Often', 'Seldom', 'Always', 'Never', 'Sometimes',
    'Usually', 'Rarely', 'Occasionally', 'Constantly', 'Continuously',
    'Briefly', 'Shortly', 'Quickly', 'Slowly', 'Rapidly',
    'Gradually', 'Suddenly', 'Immediately', 'Instantly', 'Eventually',
    'Finally', 'Initially', 'Originally', 'Previously', 'Recently',
    'Currently', 'Presently', 'Simultaneously', 'Meanwhile', 'Afterward',
    'Beforehand', 'Afterward', 'Thereafter', 'Henceforth', 'Hereafter',
    'Therefore', 'However', 'Moreover', 'Furthermore', 'Nevertheless',
    'Nonetheless', 'Otherwise', 'Instead', 'Rather', 'Besides',
    'Additionally', 'Alternatively', 'Consequently', 'Accordingly', 'Thus',
    'Hence', 'So', 'Because', 'Since', 'Although',
    'Though', 'Unless', 'Until', 'While', 'Whereas',
    'Whether', 'If', 'When', 'Where', 'How',
    'What', 'Which', 'Who', 'Whom', 'Whose',
    'That', 'This', 'These', 'Those', 'Such',
    'Either', 'Neither', 'Nor', 'Or', 'And',
    'But', 'Yet', 'For', 'With', 'Without',
    'Within', 'Without', 'Throughout', 'Through', 'Across',
    'Along', 'Around', 'About', 'Above', 'Below',
    'Beneath', 'Underneath', 'Inside', 'Outside', 'Between',
    'Among', 'Amid', 'Against', 'Toward', 'Towards',
    'Into', 'Onto', 'Upon', 'Off', 'Away',
    'Back', 'Forth', 'Forward', 'Backward', 'Upward',
    'Downward', 'Inward', 'Outward', 'Sideways', 'Lengthwise',
    'Crosswise', 'Clockwise', 'Counterclockwise', 'Northward', 'Southward',
    'Eastward', 'Westward', 'Homeward', 'Heavenward', 'Earthward',
    'Seaward', 'Landward', 'Windward', 'Leeward', 'Shoreward',
})


def extract_characters(md):
    """Guess the character names mentioned in a lesson's markdown.

    Two passes are made over the text with all tags replaced by spaces:

    1. every entry of ``_KNOWN_CHARACTERS`` that occurs between word
       boundaries is reported as-is;
    2. every capitalised word of three or more letters (optionally followed
       by a second such word) is treated as a candidate name. Words listed
       in ``_COMMON_WORDS`` are removed from the candidate and whatever
       remains is reported, so ``"Hello Ben"`` yields ``"Ben"`` and
       ``"The Maze"`` yields nothing.

    The original code had the stop-list test inverted (it reported only the
    common words and discarded everything else); this version applies the
    filter in the direction its ``# skip common words`` comment describes.

    Args:
        md: Markdown text of one lesson document.

    Returns:
        list[str]: The distinct names found, sorted with ``sorted()``.
    """
    # Replace tags with a space so words on either side do not get glued
    # together.
    text = re.sub(r'<[^>]+>', ' ', md)
    found = set()

    # NOTE(review): the trailing \b cannot match between '.' and a following
    # space, so entries ending in '.' ('Mr.', 'Ms.', 'Mrs.', 'Dr.') only
    # match when immediately followed by a word character. Behaviour is left
    # unchanged here - confirm whether bare titles should be reported.
    for char in _KNOWN_CHARACTERS:
        if re.search(r'\b' + re.escape(char) + r'\b', text):
            found.add(char)

    # Candidate names: one or two consecutive capitalised words.
    caps = re.findall(r'\b([A-Z][a-z]{2,}(?:\s+[A-Z][a-z]{2,})?)\b', text)
    for candidate in caps:
        # Filter per word so that a two-word match made of a common word
        # plus a name (or of two common words) cannot bypass the stop list.
        name_words = [w for w in candidate.split() if w not in _COMMON_WORDS]
        if name_words:
            found.add(' '.join(name_words))
    return sorted(found)
def main(base=None):
    """Print a plain-text summary of every lesson dump under ``base``.

    ``base`` is expected to contain one sub-directory per unit, each holding
    ``*.md`` files produced by ``lark-cli docs +fetch``. For every such file
    the title, unit, lesson name, sheet tokens, example lines, detected
    characters and story cells (truncated to 200 characters each) are
    printed to stdout. Files whose payload cannot be extracted are reported
    on an ``ERROR:`` line and skipped.

    Args:
        base: Directory to scan. Defaults to
            ``~/.openclaw/workspace-xiaobian/tmp/lessons/S3``.

    Raises:
        SystemExit: If ``base`` is not an existing directory.
    """
    if base is None:
        base = os.path.expanduser('~/.openclaw/workspace-xiaobian/tmp/lessons/S3')
    if not os.path.isdir(base):
        # Exit with a readable message (status 1) instead of an uncaught
        # FileNotFoundError traceback from os.listdir.
        sys.exit(f"ERROR: lessons directory not found: {base}")

    suffix = '.md'
    for unit in sorted(os.listdir(base)):
        unit_dir = os.path.join(base, unit)
        if not os.path.isdir(unit_dir):
            continue
        for lesson_file in sorted(os.listdir(unit_dir)):
            if not lesson_file.endswith(suffix):
                continue
            filepath = os.path.join(unit_dir, lesson_file)
            md, title = extract_markdown(filepath)
            if md is None:
                # On failure the second element carries the error message
                # (or None), not a title.
                print(f"ERROR: {unit}/{lesson_file}: {title}")
                continue
            sheet_tokens, examples = extract_knowledge_points(md)
            story_cells = extract_story_summary(md)
            characters = extract_characters(md)
            # Strip only the trailing extension; str.replace would also
            # remove a '.md' occurring in the middle of the file name.
            lesson_name = lesson_file[:-len(suffix)]
            print(f"\n{'='*60}")
            print(f"TITLE: {title}")
            print(f"UNIT: {unit}")
            print(f"LESSON: {lesson_name}")
            print(f"SHEET_TOKENS: {sheet_tokens}")
            print(f"EXAMPLES: {examples}")
            print(f"CHARACTERS: {', '.join(characters)}")
            print("STORY:")
            for i, cell in enumerate(story_cells):
                print(f" [{i}] {cell[:200]}")


if __name__ == '__main__':
    main()