#!/usr/bin/env python3
"""Extract lesson info from lark-cli docs +fetch JSON output."""
import glob
import json
import os
import re
import sys

# Names that are always reported as characters when they occur in the text.
_KNOWN_CHARACTERS = (
    'User', 'Ben', 'May', 'Vicky', 'BoBo', 'DiDi', 'Sally', 'Mr. Stone',
    'Bingo', 'Momo', 'Luna', 'Grandpa', 'Grandma', 'Dad', 'Mom', 'Uncle',
    'Aunt', 'Tom', 'Lily', 'Max', 'Leo', 'Mia', 'Zoe', 'Sam', 'Ella', 'Jack',
    'Ruby', 'Oliver', 'Emma', 'Noah', 'Ava', 'Liam', 'Sophia', 'Coco', 'Lulu',
    'Nana', 'Pipi', 'Toto', 'Kiki', 'Mr.', 'Ms.', 'Mrs.', 'Dr.', 'Captain',
    'Mayor',
)

# Capitalised words that are NOT treated as character names.  Kept as a
# module-level frozenset so each lookup is O(1) and the collection is built
# once instead of once per candidate word.  Duplicates are harmless in a set.
_COMMON_WORDS = frozenset((
    'User', 'What', 'Where', 'This', 'That', 'They', 'Their', 'First',
    'Second', 'Third', 'Sunny', 'Rain', 'Windy', 'Hide', 'It', 'We', 'She',
    'He', 'You', 'Are', 'Is', 'The', 'And', 'But', 'For', 'Not', 'Now', 'All',
    'Then', 'When', 'Here', 'There', 'Just', 'Like', 'Very', 'Come', 'Look',
    'Find', 'Help', 'Stop', 'Wait', 'Play', 'Hide', 'Seek', 'Rock', 'Grass',
    'Place', 'Way', 'Day', 'Back', 'One', 'Two', 'From', 'With', 'Into',
    'Over', 'Down', 'Out', 'Still', 'Again', 'Only', 'Much', 'More', 'Some',
    'Each', 'Both', 'Most', 'Such', 'Even', 'Also', 'Many', 'Been', 'Will',
    'Have', 'Were', 'Does', 'Going', 'Being', 'Right', 'Left', 'Next', 'Last',
    'Good', 'Great', 'Well', 'Sure', 'Okay', 'Yes', 'Soon', 'Ever', 'Never',
    'Always', 'Maybe', 'Could', 'Would', 'Should', 'Must', 'Need', 'Want',
    'Know', 'Think', 'Mean', 'Feel', 'See', 'Make', 'Take', 'Give', 'Tell',
    'Call', 'Keep', 'Move', 'Turn', 'Walk', 'Run', 'Push', 'Pull', 'Open',
    'Close', 'Pick', 'Drop', 'Hold', 'Let', 'Put', 'Get', 'Try', 'Use', 'Ask',
    'Say', 'Different', 'Strange', 'Wrong', 'Lost', 'Dark', 'Fast', 'Slow',
    'Small', 'Big', 'New', 'Old', 'Long', 'Short', 'High', 'Low', 'Wide',
    'Deep', 'Hot', 'Cold', 'Wet', 'Dry', 'Hard', 'Soft', 'Loud', 'Quiet',
    'Happy', 'Sad', 'Brave', 'Scared', 'Tired', 'Hungry', 'Thirsty', 'Sleepy',
    'Busy', 'Ready', 'Sorry', 'Thank',
    'Please', 'Hello', 'Morning', 'Night', 'Today', 'Tomorrow', 'Yesterday',
    'Inside', 'Outside', 'Behind', 'Under', 'Above', 'Below', 'Between',
    'Through', 'Around', 'Along', 'Across', 'Near', 'Far', 'Away', 'Home',
    'Door', 'Wall', 'Floor', 'Window', 'Light', 'Water', 'Fire', 'Wind',
    'Rain', 'Snow', 'Sun', 'Tree', 'Flower', 'River', 'Road', 'Bridge',
    'House', 'Room', 'Table', 'Chair', 'Bed', 'Book', 'Box', 'Bag', 'Hat',
    'Shoe', 'Food', 'Bread', 'Milk', 'Egg', 'Fish', 'Dog', 'Cat', 'Bird',
    'Fish', 'Horse', 'Color', 'Red', 'Blue', 'Green', 'Yellow', 'White',
    'Black', 'Brown', 'Pink', 'Purple', 'Number', 'Letter', 'Word', 'Name',
    'Game', 'Time', 'Year', 'Month', 'Week', 'Hour', 'World', 'Life', 'Hand',
    'Eye', 'Head', 'Friend', 'Family', 'People', 'Child', 'Baby', 'School',
    'Teacher', 'Class', 'Story', 'Song', 'Party', 'Present', 'Card', 'Cake',
    'Ball', 'Map', 'Key', 'Lock', 'Bell', 'Flag', 'Star', 'Moon', 'Sky',
    'Sea', 'Land', 'Top', 'Bottom', 'Side', 'Front', 'End', 'Maze', 'Cave',
    'Tower', 'Castle', 'Garden', 'Desert', 'Town', 'City', 'Village',
    'Island', 'Robot', 'Machine', 'Button', 'Lever', 'Switch', 'Screen',
    'Camera', 'Phone', 'Radio', 'Computer', 'Dance', 'Music', 'Picture',
    'Movie', 'Show', 'Team', 'Race', 'Match', 'Score', 'Win', 'Lose', 'Start',
    'Finish', 'Begin', 'Stop', 'Question', 'Answer', 'Idea', 'Plan', 'Secret',
    'Magic', 'Power', 'Dream', 'Wish', 'Hope', 'Love', 'Like', 'Hate', 'Fear',
    'Anger', 'Fun', 'Cool', 'Nice', 'Fine', 'Real', 'True', 'False', 'Easy',
    'Hard', 'Simple', 'Safe', 'Danger', 'Trouble', 'Problem', 'Mistake',
    'Change', 'Stay', 'Leave', 'Return', 'Follow', 'Lead', 'Guide', 'Show',
    'Teach', 'Learn', 'Read', 'Write', 'Draw', 'Paint', 'Build', 'Fix',
    'Break', 'Cut', 'Tie', 'Wash', 'Clean', 'Dirty', 'Full', 'Empty', 'Heavy',
    'Light', 'Thick', 'Thin', 'Round', 'Flat', 'Sweet', 'Sour', 'Salty',
    'Spicy', 'Fresh', 'Alive', 'Dead', 'Asleep', 'Awake', 'Alone', 'Together',
    'Apart', 'Same', 'Different', 'Special', 'Normal', 'Usual', 'Strange',
    'Weird', 'Funny', 'Silly', 'Smart', 'Kind', 'Mean', 'Brave', 'Shy',
    'Loud', 'Quiet', 'Fast', 'Slow', 'Strong', 'Weak', 'Rich', 'Poor',
    'Lucky', 'Ready', 'Sure', 'Done', 'Gone', 'Back', 'Wait', 'Hurry', 'Slow',
    'Careful', 'Gentle', 'Suddenly', 'Finally', 'Maybe', 'Perhaps', 'Almost',
    'Really', 'Very', 'Too', 'So', 'How', 'Why', 'Who', 'Whose', 'Which',
    'How Many', 'How Much', 'How Long', 'How Far', 'How Old', 'First',
    'Second', 'Third', 'Fourth', 'Fifth', 'Once', 'Twice', 'Thrice', 'Again',
    'More', 'Less', 'Few', 'Many', 'Much', 'Little', 'All', 'None', 'Some',
    'Any', 'Every', 'Another', 'Other', 'Else', 'Enough', 'Several', 'Whole',
    'Half', 'Part', 'Piece', 'Bit', 'Lot', 'Kind', 'Sort', 'Type', 'Way',
    'Thing', 'Stuff', 'Place', 'Time', 'Person', 'Man', 'Woman', 'Boy',
    'Girl', 'Kid', 'Dad', 'Mom', 'Son', 'Daughter', 'Brother', 'Sister',
    'Uncle', 'Aunt', 'Cousin', 'Friend', 'Neighbor', 'Guest', 'Visitor',
    'Stranger', 'Hero', 'King', 'Queen', 'Prince', 'Princess', 'Knight',
    'Dragon', 'Monster', 'Ghost', 'Witch', 'Wizard', 'Fairy', 'Giant',
    'Dwarf', 'Elf', 'Troll', 'Pirate', 'Cowboy', 'Astronaut', 'Doctor',
    'Nurse', 'Police', 'Fireman', 'Chef', 'Artist', 'Singer', 'Dancer',
    'Player', 'Runner', 'Swimmer', 'Climber', 'Jumper', 'Thrower', 'Catcher',
    'Kicker', 'Hitter', 'Walker', 'Talker', 'Listener', 'Reader', 'Writer',
    'Thinker', 'Maker', 'Builder', 'Helper', 'Finder', 'Keeper', 'Giver',
    'Taker', 'Buyer', 'Seller', 'Eater', 'Drinker', 'Sleeper', 'Dreamer',
    'Lover', 'Hater', 'Fighter', 'Winner', 'Loser', 'Leader', 'Follower',
    'Teacher', 'Student', 'Worker', 'Player', 'Traveler', 'Explorer',
    'Discoverer', 'Inventor', 'Creator', 'Morning', 'Afternoon', 'Evening',
    'Midnight', 'Noon', 'Breakfast', 'Lunch', 'Dinner', 'Snack', 'Meal',
    'Spring', 'Summer', 'Autumn', 'Winter', 'Season', 'Monday', 'Tuesday',
    'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'Weekend',
    'Holiday', 'Birthday', 'Christmas', 'Easter', 'Halloween',
    'Thanksgiving', 'New Year', 'January',
    'February', 'March', 'April', 'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December', 'Minute', 'Second',
    'Moment', 'Past', 'Present', 'Future', 'Early', 'Late', 'Soon', 'Later',
    'Before', 'After', 'During', 'While', 'Since', 'Until', 'Already', 'Yet',
    'North', 'South', 'East', 'West', 'Center', 'Middle', 'Corner', 'Edge',
    'Line', 'Point', 'Circle', 'Square', 'Triangle', 'Rectangle', 'Oval',
    'Diamond', 'Heart', 'Cross', 'Arrow', 'Dot', 'Body', 'Arm', 'Leg', 'Foot',
    'Toe', 'Finger', 'Thumb', 'Nose', 'Mouth', 'Ear', 'Hair', 'Neck', 'Back',
    'Belly', 'Knee', 'Shoulder', 'Elbow', 'Wrist', 'Ankle', 'Hip', 'Skin',
    'Bone', 'Blood', 'Heart', 'Brain', 'Lung', 'Stomach', 'Muscle', 'Nerve',
    'Cell', 'Plant', 'Animal', 'Insect', 'Spider', 'Snake', 'Lizard', 'Frog',
    'Turtle', 'Fish', 'Whale', 'Shark', 'Dolphin', 'Seal', 'Penguin', 'Eagle',
    'Owl', 'Parrot', 'Duck', 'Goose', 'Chicken', 'Turkey', 'Pig', 'Cow',
    'Sheep', 'Goat', 'Deer', 'Bear', 'Wolf', 'Fox', 'Rabbit', 'Squirrel',
    'Mouse', 'Rat', 'Bat', 'Monkey', 'Ape', 'Gorilla', 'Elephant', 'Lion',
    'Tiger', 'Zebra', 'Giraffe', 'Kangaroo', 'Koala', 'Panda', 'Camel',
    'Donkey', 'Mule', 'Ox', 'Buffalo', 'Crocodile', 'Alligator', 'Dinosaur',
    'Fossil', 'Egg', 'Nest', 'Cave', 'Hole', 'Burrow', 'Den', 'Forest',
    'Jungle', 'Desert', 'Mountain', 'Valley', 'Hill', 'Cliff', 'Beach',
    'Ocean', 'Lake', 'Pond', 'Stream', 'Waterfall', 'Volcano', 'Earthquake',
    'Storm', 'Thunder', 'Lightning', 'Rainbow', 'Cloud', 'Fog', 'Mist', 'Dew',
    'Frost', 'Ice', 'Snowman', 'Snowball', 'Sled', 'Skate', 'Ski', 'Surf',
    'Swim', 'Dive', 'Float', 'Sink', 'Rise', 'Fall', 'Grow', 'Shrink',
    'Change', 'Appear', 'Disappear', 'Vanish', 'Return', 'Arrive', 'Enter',
    'Exit', 'Escape', 'Rescue', 'Save', 'Protect', 'Guard', 'Defend',
    'Attack', 'Fight', 'Battle', 'War', 'Peace', 'Win', 'Lose', 'Victory',
    'Defeat', 'Success', 'Failure', 'Try', 'Attempt', 'Effort', 'Work',
    'Rest', 'Play', 'Fun', 'Joy', 'Sadness', 'Anger', 'Fear', 'Surprise',
    'Shock',
    'Wonder', 'Curiosity', 'Interest', 'Boredom', 'Excitement', 'Calm',
    'Peace', 'Quiet', 'Noise', 'Sound', 'Voice', 'Whisper', 'Shout', 'Scream',
    'Laugh', 'Cry', 'Smile', 'Frown', 'Wink', 'Blink', 'Stare', 'Glance',
    'Look', 'Watch', 'Observe', 'Notice', 'Spot', 'Discover', 'Find',
    'Search', 'Seek', 'Hunt', 'Chase', 'Catch', 'Grab', 'Hold', 'Release',
    'Drop', 'Throw', 'Catch', 'Bounce', 'Roll', 'Slide', 'Spin', 'Turn',
    'Twist', 'Bend', 'Stretch', 'Reach', 'Touch', 'Feel', 'Taste', 'Smell',
    'Hear', 'Listen', 'Speak', 'Talk', 'Say', 'Tell', 'Ask', 'Answer',
    'Reply', 'Respond', 'Explain', 'Describe', 'Report', 'Announce',
    'Declare', 'Promise', 'Agree', 'Disagree', 'Argue', 'Discuss', 'Chat',
    'Conversation', 'Talk', 'Speech', 'Story', 'Tale', 'Legend', 'Myth',
    'Fable', 'Joke', 'Riddle', 'Puzzle', 'Mystery', 'Secret', 'Clue', 'Hint',
    'Tip', 'Advice', 'Suggestion', 'Recommendation', 'Opinion', 'Belief',
    'Fact', 'Truth', 'Lie', 'Honest', 'Dishonest', 'Fair', 'Unfair', 'Right',
    'Wrong', 'Correct', 'Incorrect', 'Accurate', 'Inaccurate', 'Possible',
    'Impossible', 'Probable', 'Improbable', 'Certain', 'Uncertain', 'Sure',
    'Unsure', 'Confident', 'Doubtful', 'Clear', 'Unclear', 'Obvious',
    'Hidden', 'Visible', 'Invisible', 'Present', 'Absent', 'Missing', 'Found',
    'Lost', 'Stolen', 'Broken', 'Fixed', 'Repaired', 'Damaged', 'Destroyed',
    'Ruined', 'Perfect', 'Flawless', 'Beautiful', 'Ugly', 'Pretty',
    'Handsome', 'Cute', 'Attractive', 'Unattractive', 'Lovely', 'Horrible',
    'Terrible', 'Wonderful', 'Amazing', 'Fantastic', 'Excellent', 'Great',
    'Good', 'Bad', 'Okay', 'Alright', 'Fine', 'Well', 'Poor', 'Rich',
    'Wealthy', 'Broke', 'Expensive', 'Cheap', 'Free', 'Costly', 'Priceless',
    'Valuable', 'Worthless', 'Useful', 'Useless', 'Helpful', 'Harmful',
    'Dangerous', 'Safe', 'Secure', 'Risky', 'Brave', 'Cowardly', 'Bold',
    'Timid', 'Shy', 'Confident', 'Nervous', 'Anxious', 'Worried', 'Relaxed',
    'Calm', 'Angry', 'Furious', 'Annoyed', 'Irritated', 'Frustrated',
    'Disappointed', 'Sad', 'Unhappy',
    'Miserable', 'Depressed', 'Lonely', 'Alone', 'Together', 'United',
    'Divided', 'Separated', 'Connected', 'Attached', 'Detached', 'Linked',
    'Related', 'Unrelated', 'Similar', 'Different', 'Unique', 'Common',
    'Rare', 'Special', 'Ordinary', 'Normal', 'Abnormal', 'Typical',
    'Atypical', 'Usual', 'Unusual', 'Regular', 'Irregular', 'Frequent',
    'Infrequent', 'Often', 'Seldom', 'Always', 'Never', 'Sometimes',
    'Usually', 'Rarely', 'Occasionally', 'Constantly', 'Continuously',
    'Briefly', 'Shortly', 'Quickly', 'Slowly', 'Rapidly', 'Gradually',
    'Suddenly', 'Immediately', 'Instantly', 'Eventually', 'Finally',
    'Initially', 'Originally', 'Previously', 'Recently', 'Currently',
    'Presently', 'Simultaneously', 'Meanwhile', 'Afterward', 'Beforehand',
    'Afterward', 'Thereafter', 'Henceforth', 'Hereafter', 'Therefore',
    'However', 'Moreover', 'Furthermore', 'Nevertheless', 'Nonetheless',
    'Otherwise', 'Instead', 'Rather', 'Besides', 'Additionally',
    'Alternatively', 'Consequently', 'Accordingly', 'Thus', 'Hence', 'So',
    'Because', 'Since', 'Although', 'Though', 'Unless', 'Until', 'While',
    'Whereas', 'Whether', 'If', 'When', 'Where', 'How', 'What', 'Which',
    'Who', 'Whom', 'Whose', 'That', 'This', 'These', 'Those', 'Such',
    'Either', 'Neither', 'Nor', 'Or', 'And', 'But', 'Yet', 'For', 'With',
    'Without', 'Within', 'Without', 'Throughout', 'Through', 'Across',
    'Along', 'Around', 'About', 'Above', 'Below', 'Beneath', 'Underneath',
    'Inside', 'Outside', 'Between', 'Among', 'Amid', 'Against', 'Toward',
    'Towards', 'Into', 'Onto', 'Upon', 'Off', 'Away', 'Back', 'Forth',
    'Forward', 'Backward', 'Upward', 'Downward', 'Inward', 'Outward',
    'Sideways', 'Lengthwise', 'Crosswise', 'Clockwise', 'Counterclockwise',
    'Northward', 'Southward', 'Eastward', 'Westward', 'Homeward',
    'Heavenward', 'Earthward', 'Seaward', 'Landward', 'Windward', 'Leeward',
    'Shoreward',
))

# Any single markup tag, e.g. ``<b>`` or ``<image token="..."/>``.
_TAG_RE = re.compile(r'<[^>]+>')

# One capitalised word of 3+ letters, optionally followed by a second one.
_CAPITALIZED_RE = re.compile(r'\b([A-Z][a-z]{2,}(?:\s+[A-Z][a-z]{2,})?)\b')

# NOTE(review): the opening/closing tag names of this pattern were lost from
# the source; only ``]*>\s*(.*?)\s*`` survived.  Both ``<td>`` and
# ``<lark-td>`` are accepted here -- confirm against real lark-cli output.
_CELL_RE = re.compile(r'<(lark-td|td)\b[^>]*>\s*(.*?)\s*</\1>', re.DOTALL)

# NOTE(review): the sheet-token pattern was lost from the source; this assumes
# embedded sheets appear as ``<sheet token="..."/>`` -- confirm.
_SHEET_TOKEN_RE = re.compile(r'<sheet\b[^>]*?\btoken="([^"]+)"')

# Table cells that are layout labels rather than story content.
_SKIPPED_CELLS = ('开场', '结尾', 'AI动画', '(AI动画)')


def extract_markdown(filepath):
    """Load the markdown body and title from a saved lark-cli JSON response.

    Any text before the first ``{`` is ignored; everything from that brace
    to the end of the file is parsed as a single JSON document.

    Args:
        filepath: Path of the file holding the command output.

    Returns:
        ``(markdown, title)`` when the payload's ``ok`` flag is truthy (both
        default to ``''`` when absent), ``(None, message)`` when ``ok`` is
        falsy, and ``(None, None)`` when no JSON object can be parsed.
    """
    # The payload contains Chinese text, so do not rely on the locale default.
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    start = content.find('{')
    if start == -1:
        return None, None
    try:
        data = json.loads(content[start:])
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None, None

    if not data.get('ok'):
        error = data.get('error')
        if isinstance(error, dict):
            return None, error.get('message', 'unknown error')
        # Tolerate ``"error": null`` or a plain string instead of crashing.
        return None, str(error) if error else 'unknown error'

    payload = data.get('data')
    if not isinstance(payload, dict):
        payload = {}
    return payload.get('markdown', ''), payload.get('title', '')


def extract_knowledge_points(md):
    """Extract knowledge points from markdown.

    Only the text between a ``# 知识点`` heading and the next ``# `` heading
    (or the end of the document) is inspected.

    Args:
        md: Markdown text of one lesson document.

    Returns:
        ``(sheet_tokens, examples)``; both lists are empty when the section
        is missing.
    """
    kp_section = re.search(r'# 知识点\s*\n(.*?)(?=\n# |\Z)', md, re.DOTALL)
    if not kp_section:
        return [], []
    section = kp_section.group(1)

    sheet_tokens = _SHEET_TOKEN_RE.findall(section)

    # NOTE(review): the original example-extraction logic was lost from the
    # source.  As a best-effort stand-in, every non-blank line of the section
    # (with markup tags removed) is reported -- replace once the intended
    # rule is known.
    examples = []
    for line in section.splitlines():
        text = _TAG_RE.sub('', line).strip()
        if text:
            examples.append(text)
    return sheet_tokens, examples


def extract_story_summary(md):
    """Collect the textual table cells of a lesson document.

    Empty cells, layout labels, bare numbers and cells that contain nothing
    but markup (e.g. image references) are dropped; markup tags are removed
    from the remaining cells.

    NOTE(review): the head of this function was lost from the source.  It is
    not known whether the original restricted the search to one section of
    the document; the whole document is scanned here -- confirm.

    Args:
        md: Markdown text of one lesson document.

    Returns:
        List of cleaned cell texts in document order.
    """
    content_cells = []
    for match in _CELL_RE.finditer(md):
        cell = match.group(2).strip()
        if not cell or cell in _SKIPPED_CELLS:
            continue
        if re.match(r'^\d+$', cell):  # just a number
            continue
        # Removing every tag also empties image-only cells, which are then
        # skipped by the check below.
        cell = _TAG_RE.sub('', cell).strip()
        if cell:
            content_cells.append(cell)
    return content_cells


def extract_characters(md):
    """Extract character names from the story.

    A name is reported when it is one of the known character names, or when
    it is a capitalised word (or pair of capitalised words) that is not in
    the common-word list.  This is a heuristic: a pair such as ``"The Cat"``
    is looked up as a whole and is therefore reported.

    Args:
        md: Markdown text of one lesson document.

    Returns:
        Sorted list of unique names.
    """
    # Replace tags with a space so words on either side do not fuse together.
    text = _TAG_RE.sub(' ', md)

    found = {
        name for name in _KNOWN_CHARACTERS
        if re.search(r'\b' + re.escape(name) + r'\b', text)
    }
    found.update(
        candidate for candidate in _CAPITALIZED_RE.findall(text)
        if candidate not in _COMMON_WORDS
    )
    return sorted(found)


def main():
    """Print a summary of every ``.md`` lesson file below the S3 directory."""
    base = os.path.expanduser('~/.openclaw/workspace-xiaobian/tmp/lessons/S3')
    if not os.path.isdir(base):
        # Exit with a readable message instead of a FileNotFoundError trace.
        sys.exit(f"ERROR: lessons directory not found: {base}")

    for unit in sorted(os.listdir(base)):
        unit_dir = os.path.join(base, unit)
        if not os.path.isdir(unit_dir):
            continue
        for lesson_file in sorted(os.listdir(unit_dir)):
            if not lesson_file.endswith('.md'):
                continue
            filepath = os.path.join(unit_dir, lesson_file)
            md, title = extract_markdown(filepath)
            if md is None:
                # On failure the second element carries the error message.
                print(f"ERROR: {unit}/{lesson_file}: {title}")
                continue

            sheet_tokens, examples = extract_knowledge_points(md)
            story_cells = extract_story_summary(md)
            characters = extract_characters(md)

            print(f"\n{'='*60}")
            print(f"TITLE: {title}")
            print(f"UNIT: {unit}")
            # Strip only the trailing extension, not every '.md' in the name.
            print(f"LESSON: {lesson_file[:-len('.md')]}")
            print(f"SHEET_TOKENS: {sheet_tokens}")
            print(f"EXAMPLES: {examples}")
            print(f"CHARACTERS: {', '.join(characters)}")
            print("STORY:")
            for i, cell in enumerate(story_cells):
                print(f"  [{i}] {cell[:200]}")


if __name__ == '__main__':
    main()