Capstone: The Autonomous Humanoid

Project Overview

In this capstone project, you will build a complete autonomous humanoid robot system that:

Receives a voice command from a human
Transcribes it using Whisper (local, free)
Plans a sequence of actions using an LLM (Groq, free)
Navigates to the target location using Nav2
Identifies the target object using computer vision
Manipulates (picks up) the object

System Architecture

┌─────────────────────────────────────────────────────────┐
│                    Capstone System                       │
│                                                          │
│  [Microphone] → [Whisper STT] → [LLM Planner]           │
│                                        │                 │
│  [RealSense Camera] → [YOLO Detection] │                 │
│                           │            │                 │
│                           └────────────┘                 │
│                                │                         │
│                         [Task Executor]                  │
│                                │                         │
│              ┌─────────────────┼───────────────┐         │
│              ▼                 ▼               ▼         │
│          [Nav2]           [MoveIt 2]       [TTS Output]  │
│              │                 │                         │
│        [Robot Base]      [Robot Arm]                     │
└─────────────────────────────────────────────────────────┘

Full Integration Code

#!/usr/bin/env python3
"""
Capstone: Autonomous Humanoid Robot
Combines: Whisper + Groq LLM + ROS 2 Nav2
"""

import rclpy
from rclpy.node import Node
import whisper
import sounddevice as sd
import numpy as np
from groq import Groq
import json
from std_msgs.msg import String
from geometry_msgs.msg import PoseStamped
from nav2_simple_commander.robot_navigator import BasicNavigator

SAMPLE_RATE = 16000
RECORD_SECONDS = 4

LOCATIONS = {
    "kitchen": (3.0, 1.0, 0.0),
    "living room": (0.5, 3.0, 0.0),
    "table": (1.5, 2.0, 0.0),
    "door": (0.0, 5.0, 0.0),
}

SYSTEM_PROMPT = """
You are a robot task planner. Convert the command to a JSON action array.
Available: navigate_to(location), pick_up(object), say(text), wait(seconds)
Output ONLY valid JSON array.
"""


class AutonomousHumanoid(Node):
    def __init__(self):
        super().__init__('autonomous_humanoid')

        # Initialize components
        self.whisper_model = whisper.load_model("base")
        self.groq_client = Groq(api_key="YOUR_GROQ_KEY")
        self.navigator = BasicNavigator()
        self.speech_pub = self.create_publisher(String, '/robot_speech', 10)

        self.get_logger().info("Autonomous Humanoid ready!")
        self.navigator.waitUntilNav2Active()

    def listen(self) -> str:
        """Record audio and transcribe with Whisper."""
        self.get_logger().info("Listening for command...")
        audio = sd.rec(
            int(RECORD_SECONDS * SAMPLE_RATE),
            samplerate=SAMPLE_RATE,
            channels=1,
            dtype='float32'
        )
        sd.wait()
        result = self.whisper_model.transcribe(audio.flatten(), fp16=False)
        text = result["text"].strip()
        self.get_logger().info(f'Heard: "{text}"')
        return text

    def plan(self, command: str) -> list[dict]:
        """Use Groq LLM to create action plan."""
        response = self.groq_client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": command}
            ],
            temperature=0.1,
        )
        return json.loads(response.choices[0].message.content)

    def execute(self, plan: list[dict]):
        """Execute the action plan."""
        for step in plan:
            action = step.get('action', '')
            args = step.get('args', {})
            self.get_logger().info(f'Executing: {action}({args})')

            if action == 'navigate_to':
                self._navigate(args.get('location', ''))
            elif action == 'pick_up':
                self._pick_up(args.get('object', ''))
            elif action == 'say':
                self._say(args.get('text', ''))

    def _navigate(self, location: str):
        if location not in LOCATIONS:
            self.get_logger().warn(f'Unknown location: {location}')
            return
        x, y, z = LOCATIONS[location]
        goal = PoseStamped()
        goal.header.frame_id = 'map'
        goal.header.stamp = self.get_clock().now().to_msg()
        goal.pose.position.x = x
        goal.pose.position.y = y
        goal.pose.orientation.w = 1.0
        self.navigator.goToPose(goal)
        while not self.navigator.isTaskComplete():
            pass
        self.get_logger().info(f'Arrived at {location}')

    def _pick_up(self, obj: str):
        # Placeholder: in real system, calls MoveIt 2 arm controller
        self.get_logger().info(f'Attempting to pick up: {obj}')

    def _say(self, text: str):
        msg = String()
        msg.data = text
        self.speech_pub.publish(msg)

    def run(self):
        """Main loop."""
        while rclpy.ok():
            command = self.listen()
            if command:
                try:
                    plan = self.plan(command)
                    self.get_logger().info(f'Plan: {json.dumps(plan, indent=2)}')
                    self.execute(plan)
                except Exception as e:
                    self.get_logger().error(f'Error: {e}')


def main():
    rclpy.init()
    robot = AutonomousHumanoid()
    robot.run()
    rclpy.shutdown()


if __name__ == '__main__':
    main()

Running the Capstone

# Terminal 1: Start Gazebo simulation
ros2 launch your_robot_pkg gazebo.launch.py

# Terminal 2: Start Nav2
ros2 launch nav2_bringup bringup_launch.py map:=lab_map.yaml

# Terminal 3: Start the Autonomous Humanoid
ros2 run capstone autonomous_humanoid

Then speak: "Go to the kitchen and pick up the water bottle"

Evaluation Checklist

Voice command captured and transcribed correctly
LLM generates valid action plan
Robot navigates to the correct location
Robot identifies the target object
Robot performs the pick-up action
System handles unknown commands gracefully

Congratulations! You have completed the Physical AI & Humanoid Robotics textbook.

Built with Docusaurus. Powered by ROS 2, NVIDIA Isaac, and open-source AI tools.

Project Overview​

System Architecture​

Full Integration Code​

Running the Capstone​

Evaluation Checklist​

Project Overview

System Architecture

Full Integration Code

Running the Capstone

Evaluation Checklist