December 2025 Update: Comprehensive guide to multimodal AI including GPT-4 Vision, audio processing, and real-time voice with OpenAI’s Realtime API.

What is Multimodal AI?

Multimodal AI processes and generates multiple types of content:
Text  ──────┐
Image ──────┤
Audio ──────┼───▶ Multimodal LLM ───▶ Text/Image/Audio Output
Video ──────┘
| Capability | Use Cases | Models |
|------------|-----------|--------|
| Vision | Image analysis, OCR, diagram understanding | GPT-4o, Claude 3.5, Gemini |
| Audio | Transcription, TTS, voice understanding | Whisper, TTS, Realtime API |
| Video | Scene analysis, content moderation | Gemini 1.5 Pro |

Vision: Image Understanding

Analyzing Images with GPT-4o

from openai import OpenAI
import base64
from pathlib import Path

client = OpenAI()

def encode_image(image_path: str) -> str:
    """Encode image to base64"""
    with open(image_path, "rb") as f:
        return base64.standard_b64encode(f.read()).decode("utf-8")

def analyze_image(
    image_path: str,
    prompt: str = "Describe this image in detail."
) -> str:
    """Analyze an image with GPT-4o Vision"""
    
    base64_image = encode_image(image_path)
    
    # Determine MIME type
    suffix = Path(image_path).suffix.lower()
    mime_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp"
    }
    mime_type = mime_types.get(suffix, "image/jpeg")
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}",
                            "detail": "high"  # high, low, or auto
                        }
                    }
                ]
            }
        ],
        max_tokens=1000
    )
    
    return response.choices[0].message.content

# Usage
result = analyze_image(
    "screenshot.png",
    "Extract all text from this screenshot and format it as markdown."
)
print(result)

Analyzing Images from URLs

def analyze_image_url(url: str, prompt: str) -> str:
    """Analyze an image from URL"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": url}
                    }
                ]
            }
        ]
    )
    return response.choices[0].message.content

# Analyze multiple images
def compare_images(image_urls: list[str], comparison_prompt: str) -> str:
    """Compare multiple images"""
    content = [{"type": "text", "text": comparison_prompt}]
    
    for url in image_urls:
        content.append({
            "type": "image_url",
            "image_url": {"url": url}
        })
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}]
    )
    return response.choices[0].message.content
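
A quick usage sketch for compare_images; the URLs below are placeholders for any publicly reachable images.

# Example usage (placeholder URLs)
summary = compare_images(
    [
        "https://example.com/dashboard_v1.png",
        "https://example.com/dashboard_v2.png",
    ],
    "Compare these two UI screenshots and list the visual differences."
)
print(summary)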

Vision Use Cases

class VisionAssistant:
    """Specialized vision analysis tasks"""
    
    def __init__(self):
        self.client = OpenAI()
    
    def extract_text_ocr(self, image_path: str) -> str:
        """OCR: Extract text from image"""
        return analyze_image(
            image_path,
            """Extract ALL text from this image exactly as it appears.
            Maintain formatting, structure, and layout.
            If it's a table, format as markdown table.
            If it's code, format as code block with language."""
        )
    
    def analyze_chart(self, image_path: str) -> dict:
        """Analyze a chart or graph"""
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": """Analyze this chart/graph and return JSON:
{
    "chart_type": "bar|line|pie|scatter|etc",
    "title": "chart title if visible",
    "x_axis": "x-axis label",
    "y_axis": "y-axis label", 
    "key_insights": ["insight1", "insight2"],
    "data_points": [{"label": "...", "value": "..."}]
}"""
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{encode_image(image_path)}"
                            }
                        }
                    ]
                }
            ],
            response_format={"type": "json_object"}
        )
        import json
        return json.loads(response.choices[0].message.content)
    
    def describe_for_accessibility(self, image_path: str) -> str:
        """Generate alt text for accessibility"""
        return analyze_image(
            image_path,
            """Generate comprehensive alt text for this image for screen readers.
            Include: main subject, context, important details, text if any.
            Keep it under 150 words but be descriptive."""
        )
    
    def analyze_ui_screenshot(self, image_path: str) -> dict:
        """Analyze UI/UX of a screenshot"""
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": """Analyze this UI screenshot:
{
    "page_type": "login|dashboard|form|etc",
    "ui_elements": ["button", "form", "nav"],
    "accessibility_issues": [],
    "ux_suggestions": [],
    "detected_text": []
}"""
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{encode_image(image_path)}"
                            }
                        }
                    ]
                }
            ],
            response_format={"type": "json_object"}
        )
        import json
        return json.loads(response.choices[0].message.content)
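
A short usage sketch for the class above; the image file names are hypothetical.

# Example usage (hypothetical file paths)
assistant = VisionAssistant()

ocr_text = assistant.extract_text_ocr("invoice.png")
chart = assistant.analyze_chart("q3_revenue_chart.png")
alt_text = assistant.describe_for_accessibility("hero_banner.jpg")
ui_report = assistant.analyze_ui_screenshot("checkout_page.png")

print(chart["key_insights"])
print(ui_report["ux_suggestions"])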

Audio: Speech and Sound

Speech-to-Text with Whisper

def transcribe_audio(audio_path: str, language: str | None = None) -> dict:
    """Transcribe audio to text using Whisper"""
    # Only pass the language hint ("en", "es", etc.) if one was provided
    extra = {"language": language} if language else {}

    with open(audio_path, "rb") as f:
        response = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="verbose_json",  # Get timestamps
            timestamp_granularities=["word", "segment"],
            **extra
        )
    
    return {
        "text": response.text,
        "language": response.language,
        "segments": response.segments,
        "words": response.words
    }

# Transcribe with timestamps
result = transcribe_audio("meeting.mp3")
print(f"Transcription: {result['text'][:500]}...")

# Print with timestamps
for segment in result["segments"]:
    print(f"[{segment['start']:.2f}s] {segment['text']}")

Text-to-Speech

from pathlib import Path

def text_to_speech(
    text: str,
    output_path: str,
    voice: str = "alloy",  # alloy, echo, fable, onyx, nova, shimmer
    model: str = "tts-1-hd"  # tts-1 or tts-1-hd
) -> str:
    """Convert text to speech"""
    response = client.audio.speech.create(
        model=model,
        voice=voice,
        input=text
    )
    
    # Stream to file
    response.stream_to_file(output_path)
    return output_path

# Generate speech
text_to_speech(
    "Welcome to our AI-powered assistant. How can I help you today?",
    "welcome.mp3",
    voice="nova"
)

# Generate with different voices for comparison
voices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
for voice in voices:
    text_to_speech(
        "Hello, this is a voice sample.",
        f"sample_{voice}.mp3",
        voice=voice
    )

Audio Translation

def translate_audio(audio_path: str) -> str:
    """Translate audio from any language to English"""
    with open(audio_path, "rb") as f:
        response = client.audio.translations.create(
            model="whisper-1",
            file=f
        )
    return response.text

# Translate Spanish audio to English text
english_text = translate_audio("spanish_meeting.mp3")

Real-Time Voice

OpenAI Realtime API

For real-time voice conversations with AI:
import asyncio
import websockets
import json
import base64
import os
import pyaudio

REALTIME_URL = "wss://api.openai.com/v1/realtime"

class RealtimeVoiceAgent:
    """Real-time voice conversation agent"""
    
    def __init__(self, api_key: str, model: str = "gpt-4o-realtime-preview"):
        self.api_key = api_key
        self.model = model
        self.ws = None
        self.audio = pyaudio.PyAudio()
    
    async def connect(self):
        """Connect to Realtime API"""
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "OpenAI-Beta": "realtime=v1"
        }
        
        self.ws = await websockets.connect(
            f"{REALTIME_URL}?model={self.model}",
            extra_headers=headers  # use additional_headers with websockets >= 14
        )
        
        # Configure session
        await self.ws.send(json.dumps({
            "type": "session.update",
            "session": {
                "modalities": ["text", "audio"],
                "instructions": "You are a helpful voice assistant. Be concise and conversational.",
                "voice": "alloy",
                "input_audio_format": "pcm16",
                "output_audio_format": "pcm16",
                "turn_detection": {
                    "type": "server_vad",  # Voice activity detection
                    "threshold": 0.5,
                    "silence_duration_ms": 500
                }
            }
        }))
    
    async def send_audio(self, audio_data: bytes):
        """Send audio chunk to API"""
        await self.ws.send(json.dumps({
            "type": "input_audio_buffer.append",
            "audio": base64.b64encode(audio_data).decode()
        }))
    
    async def receive_messages(self):
        """Receive and handle messages"""
        async for message in self.ws:
            event = json.loads(message)
            await self.handle_event(event)
    
    async def handle_event(self, event: dict):
        """Handle different event types"""
        event_type = event.get("type")
        
        if event_type == "response.audio.delta":
            # Play audio chunk
            audio_data = base64.b64decode(event["delta"])
            self.play_audio(audio_data)
        
        elif event_type == "response.text.delta":
            # Print text response
            print(event["delta"], end="", flush=True)
        
        elif event_type == "response.done":
            print("\n[Response complete]")
        
        elif event_type == "error":
            print(f"Error: {event['error']}")
    
    def play_audio(self, audio_data: bytes):
        """Play audio through speakers"""
        stream = self.audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=24000,
            output=True
        )
        stream.write(audio_data)
        stream.close()
    
    async def start_conversation(self):
        """Start a voice conversation"""
        await self.connect()
        
        # Start receiving in background
        receive_task = asyncio.create_task(self.receive_messages())
        
        # Capture and send microphone audio
        stream = self.audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=24000,
            input=True,
            frames_per_buffer=1024
        )
        
        try:
            while True:
                audio_data = stream.read(1024)
                await self.send_audio(audio_data)
                await asyncio.sleep(0.01)
        finally:
            stream.close()
            receive_task.cancel()
            await self.ws.close()

# Usage
async def main():
    agent = RealtimeVoiceAgent(api_key=os.getenv("OPENAI_API_KEY"))
    await agent.start_conversation()

asyncio.run(main())

Voice Agent with Function Calling

class RealtimeVoiceAgentWithTools(RealtimeVoiceAgent):
    """Voice agent with function calling capabilities"""
    
    def __init__(self, api_key: str):
        super().__init__(api_key)
        self.tools = {
            "get_weather": self.get_weather,
            "set_reminder": self.set_reminder,
            "search_web": self.search_web
        }
    
    async def connect(self):
        await super().connect()
        
        # Add tools to session
        await self.ws.send(json.dumps({
            "type": "session.update",
            "session": {
                "tools": [
                    {
                        "type": "function",
                        "name": "get_weather",
                        "description": "Get current weather for a location",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "location": {"type": "string"}
                            },
                            "required": ["location"]
                        }
                    },
                    {
                        "type": "function",
                        "name": "set_reminder",
                        "description": "Set a reminder",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "message": {"type": "string"},
                                "time": {"type": "string"}
                            },
                            "required": ["message", "time"]
                        }
                    }
                ]
            }
        }))
    
    async def handle_event(self, event: dict):
        event_type = event.get("type")
        
        if event_type == "response.function_call_arguments.done":
            # Execute function
            func_name = event["name"]
            args = json.loads(event["arguments"])
            
            if func_name in self.tools:
                result = await self.tools[func_name](**args)
                
                # Send result back
                await self.ws.send(json.dumps({
                    "type": "conversation.item.create",
                    "item": {
                        "type": "function_call_output",
                        "call_id": event["call_id"],
                        "output": json.dumps(result)
                    }
                }))
                
                # Continue response
                await self.ws.send(json.dumps({
                    "type": "response.create"
                }))
        else:
            await super().handle_event(event)
    
    async def get_weather(self, location: str) -> dict:
        return {"location": location, "temp": "72°F", "condition": "Sunny"}
    
    async def set_reminder(self, message: str, time: str) -> dict:
        return {"status": "set", "message": message, "time": time}
    
    async def search_web(self, query: str) -> dict:
        return {"results": f"Search results for: {query}"}

Image Generation

DALL-E 3 Integration

def generate_image(
    prompt: str,
    size: str = "1024x1024",  # 1024x1024, 1792x1024, 1024x1792
    quality: str = "standard",  # standard or hd
    style: str = "vivid"  # vivid or natural
) -> str:
    """Generate an image with DALL-E 3"""
    response = client.images.generate(
        model="dall-e-3",
        prompt=prompt,
        size=size,
        quality=quality,
        style=style,
        n=1
    )
    
    return response.data[0].url

# Generate image
url = generate_image(
    "A futuristic city with flying cars and neon lights, cyberpunk style",
    size="1792x1024",
    quality="hd"
)
print(f"Image URL: {url}")

# Generate and save
import requests

def generate_and_save(prompt: str, output_path: str, **kwargs) -> str:
    url = generate_image(prompt, **kwargs)
    
    response = requests.get(url)
    with open(output_path, "wb") as f:
        f.write(response.content)
    
    return output_path
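
A usage sketch for generate_and_save; the prompt and output path are arbitrary examples.

# Generate an image and save it locally
path = generate_and_save(
    "A watercolor illustration of a lighthouse at sunrise",
    "lighthouse.png",
    size="1024x1024",
    quality="standard"
)
print(f"Saved to {path}")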

Image Editing

def edit_image(
    image_path: str,
    mask_path: str,
    prompt: str,
    size: str = "1024x1024"
) -> str:
    """Edit an image using DALL-E 2"""
    with open(image_path, "rb") as img, open(mask_path, "rb") as mask:
        response = client.images.edit(
            model="dall-e-2",
            image=img,
            mask=mask,
            prompt=prompt,
            size=size,
            n=1
        )
    
    return response.data[0].url

def create_variations(image_path: str, n: int = 4) -> list[str]:
    """Create variations of an image"""
    with open(image_path, "rb") as f:
        response = client.images.create_variation(
            model="dall-e-2",
            image=f,
            n=n,
            size="1024x1024"
        )
    
    return [img.url for img in response.data]
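
A usage sketch for both helpers. DALL-E 2 edits require a PNG mask whose transparent pixels mark the region to replace; the file names below are hypothetical.

# Inpaint the transparent region of the mask
edited_url = edit_image(
    "living_room.png",
    "living_room_mask.png",  # transparent where the new content should go
    "A modern fireplace with a marble mantel"
)

# Generate four variations of an existing image
for url in create_variations("logo_concept.png", n=4):
    print(url)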

Multimodal RAG

Combine vision with RAG for document understanding:
from dataclasses import dataclass

@dataclass
class MultimodalDocument:
    text: str
    images: list[str]  # Base64 encoded images
    metadata: dict

class MultimodalRAG:
    """RAG system that handles text and images"""
    
    def __init__(self):
        self.documents: list[MultimodalDocument] = []
    
    def add_pdf_with_images(self, pdf_path: str):
        """Extract text and images from PDF"""
        import fitz  # PyMuPDF
        
        doc = fitz.open(pdf_path)
        
        for page in doc:
            text = page.get_text()
            images = []
            
            for img in page.get_images():
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                images.append(base64.b64encode(pix.tobytes()).decode())
            
            self.documents.append(MultimodalDocument(
                text=text,
                images=images,
                metadata={"page": page.number, "source": pdf_path}
            ))
    
    def query(self, question: str) -> str:
        """Query with multimodal context"""
        # Build multimodal context
        content = [
            {"type": "text", "text": f"Question: {question}\n\nContext from documents:"}
        ]
        
        # Add relevant text and images
        for doc in self.documents[:5]:  # Limit for context
            content.append({
                "type": "text",
                "text": doc.text[:1000]
            })
            
            for img in doc.images[:2]:  # Limit images
                content.append({
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{img}"}
                })
        
        content.append({
            "type": "text",
            "text": "Based on the above context (text and images), answer the question."
        })
        
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": content}],
            max_tokens=1000
        )
        
        return response.choices[0].message.content
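
A usage sketch for the pipeline above (hypothetical PDF path). Note that this naive version sends the first five pages as context rather than retrieving the most relevant ones; a production system would add an embedding-based retrieval step before calling the model.

# Example usage
rag = MultimodalRAG()
rag.add_pdf_with_images("quarterly_report.pdf")

answer = rag.query("What trend does the revenue chart show across the reported quarters?")
print(answer)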

Key Takeaways

Vision is powerful: GPT-4o can analyze images, charts, screenshots, and documents.

Audio is easy: Whisper + TTS gives you a complete audio pipeline in a few lines.

Real-time is here: the Realtime API lets you build voice assistants that listen and respond in live conversation.

Combine modalities: multimodal RAG unlocks powerful document understanding.

What’s Next

DSPy Framework

Learn declarative AI programming with Stanford’s DSPy framework