December 2025 Update: A comprehensive guide to multimodal AI, covering GPT-4o vision, audio processing, and real-time voice with OpenAI's Realtime API.
What is Multimodal AI?
Multimodal AI processes and generates multiple types of content:

Text  ─────┐
Image ─────┤
Audio ─────┼───▶ Multimodal LLM ───▶ Text/Image/Audio Output
Video ─────┘
| Capability | Use Cases | Models |
|---|---|---|
| Vision | Image analysis, OCR, diagram understanding | GPT-4o, Claude 3.5, Gemini |
| Audio | Transcription, TTS, voice understanding | Whisper, TTS, Realtime API |
| Video | Scene analysis, content moderation | Gemini 1.5 Pro |
Vision: Image Understanding
Analyzing Images with GPT-4o
from openai import OpenAI
import base64
from pathlib import Path

client = OpenAI()

def encode_image(image_path: str) -> str:
    """Encode image to base64"""
    with open(image_path, "rb") as f:
        return base64.standard_b64encode(f.read()).decode("utf-8")

def analyze_image(
    image_path: str,
    prompt: str = "Describe this image in detail."
) -> str:
    """Analyze an image with GPT-4o Vision"""
    base64_image = encode_image(image_path)

    # Determine MIME type
    suffix = Path(image_path).suffix.lower()
    mime_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp"
    }
    mime_type = mime_types.get(suffix, "image/jpeg")

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}",
                            "detail": "high"  # high, low, or auto
                        }
                    }
                ]
            }
        ],
        max_tokens=1000
    )
    return response.choices[0].message.content

# Usage
result = analyze_image(
    "screenshot.png",
    "Extract all text from this screenshot and format it as markdown."
)
print(result)
Analyzing Images from URLs
def analyze_image_url(url: str, prompt: str) -> str:
    """Analyze an image from a URL"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": url}
                    }
                ]
            }
        ]
    )
    return response.choices[0].message.content

# Analyze multiple images
def compare_images(image_urls: list[str], comparison_prompt: str) -> str:
    """Compare multiple images"""
    content = [{"type": "text", "text": comparison_prompt}]
    for url in image_urls:
        content.append({
            "type": "image_url",
            "image_url": {"url": url}
        })
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}]
    )
    return response.choices[0].message.content
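A quick usage sketch for compare_images; the URLs below are placeholders, not real assets:

# Usage (placeholder URLs)
diff_report = compare_images(
    [
        "https://example.com/homepage_v1.png",
        "https://example.com/homepage_v2.png",
    ],
    "Compare these two screenshots and list every visual difference you can find."
)
print(diff_report)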
Vision Use Cases
import json

class VisionAssistant:
    """Specialized vision analysis tasks"""

    def __init__(self):
        self.client = OpenAI()

    def extract_text_ocr(self, image_path: str) -> str:
        """OCR: Extract text from an image"""
        return analyze_image(
            image_path,
            """Extract ALL text from this image exactly as it appears.
            Maintain formatting, structure, and layout.
            If it's a table, format it as a markdown table.
            If it's code, format it as a code block with the language."""
        )

    def analyze_chart(self, image_path: str) -> dict:
        """Analyze a chart or graph"""
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": """Analyze this chart/graph and return JSON:
                            {
                              "chart_type": "bar|line|pie|scatter|etc",
                              "title": "chart title if visible",
                              "x_axis": "x-axis label",
                              "y_axis": "y-axis label",
                              "key_insights": ["insight1", "insight2"],
                              "data_points": [{"label": "...", "value": "..."}]
                            }"""
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{encode_image(image_path)}"
                            }
                        }
                    ]
                }
            ],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)

    def describe_for_accessibility(self, image_path: str) -> str:
        """Generate alt text for accessibility"""
        return analyze_image(
            image_path,
            """Generate comprehensive alt text for this image for screen readers.
            Include: main subject, context, important details, text if any.
            Keep it under 150 words but be descriptive."""
        )

    def analyze_ui_screenshot(self, image_path: str) -> dict:
        """Analyze UI/UX of a screenshot"""
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": """Analyze this UI screenshot and return JSON:
                            {
                              "page_type": "login|dashboard|form|etc",
                              "ui_elements": ["button", "form", "nav"],
                              "accessibility_issues": [],
                              "ux_suggestions": [],
                              "detected_text": []
                            }"""
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{encode_image(image_path)}"
                            }
                        }
                    ]
                }
            ],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)
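A usage sketch for the class above; the image file names are hypothetical:

# Usage (hypothetical file names)
assistant = VisionAssistant()

chart = assistant.analyze_chart("q3_revenue_chart.png")
print(chart["chart_type"], chart["key_insights"])

alt_text = assistant.describe_for_accessibility("hero_banner.jpg")
print(alt_text)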
Audio: Speech and Sound
Speech-to-Text with Whisper
def transcribe_audio(audio_path: str, language: str | None = None) -> dict:
    """Transcribe audio to text using Whisper"""
    with open(audio_path, "rb") as f:
        # Only pass `language` when the caller specifies one ("en", "es", ...)
        extra = {"language": language} if language else {}
        response = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="verbose_json",  # Get timestamps
            timestamp_granularities=["word", "segment"],
            **extra
        )
    return {
        "text": response.text,
        "language": response.language,
        "segments": response.segments,
        "words": response.words
    }

# Transcribe with timestamps
result = transcribe_audio("meeting.mp3")
print(f"Transcription: {result['text'][:500]}...")

# Print with timestamps (segments are model objects in recent SDK versions)
for segment in result["segments"]:
    print(f"[{segment.start:.2f}s] {segment.text}")
Text-to-Speech
from pathlib import Path

def text_to_speech(
    text: str,
    output_path: str,
    voice: str = "alloy",    # alloy, echo, fable, onyx, nova, shimmer
    model: str = "tts-1-hd"  # tts-1 or tts-1-hd
) -> str:
    """Convert text to speech"""
    response = client.audio.speech.create(
        model=model,
        voice=voice,
        input=text
    )
    # Stream to file
    response.stream_to_file(output_path)
    return output_path

# Generate speech
text_to_speech(
    "Welcome to our AI-powered assistant. How can I help you today?",
    "welcome.mp3",
    voice="nova"
)

# Generate with different voices for comparison
voices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
for voice in voices:
    text_to_speech(
        "Hello, this is a voice sample.",
        f"sample_{voice}.mp3",
        voice=voice
    )
Audio Translation
def translate_audio(audio_path: str) -> str:
    """Translate audio from any language to English"""
    with open(audio_path, "rb") as f:
        response = client.audio.translations.create(
            model="whisper-1",
            file=f
        )
    return response.text

# Translate Spanish audio to English text
english_text = translate_audio("spanish_meeting.mp3")
Real-Time Voice
OpenAI Realtime API
For real-time voice conversations with AI:
import asyncio
import base64
import json
import os
import websockets
import pyaudio

REALTIME_URL = "wss://api.openai.com/v1/realtime"

class RealtimeVoiceAgent:
    """Real-time voice conversation agent"""

    def __init__(self, api_key: str, model: str = "gpt-4o-realtime-preview"):
        self.api_key = api_key
        self.model = model
        self.ws = None
        self.audio = pyaudio.PyAudio()

    async def connect(self):
        """Connect to the Realtime API"""
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "OpenAI-Beta": "realtime=v1"
        }
        self.ws = await websockets.connect(
            f"{REALTIME_URL}?model={self.model}",
            extra_headers=headers
        )
        # Configure session
        await self.ws.send(json.dumps({
            "type": "session.update",
            "session": {
                "modalities": ["text", "audio"],
                "instructions": "You are a helpful voice assistant. Be concise and conversational.",
                "voice": "alloy",
                "input_audio_format": "pcm16",
                "output_audio_format": "pcm16",
                "turn_detection": {
                    "type": "server_vad",  # Voice activity detection
                    "threshold": 0.5,
                    "silence_duration_ms": 500
                }
            }
        }))

    async def send_audio(self, audio_data: bytes):
        """Send an audio chunk to the API"""
        await self.ws.send(json.dumps({
            "type": "input_audio_buffer.append",
            "audio": base64.b64encode(audio_data).decode()
        }))

    async def receive_messages(self):
        """Receive and handle messages"""
        async for message in self.ws:
            event = json.loads(message)
            await self.handle_event(event)

    async def handle_event(self, event: dict):
        """Handle different event types"""
        event_type = event.get("type")
        if event_type == "response.audio.delta":
            # Play audio chunk
            audio_data = base64.b64decode(event["delta"])
            self.play_audio(audio_data)
        elif event_type == "response.text.delta":
            # Print text response
            print(event["delta"], end="", flush=True)
        elif event_type == "response.done":
            print("\n[Response complete]")
        elif event_type == "error":
            print(f"Error: {event['error']}")

    def play_audio(self, audio_data: bytes):
        """Play audio through speakers"""
        stream = self.audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=24000,
            output=True
        )
        stream.write(audio_data)
        stream.close()

    async def start_conversation(self):
        """Start a voice conversation"""
        await self.connect()
        # Start receiving in background
        receive_task = asyncio.create_task(self.receive_messages())
        # Capture and send microphone audio
        stream = self.audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=24000,
            input=True,
            frames_per_buffer=1024
        )
        try:
            while True:
                audio_data = stream.read(1024)
                await self.send_audio(audio_data)
                await asyncio.sleep(0.01)
        finally:
            stream.close()
            receive_task.cancel()
            await self.ws.close()

# Usage
async def main():
    agent = RealtimeVoiceAgent(api_key=os.getenv("OPENAI_API_KEY"))
    await agent.start_conversation()

asyncio.run(main())
Voice Agent with Function Calling
class RealtimeVoiceAgentWithTools(RealtimeVoiceAgent):
    """Voice agent with function calling capabilities"""

    def __init__(self, api_key: str):
        super().__init__(api_key)
        self.tools = {
            "get_weather": self.get_weather,
            "set_reminder": self.set_reminder,
            "search_web": self.search_web
        }

    async def connect(self):
        await super().connect()
        # Add tools to session
        await self.ws.send(json.dumps({
            "type": "session.update",
            "session": {
                "tools": [
                    {
                        "type": "function",
                        "name": "get_weather",
                        "description": "Get current weather for a location",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "location": {"type": "string"}
                            },
                            "required": ["location"]
                        }
                    },
                    {
                        "type": "function",
                        "name": "set_reminder",
                        "description": "Set a reminder",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "message": {"type": "string"},
                                "time": {"type": "string"}
                            },
                            "required": ["message", "time"]
                        }
                    }
                ]
            }
        }))

    async def handle_event(self, event: dict):
        event_type = event.get("type")
        if event_type == "response.function_call_arguments.done":
            # Execute function
            func_name = event["name"]
            args = json.loads(event["arguments"])
            if func_name in self.tools:
                result = await self.tools[func_name](**args)
                # Send result back
                await self.ws.send(json.dumps({
                    "type": "conversation.item.create",
                    "item": {
                        "type": "function_call_output",
                        "call_id": event["call_id"],
                        "output": json.dumps(result)
                    }
                }))
                # Continue response
                await self.ws.send(json.dumps({
                    "type": "response.create"
                }))
        else:
            await super().handle_event(event)

    async def get_weather(self, location: str) -> dict:
        return {"location": location, "temp": "72°F", "condition": "Sunny"}

    async def set_reminder(self, message: str, time: str) -> dict:
        return {"status": "set", "message": message, "time": time}

    async def search_web(self, query: str) -> dict:
        return {"results": f"Search results for: {query}"}
Image Generation
DALL-E 3 Integration
def generate_image(
    prompt: str,
    size: str = "1024x1024",    # 1024x1024, 1792x1024, 1024x1792
    quality: str = "standard",  # standard or hd
    style: str = "vivid"        # vivid or natural
) -> str:
    """Generate an image with DALL-E 3"""
    response = client.images.generate(
        model="dall-e-3",
        prompt=prompt,
        size=size,
        quality=quality,
        style=style,
        n=1
    )
    return response.data[0].url

# Generate image
url = generate_image(
    "A futuristic city with flying cars and neon lights, cyberpunk style",
    size="1792x1024",
    quality="hd"
)
print(f"Image URL: {url}")

# Generate and save
import requests

def generate_and_save(prompt: str, output_path: str, **kwargs) -> str:
    url = generate_image(prompt, **kwargs)
    response = requests.get(url)
    with open(output_path, "wb") as f:
        f.write(response.content)
    return output_path
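A usage sketch for generate_and_save; the prompt and output path are just examples:

# Usage
generate_and_save(
    "A watercolor illustration of a lighthouse at sunset",
    "lighthouse.png",
    size="1024x1792",
    quality="hd"
)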
Image Editing
def edit_image(
    image_path: str,
    mask_path: str,
    prompt: str,
    size: str = "1024x1024"
) -> str:
    """Edit an image using DALL-E 2"""
    with open(image_path, "rb") as img, open(mask_path, "rb") as mask:
        response = client.images.edit(
            model="dall-e-2",
            image=img,
            mask=mask,
            prompt=prompt,
            size=size,
            n=1
        )
    return response.data[0].url

def create_variations(image_path: str, n: int = 4) -> list[str]:
    """Create variations of an image"""
    with open(image_path, "rb") as f:
        response = client.images.create_variation(
            model="dall-e-2",
            image=f,
            n=n,
            size="1024x1024"
        )
    return [img.url for img in response.data]
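A usage sketch; the file names are placeholders, and the mask is expected to be a PNG whose transparent pixels mark the region DALL-E 2 should repaint:

# Usage (placeholder files; mask.png marks the editable area with transparency)
edited_url = edit_image(
    "product_photo.png",
    "mask.png",
    "Replace the background with a clean white studio backdrop"
)
variation_urls = create_variations("logo_draft.png", n=4)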
Multimodal RAG
Combine vision with RAG for document understanding:
from dataclasses import dataclass

@dataclass
class MultimodalDocument:
    text: str
    images: list[str]  # Base64-encoded images
    metadata: dict

class MultimodalRAG:
    """RAG system that handles text and images"""

    def __init__(self):
        self.documents: list[MultimodalDocument] = []

    def add_pdf_with_images(self, pdf_path: str):
        """Extract text and images from a PDF"""
        import fitz  # PyMuPDF
        doc = fitz.open(pdf_path)
        for page in doc:
            text = page.get_text()
            images = []
            for img in page.get_images():
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                images.append(base64.b64encode(pix.tobytes()).decode())
            self.documents.append(MultimodalDocument(
                text=text,
                images=images,
                metadata={"page": page.number, "source": pdf_path}
            ))

    def query(self, question: str) -> str:
        """Query with multimodal context"""
        # Build multimodal context
        content = [
            {"type": "text", "text": f"Question: {question}\n\nContext from documents:"}
        ]
        # Add relevant text and images
        for doc in self.documents[:5]:  # Limit for context
            content.append({
                "type": "text",
                "text": doc.text[:1000]
            })
            for img in doc.images[:2]:  # Limit images
                content.append({
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{img}"}
                })
        content.append({
            "type": "text",
            "text": "Based on the above context (text and images), answer the question."
        })
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": content}],
            max_tokens=1000
        )
        return response.choices[0].message.content
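A usage sketch for the class above; the PDF path and question are placeholders:

# Usage (placeholder file)
rag = MultimodalRAG()
rag.add_pdf_with_images("annual_report.pdf")
answer = rag.query("What trend does the revenue chart show across quarters?")
print(answer)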
Key Takeaways
- Vision is powerful: GPT-4o can analyze images, charts, screenshots, and documents.
- Audio is easy: Whisper + TTS gives you a complete audio pipeline in a few lines.
- Real-time is here: build voice assistants with the Realtime API.
- Combine modalities: multimodal RAG unlocks powerful document understanding.
What’s Next
DSPy Framework: Learn declarative AI programming with Stanford's DSPy framework.