Semantic Routing - Dev Weekends

Semantic routing directs each query to the most appropriate handler, model, or pipeline based on an understanding of the query's content. This lets you cut cost (simple queries go to cheaper models) while improving response quality.

Intent Classification

Embedding-Based Classification

from dataclasses import dataclass, field

import numpy as np
from openai import OpenAI


@dataclass
class Intent:
    """Represents an intent with example queries.

    ``embedding`` starts as ``None`` and is filled in by
    ``IntentClassifier._compute_intent_embeddings`` with the mean embedding of
    the description and examples.
    """
    name: str
    description: str
    examples: list[str]
    # Excluded from ``==``: comparing two populated NumPy arrays produces an
    # array whose truth value is ambiguous, which would make the generated
    # dataclass ``__eq__`` raise ValueError.
    embedding: "np.ndarray | None" = field(default=None, compare=False)


class IntentClassifier:
    """Classify queries into predefined intents using embeddings.

    Every intent is represented by the mean embedding of its description and
    example queries. A query is assigned to the intent whose vector has the
    highest cosine similarity with the query embedding, or to ``"unknown"``
    when that similarity is below the caller's threshold.
    """

    def __init__(self, intents: list[Intent], model: str = "text-embedding-3-small"):
        """Create the API client and embed every intent up front.

        Args:
            intents: Intents to classify against; their ``embedding``
                attribute is overwritten here.
            model: Name of the embedding model used for intents and queries.
        """
        self.client = OpenAI()
        self.model = model
        self.intents = intents
        self._compute_intent_embeddings()

    def _embed(self, texts: list[str]) -> list[np.ndarray]:
        """Return one embedding vector per returned item for ``texts``."""
        response = self.client.embeddings.create(
            model=self.model,
            input=texts
        )
        return [np.array(item.embedding) for item in response.data]

    def _compute_intent_embeddings(self):
        """Compute embeddings for all intent examples."""
        for intent in self.intents:
            # Combine description and examples, then average them into a
            # single representative vector for this intent.
            texts = [intent.description] + intent.examples
            intent.embedding = np.mean(self._embed(texts), axis=0)

    def _best_match(
        self,
        query_embedding: np.ndarray,
        threshold: float
    ) -> tuple[str, float]:
        """Score one query embedding against every intent.

        Returns:
            ``(intent_name, score)`` for the most similar intent, or
            ``("unknown", score)`` when the best score is below ``threshold``.
        """
        best_intent = None
        best_score = -1.0

        for intent in self.intents:
            score = self._cosine_similarity(query_embedding, intent.embedding)
            if score > best_score:
                best_score = score
                best_intent = intent.name

        if best_score < threshold:
            return "unknown", best_score

        return best_intent, best_score

    def classify(self, query: str, threshold: float = 0.5) -> tuple[str, float]:
        """Classify a query into an intent.

        Args:
            query: Text to classify.
            threshold: Minimum cosine similarity required to accept a match.

        Returns:
            ``(intent_name, similarity)``; the name is ``"unknown"`` when the
            best similarity is below ``threshold``.
        """
        query_embedding = self._embed([query])[0]
        return self._best_match(query_embedding, threshold)

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """Return the cosine similarity of ``a`` and ``b``.

        A zero-length vector has no direction, so the similarity is defined
        as 0.0 here instead of dividing by zero and yielding NaN.
        """
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return float(np.dot(a, b) / denominator)

    def classify_batch(
        self,
        queries: list[str],
        threshold: float = 0.5
    ) -> list[tuple[str, float]]:
        """Classify multiple queries with a single embedding request.

        Args:
            queries: Texts to classify.
            threshold: Minimum cosine similarity required to accept a match.

        Returns:
            One ``(intent_name, similarity)`` pair per embedding returned by
            the API, in the order the API returned them. An empty ``queries``
            list yields an empty list without contacting the API.
        """
        if not queries:
            return []

        return [
            self._best_match(query_embedding, threshold)
            for query_embedding in self._embed(queries)
        ]


# Usage: each row is (name, description, example queries) for one intent
intents = [
    Intent(name=name, description=description, examples=examples)
    for name, description, examples in (
        (
            "technical_support",
            "Questions about technical issues, bugs, and troubleshooting",
            [
                "My application keeps crashing",
                "How do I fix this error?",
                "The feature isn't working properly",
            ],
        ),
        (
            "billing",
            "Questions about payments, invoices, and subscriptions",
            [
                "How do I update my payment method?",
                "Where can I find my invoice?",
                "I want to cancel my subscription",
            ],
        ),
        (
            "product_info",
            "Questions about features, capabilities, and product details",
            [
                "What features are included?",
                "Can your product do X?",
                "Tell me about your enterprise plan",
            ],
        ),
    )
]

classifier = IntentClassifier(intents)

queries = [
    "My app won't start after the update",
    "How much does the pro plan cost?",
    "Does it support Python 3.12?",
]

# map() is lazy, so each query is classified just before its result is printed.
for query, (intent, confidence) in zip(queries, map(classifier.classify, queries)):
    print(f"Query: {query}")
    print(f"Intent: {intent} (confidence: {confidence:.2f})\n")

LLM-Based Classification

from openai import OpenAI
import json


class LLMIntentClassifier:
    """Classify intents using LLM reasoning.

    The model is shown the intent names and descriptions and asked to reply
    with a JSON object containing ``intent``, ``confidence`` and ``reasoning``.
    """

    def __init__(
        self,
        intents: dict[str, str],
        model: str = "gpt-4o-mini"
    ):
        """Store the intent catalogue and create the API client.

        Args:
            intents: Mapping of intent name to a short description.
            model: Chat model used for classification.
        """
        self.client = OpenAI()
        self.model = model
        self.intents = intents

    def classify(self, query: str) -> dict:
        """Classify a query with reasoning.

        Returns:
            The JSON object produced by the model, parsed into a dict. The
            prompt requests ``intent``, ``confidence`` and ``reasoning`` keys,
            but their presence is not validated here.

        Raises:
            json.JSONDecodeError: If the model reply is not valid JSON.
        """
        intent_list = "\n".join(
            f"- {name}: {desc}"
            for name, desc in self.intents.items()
        )

        prompt = f"""Classify this query into one of the following intents:

{intent_list}

Query: {query}

Respond with JSON:
{{
    "intent": "intent_name",
    "confidence": 0.0-1.0,
    "reasoning": "brief explanation"
}}"""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )

        return json.loads(response.choices[0].message.content)

    def classify_with_fallback(
        self,
        query: str,
        confidence_threshold: float = 0.7
    ) -> dict:
        """Classify with fallback for low-confidence results.

        When the reported confidence is below ``confidence_threshold`` the
        model's choice is moved to ``original_intent`` and ``intent`` becomes
        ``"requires_human_review"``. A missing or non-numeric confidence is
        treated as 0.0, so such replies are also sent to review.

        Args:
            query: Text to classify.
            confidence_threshold: Minimum confidence required to keep the
                model's intent.

        Returns:
            The classification dict, possibly rewritten as described above.
        """
        result = self.classify(query)

        try:
            confidence = float(result.get("confidence"))
        except (TypeError, ValueError):
            confidence = 0.0

        if confidence < confidence_threshold:
            # Save the model's answer BEFORE overwriting it; doing this in the
            # opposite order would record the placeholder instead.
            result["original_intent"] = result.get("intent")
            result["intent"] = "requires_human_review"

        return result


# Usage: intent name -> description shown to the model
intents = dict(
    order_status="Inquiries about order tracking and delivery",
    refund_request="Requests for refunds or returns",
    product_question="Questions about product features or availability",
    complaint="Complaints about service or product quality",
    general_inquiry="General questions not fitting other categories",
)

classifier = LLMIntentClassifier(intents)

# Classify a single query and show each field of the model's JSON reply.
result = classifier.classify(
    "When will my order arrive? I've been waiting for a week."
)
print(f"Intent: {result['intent']}")
print(f"Confidence: {result['confidence']}")
print(f"Reasoning: {result['reasoning']}")

Query Routing

Multi-Model Router

Route queries to the most appropriate model based on complexity:

from openai import OpenAI
from anthropic import Anthropic
from dataclasses import dataclass
from enum import Enum
import json


class ModelTier(Enum):
    """Complexity tiers that QueryRouter uses as keys into its ROUTES table."""
    FAST = "fast"      # Simple queries
    BALANCED = "balanced"  # Moderate complexity
    POWERFUL = "powerful"  # Complex reasoning


@dataclass
class RouteConfig:
    """Configuration for a route."""
    # Model identifier passed as ``model=`` to the provider's API.
    model: str
    # Provider key; QueryRouter.route recognises "openai" and "anthropic".
    provider: str
    # Upper bound on generated tokens, forwarded as ``max_tokens``.
    max_tokens: int
    # Sampling temperature for the request.
    temperature: float


class QueryRouter:
    """Route queries to appropriate models based on complexity.

    A small model first labels the query as simple, moderate or complex; the
    label selects a ``ModelTier`` whose ``RouteConfig`` in ``ROUTES``
    determines the provider, model and sampling settings used to answer.
    """

    ROUTES = {
        ModelTier.FAST: RouteConfig(
            model="gpt-4o-mini",
            provider="openai",
            max_tokens=512,
            temperature=0.3
        ),
        ModelTier.BALANCED: RouteConfig(
            model="gpt-4o",
            provider="openai",
            max_tokens=1024,
            temperature=0.5
        ),
        ModelTier.POWERFUL: RouteConfig(
            model="claude-sonnet-4-20250514",
            provider="anthropic",
            max_tokens=2048,
            temperature=0.7
        ),
    }

    def __init__(self):
        """Create one client per supported provider."""
        self.openai = OpenAI()
        self.anthropic = Anthropic()

    def analyze_complexity(self, query: str) -> ModelTier:
        """Determine query complexity.

        Returns:
            The tier matching the model's ``complexity`` label. Any reply that
            cannot be parsed, lacks the key, or carries an unrecognised label
            falls back to ``ModelTier.BALANCED`` so routing never fails on a
            malformed classification.
        """
        prompt = f"""Analyze the complexity of this query:

Query: {query}

Consider:
1. Does it require multi-step reasoning?
2. Does it need domain expertise?
3. Is it a simple factual question?
4. Does it require creativity or nuance?

Respond with JSON:
{{
    "complexity": "simple" | "moderate" | "complex",
    "reasoning": "brief explanation"
}}"""

        response = self.openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )

        try:
            result = json.loads(response.choices[0].message.content)
        except (TypeError, json.JSONDecodeError):
            # Empty or non-JSON content: use the middle tier.
            return ModelTier.BALANCED

        complexity_map = {
            "simple": ModelTier.FAST,
            "moderate": ModelTier.BALANCED,
            "complex": ModelTier.POWERFUL
        }

        label = result.get("complexity") if isinstance(result, dict) else None
        if isinstance(label, str):
            # Tolerate casing/whitespace differences in the model's label.
            label = label.strip().lower()
        else:
            label = None

        return complexity_map.get(label, ModelTier.BALANCED)

    def route(self, query: str) -> tuple[str, RouteConfig]:
        """Route query and get response.

        Returns:
            ``(response_text, config)`` where ``config`` is the route that was
            selected for the query.

        Raises:
            ValueError: If the selected route names an unsupported provider.
        """
        tier = self.analyze_complexity(query)
        config = self.ROUTES[tier]
        messages = [{"role": "user", "content": query}]

        if config.provider == "openai":
            response = self.openai.chat.completions.create(
                model=config.model,
                max_tokens=config.max_tokens,
                temperature=config.temperature,
                messages=messages
            )
            return response.choices[0].message.content, config

        if config.provider == "anthropic":
            response = self.anthropic.messages.create(
                model=config.model,
                max_tokens=config.max_tokens,
                # Honour the route's temperature here as well, so the setting
                # in ROUTES applies regardless of provider.
                temperature=config.temperature,
                messages=messages
            )
            return response.content[0].text, config

        raise ValueError(f"Unknown provider: {config.provider}")


# Usage: three queries of increasing difficulty
router = QueryRouter()

queries = [
    "What is 2 + 2?",
    "Explain the concept of dependency injection.",
    "Design a distributed system for real-time collaboration with CRDT support.",
]

# map() is lazy, so each query is routed just before its result is printed.
for query, (response, config) in zip(queries, map(router.route, queries)):
    print(f"Query: {query[:50]}...")
    print(f"Routed to: {config.model}")
    print(f"Response: {response[:100]}...\n")

Topic-Based Routing

from openai import OpenAI
from dataclasses import dataclass
from typing import Callable
import json


@dataclass
class TopicHandler:
    """Handler for a specific topic."""
    # Topic name; TopicRouter uses it as the lookup key for this handler.
    topic: str
    # Human-readable summary of the topic, included in the routing prompt.
    description: str
    # Callable invoked with the query text; its return value is the response.
    handler: Callable[[str], str]
    # Keywords joined into the topic's text representation by TopicRouter.
    keywords: list[str]


class TopicRouter:
    """Route queries to topic-specific handlers.

    An LLM picks the topic from a list built out of each handler's description
    and keywords; the query is then passed to that topic's handler.
    """

    def __init__(self, handlers: list[TopicHandler]):
        """Index the handlers by topic and prepare their prompt text."""
        self.client = OpenAI()
        self.handlers = {h.topic: h for h in handlers}
        self._build_topic_index()

    def _build_topic_index(self):
        """Build the text shown to the LLM for each topic.

        Each entry combines the handler's description with its keywords; no
        embeddings are computed.
        """
        self.topic_texts = {}
        for topic, handler in self.handlers.items():
            text = f"{handler.description}. Keywords: {', '.join(handler.keywords)}"
            self.topic_texts[topic] = text

    def route(self, query: str) -> tuple[str, str]:
        """Route query to appropriate handler.

        Returns:
            ``(topic, handler_output)`` when the model names a registered
            topic; otherwise ``("unknown", message)`` with a fallback message.
            A reply without a usable ``topic`` value takes the fallback path
            as well.

        Raises:
            json.JSONDecodeError: If the model reply is not valid JSON.
        """
        # Use the prepared topic texts so the configured keywords actually
        # reach the model instead of being computed and discarded.
        topics = "\n".join(
            f"- {topic}: {text}"
            for topic, text in self.topic_texts.items()
        )

        prompt = f"""Match this query to the most appropriate topic:

Topics:
{topics}

Query: {query}

Respond with JSON: {{"topic": "topic_name", "confidence": 0.0-1.0}}"""

        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )

        result = json.loads(response.choices[0].message.content)
        topic = result.get("topic") if isinstance(result, dict) else None

        # Only strings can be topic keys; anything else goes to the fallback.
        if isinstance(topic, str) and topic in self.handlers:
            handler = self.handlers[topic]
            return topic, handler.handler(query)

        # Fallback to default handler
        return "unknown", f"I don't have a specialized handler for this query: {query}"


# Define handlers
def _ask_with_persona(system_prompt: str, query: str) -> str:
    """Send ``query`` to gpt-4o under ``system_prompt`` and return the reply text."""
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query}
        ]
    )
    return response.choices[0].message.content


def handle_coding(query: str) -> str:
    """Answer ``query`` as an expert programmer."""
    return _ask_with_persona("You are an expert programmer.", query)


def handle_writing(query: str) -> str:
    """Answer ``query`` as a professional writer and editor."""
    return _ask_with_persona("You are a professional writer and editor.", query)


def handle_math(query: str) -> str:
    """Answer ``query`` as a mathematics expert who shows the working."""
    return _ask_with_persona("You are a mathematics expert. Show your work.", query)


# Create router: each row is (topic, description, handler, keywords)
handlers = [
    TopicHandler(topic=topic, description=description, handler=handler, keywords=keywords)
    for topic, description, handler, keywords in (
        (
            "coding",
            "Programming, software development, and debugging",
            handle_coding,
            ["code", "program", "function", "bug", "python", "javascript"],
        ),
        (
            "writing",
            "Writing, editing, and content creation",
            handle_writing,
            ["write", "edit", "essay", "article", "grammar"],
        ),
        (
            "math",
            "Mathematics, calculations, and problem solving",
            handle_math,
            ["calculate", "equation", "solve", "math", "number"],
        ),
    )
]

router = TopicRouter(handlers)

# Route one query and show which topic handled it.
query = "How do I implement binary search in Python?"
topic, response = router.route(query)
print(f"Routed to: {topic}")
print(f"Response: {response}")

Cost-Optimized Routing

from openai import OpenAI
from dataclasses import dataclass
from typing import Optional
import time


@dataclass
class ModelConfig:
    """Configuration for a model."""
    # Model identifier passed as ``model=`` to the chat completions API.
    name: str
    # Price applied per 1,000 prompt tokens when computing cost.
    cost_per_1k_input: float
    # Price applied per 1,000 completion tokens when computing cost.
    cost_per_1k_output: float
    # Typical latency in milliseconds, compared against the router's limit.
    avg_latency_ms: float
    # Relative quality, compared against the required quality threshold.
    quality_score: float  # 0-1


class CostOptimizedRouter:
    """Route queries to minimize cost while meeting quality requirements.

    Models in ``MODELS`` are filtered by quality score and average latency,
    and the cheapest remaining one (by estimated cost) is used.

    NOTE: ``budget_per_query`` is stored on the instance but is not consulted
    by ``select_model``; it does not currently constrain the choice.
    """

    MODELS = [
        ModelConfig("gpt-4o-mini", 0.00015, 0.0006, 500, 0.85),
        ModelConfig("gpt-4o", 0.0025, 0.01, 800, 0.95),
        ModelConfig("gpt-4-turbo", 0.01, 0.03, 1000, 0.93),
    ]

    def __init__(
        self,
        quality_threshold: float = 0.8,
        max_latency_ms: float = 2000,
        budget_per_query: float = 0.01
    ):
        """Store routing limits and create the API client.

        Args:
            quality_threshold: Default minimum ``quality_score`` a model needs.
            max_latency_ms: Maximum acceptable ``avg_latency_ms``.
            budget_per_query: Stored for callers; not enforced by this class.
        """
        self.client = OpenAI()
        self.quality_threshold = quality_threshold
        self.max_latency_ms = max_latency_ms
        self.budget_per_query = budget_per_query

    def estimate_tokens(self, text: str) -> int:
        """Rough token estimation (one token per four characters)."""
        return len(text) // 4

    def select_model(
        self,
        query: str,
        required_quality: Optional[float] = None
    ) -> ModelConfig:
        """Select the most cost-effective model.

        Args:
            query: Text whose length drives the input-cost estimate.
            required_quality: Minimum quality score for this call. ``None``
                means "use ``self.quality_threshold``"; an explicit ``0.0`` is
                honoured rather than being mistaken for "not given".

        Returns:
            The cheapest model meeting the quality and latency limits, or the
            highest-quality model when none meets them.
        """
        if required_quality is None:
            quality_req = self.quality_threshold
        else:
            quality_req = required_quality

        # Filter models that meet requirements
        viable_models = [
            m for m in self.MODELS
            if m.quality_score >= quality_req
            and m.avg_latency_ms <= self.max_latency_ms
        ]

        if not viable_models:
            # Fallback to highest quality model
            return max(self.MODELS, key=lambda m: m.quality_score)

        # Estimate query cost, in units of 1,000 tokens
        input_tokens = self.estimate_tokens(query) / 1000
        output_tokens = 0.5  # Estimate 500 output tokens

        def estimate_cost(model: ModelConfig) -> float:
            return (
                input_tokens * model.cost_per_1k_input +
                output_tokens * model.cost_per_1k_output
            )

        # Select cheapest viable model
        return min(viable_models, key=estimate_cost)

    def route(
        self,
        query: str,
        required_quality: Optional[float] = None
    ) -> tuple[str, ModelConfig, dict]:
        """Route query and return response with metadata.

        Returns:
            ``(response_text, model, metadata)`` where ``metadata`` holds the
            model name, measured latency in milliseconds, the cost computed
            from the reported token usage, and the token counts.
        """
        model = self.select_model(query, required_quality)

        start_time = time.perf_counter()

        response = self.client.chat.completions.create(
            model=model.name,
            messages=[{"role": "user", "content": query}]
        )

        latency_ms = (time.perf_counter() - start_time) * 1000

        # Price the call from the usage the API reported, not the estimate.
        usage = response.usage
        actual_cost = (
            (usage.prompt_tokens / 1000) * model.cost_per_1k_input +
            (usage.completion_tokens / 1000) * model.cost_per_1k_output
        )

        metadata = {
            "model": model.name,
            "latency_ms": latency_ms,
            "cost": actual_cost,
            "input_tokens": usage.prompt_tokens,
            "output_tokens": usage.completion_tokens
        }

        return response.choices[0].message.content, model, metadata


# Usage: arguments are quality_threshold, max_latency_ms, budget_per_query
router = CostOptimizedRouter(0.85, 1500, 0.005)

# Simple query - should use cheaper model
simple_query = "What is the capital of France?"
response, model, meta = router.route(simple_query)
print(f"Query: {simple_query}")
print(f"Model: {model.name}, Cost: ${meta['cost']:.6f}")

# Complex query with high quality requirement
complex_query = "Explain the mathematical foundations of transformer attention mechanisms."
response, model, meta = router.route(complex_query, 0.95)
print(f"\nQuery: {complex_query}")
print(f"Model: {model.name}, Cost: ${meta['cost']:.6f}")

Hybrid Routing

Combine multiple routing strategies:

from openai import OpenAI
from dataclasses import dataclass
from typing import Callable, Any
import json


@dataclass
class RoutingDecision:
    """Detailed routing decision."""
    # Name of the chat model chosen to answer the query.
    model: str
    # Handler key used to select the system prompt.
    handler: str
    # Short human-readable explanation of why the model was chosen.
    reasoning: str
    # Confidence attached to the decision.
    confidence: float
    # Extra data about the decision; HybridRouter stores the query analysis here.
    metadata: dict


class HybridRouter:
    """Combine intent, complexity, and cost-based routing.

    A small model analyses the query; the analysis selects both the model that
    answers and the system prompt ("handler") it answers under.
    """

    def __init__(self):
        """Create the API client."""
        self.client = OpenAI()

    def analyze_query(self, query: str) -> dict:
        """Comprehensive query analysis.

        Returns:
            The model's JSON reply parsed into a dict. The prompt requests the
            keys listed below, but their presence is not validated here.

        Raises:
            json.JSONDecodeError: If the model reply is not valid JSON.
        """
        prompt = f"""Analyze this query comprehensively:

Query: {query}

Provide analysis as JSON:
{{
    "intent": "question" | "task" | "creative" | "analysis" | "code",
    "complexity": "simple" | "moderate" | "complex",
    "domain": "general" | "technical" | "creative" | "analytical",
    "expected_length": "short" | "medium" | "long",
    "requires_reasoning": true/false,
    "requires_creativity": true/false,
    "requires_accuracy": true/false
}}"""

        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )

        return json.loads(response.choices[0].message.content)

    def decide_route(self, query: str) -> RoutingDecision:
        """Make routing decision based on analysis.

        Keys the model omitted are given neutral defaults ("moderate"
        complexity, flags off, no intent/domain) so a partial analysis still
        yields a decision instead of raising ``KeyError``.

        Returns:
            A ``RoutingDecision`` whose ``metadata`` is the raw analysis.
        """
        analysis = self.analyze_query(query)

        complexity = analysis.get("complexity", "moderate")
        requires_reasoning = bool(analysis.get("requires_reasoning", False))
        requires_creativity = bool(analysis.get("requires_creativity", False))
        intent = analysis.get("intent")
        domain = analysis.get("domain")

        # Determine best model based on analysis
        if complexity == "simple" and not requires_reasoning:
            model = "gpt-4o-mini"
            reasoning = "Simple query, fast model sufficient"
        elif requires_creativity:
            model = "gpt-4o"
            reasoning = "Creative task benefits from capable model"
        elif complexity == "complex" or requires_reasoning:
            model = "gpt-4o"
            reasoning = "Complex reasoning requires powerful model"
        else:
            model = "gpt-4o-mini"
            reasoning = "Balanced query, using efficient model"

        # Determine handler; intent takes precedence over domain
        if intent == "code":
            handler = "code_specialist"
        elif intent == "creative":
            handler = "creative_writer"
        elif domain == "technical":
            handler = "technical_expert"
        else:
            handler = "general"

        return RoutingDecision(
            model=model,
            handler=handler,
            reasoning=reasoning,
            # Fixed value; it is not derived from the analysis.
            confidence=0.85,
            metadata=analysis
        )

    def route_and_respond(self, query: str) -> tuple[str, RoutingDecision]:
        """Route query and generate response.

        Returns:
            ``(response_text, decision)`` for the routed query.
        """
        decision = self.decide_route(query)

        # Build system prompt based on handler
        system_prompts = {
            "code_specialist": "You are an expert programmer. Provide clean, documented code.",
            "creative_writer": "You are a creative writer. Be imaginative and engaging.",
            "technical_expert": "You are a technical expert. Be precise and thorough.",
            "general": "You are a helpful assistant."
        }

        system = system_prompts.get(decision.handler, system_prompts["general"])

        response = self.client.chat.completions.create(
            model=decision.model,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": query}
            ]
        )

        return response.choices[0].message.content, decision


# Usage: one query per routing outcome (simple, code, creative, technical)
router = HybridRouter()

queries = [
    "What is 5 + 5?",
    "Write a Python function to sort a list of dictionaries by a key",
    "Write a short story about a robot learning to paint",
    "Explain the CAP theorem and its implications for distributed databases",
]

# map() is lazy, so each query is routed just before its result is printed.
for query, (response, decision) in zip(queries, map(router.route_and_respond, queries)):
    print(f"Query: {query[:50]}...")
    print(f"Model: {decision.model}")
    print(f"Handler: {decision.handler}")
    print(f"Reasoning: {decision.reasoning}")
    print(f"Response: {response[:100]}...\n")

Routing Best Practices

- Use fast models for the routing decisions themselves
- Cache routing decisions for similar queries
- Monitor routing accuracy and adjust thresholds
- Implement fallbacks for routing failures
- Track cost savings from intelligent routing

Practice Exercise

Build a production routing system that:

- Classifies queries by intent and complexity
- Routes to appropriate models based on requirements
- Optimizes for cost while meeting quality thresholds
- Tracks routing decisions and outcomes
- Adapts routing rules based on feedback

Focus on:

- Low-latency routing decisions
- Graceful degradation on failures
- A/B testing different routing strategies
- Cost and quality monitoring

Overview

Testing & Code Quality

Crash Courses

AI Engineering

Math for ML - Understanding Linear Algebra

Probability & Statistics for ML

Math for ML - Understanding Calculus

ML Mastery

Deep Learning Mastery

NestJS Mastery

Microservices Mastery

Low Level Design

OOP Concepts

SOLID Principles

Design Patterns

LLD Case Studies

System Design (HLD)

Senior Level (L5+/Staff)

HLD Case Studies

Engineering Fundamentals

DevOps & Operations

Azure Cloud Engineering

AWS Cloud

AWS Monitoring & Observability

AWS Security Services

AWS Serverless

AWS Operations

AWS Advanced

AWS Case Studies

GCP Cloud Engineering

DevOps Tools

Database Engineering

HIPAA Compliance Mastery

Operating Systems

Linux Internals

Distributed Systems

Networking Mastery

Build Your Own X

Go Lang Mastery

C Programming

Classic Research Papers

Distributed System Tools

​Intent Classification

​Embedding-Based Classification

​LLM-Based Classification

​Query Routing

​Multi-Model Router

​Topic-Based Routing

​Cost-Optimized Routing

​Hybrid Routing

​Practice Exercise