Chapter 8: DynamoDB in Production
Introduction
Running DynamoDB in production requires careful attention to security, cost optimization, performance monitoring, and operational best practices. This chapter covers real-world patterns, common pitfalls, and production-ready architectures for mission-critical applications.
Security Best Practices
IAM Policies and Least Privilege
```javascript
// Implement fine-grained IAM policies
const AWS = require('aws-sdk');
// BAD: Overly permissive policy
const badPolicy = {
Version: '2012-10-17',
Statement: [{
Effect: 'Allow',
Action: 'dynamodb:*', // All actions!
Resource: '*' // All resources!
}]
};
// GOOD: Least privilege policy
const goodPolicy = {
Version: '2012-10-17',
Statement: [
{
Effect: 'Allow',
Action: [
'dynamodb:GetItem',
'dynamodb:Query',
'dynamodb:BatchGetItem'
],
Resource: [
'arn:aws:dynamodb:us-east-1:account:table/Users',
'arn:aws:dynamodb:us-east-1:account:table/Users/index/*'
]
},
{
Effect: 'Allow',
Action: [
'dynamodb:PutItem',
'dynamodb:UpdateItem'
],
Resource: 'arn:aws:dynamodb:us-east-1:account:table/Users',
Condition: {
'ForAllValues:StringEquals': {
'dynamodb:LeadingKeys': ['${aws:username}'] // User can only access own data
}
}
}
]
};
// Item-level access control using IAM conditions
const itemLevelPolicy = {
Version: '2012-10-17',
Statement: [{
Effect: 'Allow',
Action: [
'dynamodb:GetItem',
'dynamodb:PutItem',
'dynamodb:UpdateItem',
'dynamodb:DeleteItem'
],
Resource: 'arn:aws:dynamodb:us-east-1:account:table/Users',
Condition: {
'ForAllValues:StringEquals': {
'dynamodb:LeadingKeys': ['${aws:userid}']
}
}
}]
};
// Attribute-level access control
const attributePolicy = {
Version: '2012-10-17',
Statement: [{
Effect: 'Allow',
Action: [
'dynamodb:GetItem',
'dynamodb:Query'
],
Resource: 'arn:aws:dynamodb:us-east-1:account:table/Users',
Condition: {
'ForAllValues:StringEquals': {
'dynamodb:Attributes': [
'userId',
'name',
'email'
]
},
StringEqualsIfExists: {
'dynamodb:Select': 'SPECIFIC_ATTRIBUTES'
}
}
}]
};
```
Encryption
```javascript
// Enable encryption at rest
const createEncryptedTable = async () => {
await dynamodb.createTable({
TableName: 'SecureUsers',
KeySchema: [
{ AttributeName: 'userId', KeyType: 'HASH' }
],
AttributeDefinitions: [
{ AttributeName: 'userId', AttributeType: 'S' }
],
BillingMode: 'PAY_PER_REQUEST',
SSESpecification: {
Enabled: true,
SSEType: 'KMS', // Use AWS KMS
KMSMasterKeyId: 'arn:aws:kms:us-east-1:account:key/key-id'
}
}).promise();
console.log('Table created with KMS encryption');
};
// Client-side encryption for sensitive fields
const crypto = require('crypto');
class EncryptionService {
constructor(keyId) {
this.kms = new AWS.KMS();
this.keyId = keyId;
this.dataKeyCache = new Map();
}
async encrypt(plaintext) {
// Generate data key
const dataKey = await this.getDataKey();
// Encrypt data with AES-256
const iv = crypto.randomBytes(16);
const cipher = crypto.createCipheriv('aes-256-gcm', dataKey.plaintext, iv);
let encrypted = cipher.update(plaintext, 'utf8', 'base64');
encrypted += cipher.final('base64');
const authTag = cipher.getAuthTag();
return {
ciphertext: encrypted,
iv: iv.toString('base64'),
authTag: authTag.toString('base64'),
encryptedDataKey: dataKey.encrypted
};
}
async decrypt(encryptedData) {
// Decrypt data key
const dataKey = await this.kms.decrypt({
CiphertextBlob: Buffer.from(encryptedData.encryptedDataKey, 'base64')
}).promise();
// Decrypt data
const decipher = crypto.createDecipheriv(
'aes-256-gcm',
dataKey.Plaintext,
Buffer.from(encryptedData.iv, 'base64')
);
decipher.setAuthTag(Buffer.from(encryptedData.authTag, 'base64'));
let decrypted = decipher.update(encryptedData.ciphertext, 'base64', 'utf8');
decrypted += decipher.final('utf8');
return decrypted;
}
async getDataKey() {
const cached = this.dataKeyCache.get(this.keyId);
if (cached && Date.now() - cached.timestamp < 300000) { // 5 min cache
return cached.key;
}
const result = await this.kms.generateDataKey({
KeyId: this.keyId,
KeySpec: 'AES_256'
}).promise();
const key = {
plaintext: result.Plaintext,
encrypted: result.CiphertextBlob.toString('base64')
};
this.dataKeyCache.set(this.keyId, { key, timestamp: Date.now() });
return key;
}
}
// Usage: Encrypt sensitive data before storing
const encryptionService = new EncryptionService('alias/app-key');
const createUser = async (user) => {
// Encrypt sensitive fields
const encryptedSSN = await encryptionService.encrypt(user.ssn);
const encryptedCreditCard = await encryptionService.encrypt(user.creditCard);
await dynamodb.put({
TableName: 'Users',
Item: {
userId: user.id,
name: user.name,
email: user.email,
ssn: encryptedSSN, // Encrypted at rest AND in transit
creditCard: encryptedCreditCard
}
}).promise();
};
const getUser = async (userId) => {
const result = await dynamodb.get({
TableName: 'Users',
Key: { userId: userId }
}).promise();
// Decrypt sensitive fields
result.Item.ssn = await encryptionService.decrypt(result.Item.ssn);
result.Item.creditCard = await encryptionService.decrypt(result.Item.creditCard);
return result.Item;
};
```
VPC Endpoints
```javascript
// Access DynamoDB privately through VPC endpoints
// No Internet Gateway required, traffic stays within AWS network
// Create VPC endpoint using CloudFormation
const vpcEndpointTemplate = {
AWSTemplateFormatVersion: '2010-09-09',
Resources: {
DynamoDBVPCEndpoint: {
Type: 'AWS::EC2::VPCEndpoint',
Properties: {
VpcId: 'vpc-12345678',
ServiceName: 'com.amazonaws.us-east-1.dynamodb',
RouteTableIds: [
'rtb-12345678',
'rtb-87654321'
],
PolicyDocument: {
Version: '2012-10-17',
Statement: [{
Effect: 'Allow',
Principal: '*',
Action: [
'dynamodb:GetItem',
'dynamodb:PutItem',
'dynamodb:Query',
'dynamodb:Scan'
],
Resource: [
'arn:aws:dynamodb:us-east-1:account:table/Users',
'arn:aws:dynamodb:us-east-1:account:table/Orders'
]
}]
}
}
}
}
};
// Application in VPC automatically uses VPC endpoint
const dynamodb = new AWS.DynamoDB.DocumentClient({
region: 'us-east-1'
// Automatically routes through VPC endpoint if available
});
```
Audit Logging
```javascript
// Comprehensive audit logging using DynamoDB Streams + CloudWatch Logs
const AWS = require('aws-sdk');
const dynamodb = new AWS.DynamoDB.DocumentClient();
exports.handler = async (event) => {
const logs = new AWS.CloudWatchLogs();
const logGroupName = '/aws/dynamodb/audit';
// Assumes the log group and a per-day log stream already exist
const logStreamName = `${new Date().toISOString().split('T')[0]}/audit`;
const auditEntries = [];
for (const record of event.Records) {
const eventTime = new Date(record.dynamodb.ApproximateCreationDateTime * 1000);
const keys = AWS.DynamoDB.Converter.unmarshall(record.dynamodb.Keys);
const oldImage = record.dynamodb.OldImage ?
AWS.DynamoDB.Converter.unmarshall(record.dynamodb.OldImage) : null;
const newImage = record.dynamodb.NewImage ?
AWS.DynamoDB.Converter.unmarshall(record.dynamodb.NewImage) : null;
const auditEntry = {
timestamp: eventTime.toISOString(),
eventId: record.eventID,
eventName: record.eventName,
tableName: record.eventSourceARN.split('/')[1],
keys: keys,
userIdentity: record.userIdentity || null, // present only for TTL-expired deletions
changes: getFieldChanges(oldImage, newImage)
};
auditEntries.push({
message: JSON.stringify(auditEntry),
timestamp: eventTime.getTime()
});
// Also store in audit table
await dynamodb.put({
TableName: 'AuditLog',
Item: {
auditId: record.eventID,
...auditEntry,
ttl: Math.floor(Date.now() / 1000) + (90 * 24 * 60 * 60) // 90 days
}
}).promise();
}
// Write to CloudWatch Logs
await logs.putLogEvents({
logGroupName: logGroupName,
logStreamName: logStreamName,
logEvents: auditEntries
}).promise();
};
function getFieldChanges(oldImage, newImage) {
if (!oldImage || !newImage) return null;
const changes = {};
const allKeys = new Set([...Object.keys(oldImage), ...Object.keys(newImage)]);
for (const key of allKeys) {
if (JSON.stringify(oldImage[key]) !== JSON.stringify(newImage[key])) {
changes[key] = {
before: oldImage[key],
after: newImage[key]
};
}
}
return Object.keys(changes).length > 0 ? changes : null;
}
```
Cost Optimization
Capacity Planning
```javascript
// Analyze usage patterns and optimize capacity
class CapacityOptimizer {
constructor(tableName) {
this.tableName = tableName;
this.cloudwatch = new AWS.CloudWatch();
}
async analyzeUsage(days = 30) {
const endTime = new Date();
const startTime = new Date(Date.now() - days * 24 * 60 * 60 * 1000);
// Get read capacity metrics
const readMetrics = await this.cloudwatch.getMetricStatistics({
Namespace: 'AWS/DynamoDB',
MetricName: 'ConsumedReadCapacityUnits',
Dimensions: [{ Name: 'TableName', Value: this.tableName }],
StartTime: startTime,
EndTime: endTime,
Period: 3600, // 1 hour
Statistics: ['Average', 'Maximum', 'Minimum']
}).promise();
// Get write capacity metrics
const writeMetrics = await this.cloudwatch.getMetricStatistics({
Namespace: 'AWS/DynamoDB',
MetricName: 'ConsumedWriteCapacityUnits',
Dimensions: [{ Name: 'TableName', Value: this.tableName }],
StartTime: startTime,
EndTime: endTime,
Period: 3600,
Statistics: ['Average', 'Maximum', 'Minimum']
}).promise();
return this.generateRecommendations(readMetrics, writeMetrics);
}
generateRecommendations(readMetrics, writeMetrics) {
const readStats = this.calculateStats(readMetrics.Datapoints);
const writeStats = this.calculateStats(writeMetrics.Datapoints);
const recommendations = {
current: {
avgReadCapacity: readStats.average,
avgWriteCapacity: writeStats.average,
peakReadCapacity: readStats.maximum,
peakWriteCapacity: writeStats.maximum
},
recommended: {
mode: null,
provisionedRead: null,
provisionedWrite: null,
estimatedCost: null
}
};
// Decision logic
const readVariability = (readStats.maximum - readStats.average) / readStats.average;
const writeVariability = (writeStats.maximum - writeStats.average) / writeStats.average;
if (readVariability > 2 || writeVariability > 2) {
// High variability -> On-Demand
recommendations.recommended.mode = 'PAY_PER_REQUEST';
recommendations.recommended.estimatedCost = this.calculateOnDemandCost(
readStats.average,
writeStats.average
);
} else {
// Low variability -> Provisioned
recommendations.recommended.mode = 'PROVISIONED';
recommendations.recommended.provisionedRead = Math.ceil(readStats.maximum * 1.2);
recommendations.recommended.provisionedWrite = Math.ceil(writeStats.maximum * 1.2);
recommendations.recommended.estimatedCost = this.calculateProvisionedCost(
recommendations.recommended.provisionedRead,
recommendations.recommended.provisionedWrite
);
}
return recommendations;
}
calculateStats(datapoints) {
const averages = datapoints.map(d => d.Average).filter(v => v !== undefined);
const maxima = datapoints.map(d => d.Maximum).filter(v => v !== undefined);
const minima = datapoints.map(d => d.Minimum).filter(v => v !== undefined);
return {
average: averages.reduce((a, b) => a + b, 0) / averages.length,
maximum: Math.max(...maxima), // true hourly peak, not the peak of hourly averages
minimum: Math.min(...minima)
};
}
calculateOnDemandCost(avgReads, avgWrites) {
const monthlyReads = avgReads * 30 * 24 * 60 * 60; // Per second to per month
const monthlyWrites = avgWrites * 30 * 24 * 60 * 60;
const readCost = (monthlyReads / 1_000_000) * 0.25; // $0.25 per million reads
const writeCost = (monthlyWrites / 1_000_000) * 1.25; // $1.25 per million writes
return readCost + writeCost;
}
calculateProvisionedCost(rcus, wcus) {
const monthlyReadCost = rcus * 0.00013 * 24 * 30; // Per RCU-hour
const monthlyWriteCost = wcus * 0.00065 * 24 * 30; // Per WCU-hour
return monthlyReadCost + monthlyWriteCost;
}
}
// Usage
const optimizer = new CapacityOptimizer('Users');
const recommendations = await optimizer.analyzeUsage(30);
console.log('Recommendations:', recommendations);
// Apply recommendations
if (recommendations.recommended.mode === 'PAY_PER_REQUEST') {
await dynamodb.updateTable({
TableName: 'Users',
BillingMode: 'PAY_PER_REQUEST'
}).promise();
} else {
await dynamodb.updateTable({
TableName: 'Users',
BillingMode: 'PROVISIONED',
ProvisionedThroughput: {
ReadCapacityUnits: recommendations.recommended.provisionedRead,
WriteCapacityUnits: recommendations.recommended.provisionedWrite
}
}).promise();
}
```
Deep Dive: Cost Optimization at Scale
At scale, the choice between billing modes and the use of Reserved Capacity can result in cost differences of over 80%.
1. The Mathematical Model for Mode Selection
The decision to switch from On-Demand to Provisioned can be modeled by comparing the cost of a million requests against the cost of a provisioned unit-hour (a sketch of the comparison follows this list).
- On-Demand cost: C_od = (R_m × P_r) + (W_m × P_w)
  - R_m, W_m: millions of reads/writes per month.
  - P_r, P_w: price per million requests ($0.25 for reads, $1.25 for writes).
- Provisioned cost: C_p = (RCU × P_rcu × 720) + (WCU × P_wcu × 720)
  - P_rcu, P_wcu: price per unit-hour ($0.00013 per RCU, $0.00065 per WCU); 720 is the number of hours in a 30-day month.
2. Reserved Capacity Strategy
For mission-critical, steady-state workloads, Reserved Capacity offers the steepest discounts (up to 77% over on-demand).
- Commitment: 1-year or 3-year terms.
- Upfront vs. Monthly: You can pay all upfront, partial upfront, or no upfront (with higher monthly rates).
- Stacking: Reserved capacity is applied at the account level across all tables in a specific region.
| Optimization Level | Mode | Est. Monthly Cost (100M Reads) |
|---|---|---|
| Basic | On-Demand | $25.00 |
| Intermediate | Provisioned (No Auto-scale) | $15.00 |
| Advanced | Provisioned + Auto-scale | $11.00 |
| Extreme | 3-Year Reserved Capacity | $5.50 |
Reserved Capacity
```javascript
// Purchase reserved capacity for predictable workloads.
// Note: reserved capacity is typically purchased through the AWS console;
// treat the calls below as illustrative.
const purchaseReservedCapacity = async () => {
const dynamodb = new AWS.DynamoDB();
// 1-year or 3-year commitment; see the savings table above
await dynamodb.purchaseReservedCapacityOfferings({
ReservedCapacityOfferingsId: 'offering-id',
ReservedCapacityOfferingsCount: 100 // 100 WCUs or RCUs
}).promise();
console.log('Reserved capacity purchased');
console.log('Savings: up to ~77% vs on-demand (3-year term)');
};
// View current reserved capacity
const viewReservedCapacity = async () => {
const reservations = await dynamodb.describeReservedCapacity().promise();
for (const reservation of reservations.ReservedCapacityDescriptions) {
console.log({
id: reservation.ReservedCapacityId,
capacity: reservation.ReservedCapacityUnits,
type: reservation.ReservedCapacityType, // READ or WRITE
startTime: reservation.StartTime,
duration: reservation.Duration,
state: reservation.State
});
}
};
```
Cost Monitoring and Alerts
```javascript
// Set up cost monitoring
const setupCostMonitoring = async (tableName, monthlyBudget) => {
const budgets = new AWS.Budgets();
const sns = new AWS.SNS();
// Create SNS topic for alerts
const topic = await sns.createTopic({
Name: `${tableName}-cost-alerts`
}).promise();
// Create budget
await budgets.createBudget({
AccountId: 'account-id',
Budget: {
BudgetName: `${tableName}-monthly-budget`,
BudgetType: 'COST',
TimeUnit: 'MONTHLY',
BudgetLimit: {
Amount: monthlyBudget,
Unit: 'USD'
},
CostFilters: {
Service: ['Amazon DynamoDB']
}
},
NotificationsWithSubscribers: [
{
Notification: {
NotificationType: 'ACTUAL',
ComparisonOperator: 'GREATER_THAN',
Threshold: 80, // Alert at 80% of budget
ThresholdType: 'PERCENTAGE'
},
Subscribers: [{
SubscriptionType: 'SNS',
Address: topic.TopicArn
}]
},
{
Notification: {
NotificationType: 'FORECASTED',
ComparisonOperator: 'GREATER_THAN',
Threshold: 100, // Alert if forecast exceeds budget
ThresholdType: 'PERCENTAGE'
},
Subscribers: [{
SubscriptionType: 'SNS',
Address: topic.TopicArn
}]
}
]
}).promise();
console.log('Cost monitoring configured');
};
// Daily cost report
const generateCostReport = async (tableName) => {
const costExplorer = new AWS.CostExplorer();
const today = new Date();
const thirtyDaysAgo = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000);
const costs = await costExplorer.getCostAndUsage({
TimePeriod: {
Start: thirtyDaysAgo.toISOString().split('T')[0],
End: today.toISOString().split('T')[0]
},
Granularity: 'DAILY',
Metrics: ['UnblendedCost'],
Filter: {
And: [
{
Dimensions: {
Key: 'SERVICE',
Values: ['Amazon DynamoDB']
}
},
{
Tags: {
Key: 'TableName',
Values: [tableName]
}
}
]
},
GroupBy: [{
Type: 'DIMENSION',
Key: 'USAGE_TYPE'
}]
}).promise();
return costs.ResultsByTime;
};
```
Debugging and Troubleshooting
Enable CloudWatch Logs for API Calls
```javascript
// Log every DynamoDB API call made by the SDK
// (inside Lambda, console output is shipped to CloudWatch Logs automatically)
const AWS = require('aws-sdk');
AWS.config.logger = console; // Log to console
// Or use custom logger
const logger = {
log: (message) => {
// Custom logging logic
console.log('[DynamoDB]', message);
}
};
AWS.config.logger = logger;
// X-Ray tracing for detailed request analysis
const AWSXRay = require('aws-xray-sdk');
const AWS_XRAY = AWSXRay.captureAWS(require('aws-sdk'));
const dynamodb = new AWS_XRAY.DynamoDB.DocumentClient();
// All requests now traced in X-Ray
await dynamodb.get({
TableName: 'Users',
Key: { userId: '123' }
}).promise();
// View traces in X-Ray console to see:
// - Request duration
// - DynamoDB latency
// - Throttling events
// - Error details
```
Common Issues and Solutions
```javascript
// Issue 1: ProvisionedThroughputExceededException
const handleThrottling = async () => {
try {
await dynamodb.get({
TableName: 'Users',
Key: { userId: '123' }
}).promise();
} catch (error) {
if (error.code === 'ProvisionedThroughputExceededException') {
console.error('Throttled! Solutions:');
console.error('1. Increase provisioned capacity');
console.error('2. Switch to on-demand mode');
console.error('3. Implement exponential backoff');
console.error('4. Check for hot partitions');
// Immediate mitigation (note: billing mode can only be switched once per 24 hours)
await dynamodb.updateTable({
TableName: 'Users',
BillingMode: 'PAY_PER_REQUEST'
}).promise();
}
}
};
// Issue 2: ConditionalCheckFailedException
const handleConditionFailed = async () => {
try {
await dynamodb.update({
TableName: 'Users',
Key: { userId: '123' },
UpdateExpression: 'SET version = :newVer',
ConditionExpression: 'version = :currentVer',
ExpressionAttributeValues: {
':newVer': 2,
':currentVer': 1
}
}).promise();
} catch (error) {
if (error.code === 'ConditionalCheckFailedException') {
console.error('Condition failed! Solutions:');
console.error('1. Re-read item and retry');
console.error('2. Handle optimistic locking conflict');
console.error('3. Verify condition expression');
// Retry with fresh read
const item = await dynamodb.get({
TableName: 'Users',
Key: { userId: '123' },
ConsistentRead: true
}).promise();
// Retry update with current version
await dynamodb.update({
TableName: 'Users',
Key: { userId: '123' },
UpdateExpression: 'SET version = :newVer',
ConditionExpression: 'version = :currentVer',
ExpressionAttributeValues: {
':newVer': item.Item.version + 1,
':currentVer': item.Item.version
}
}).promise();
}
}
};
// Issue 3: ValidationException
const handleValidation = async () => {
try {
await dynamodb.get({
TableName: 'Users',
Key: { userId: 123 } // Wrong! Should be string
}).promise();
} catch (error) {
if (error.code === 'ValidationException') {
console.error('Validation error:', error.message);
console.error('Common causes:');
console.error('1. Wrong attribute type (Number vs String)');
console.error('2. Missing required key attributes');
console.error('3. Invalid expression syntax');
console.error('4. Reserved keywords without attribute names');
// Check error message for specific issue
if (error.message.includes('type')) {
console.error('Fix: Ensure key types match schema');
}
}
}
};
// Issue 4: ItemCollectionSizeLimitExceededException
const handleCollectionLimit = async () => {
try {
// Adding too many items to a single item collection
// (applies to tables with a local secondary index; collections cap at 10GB)
await dynamodb.put({
TableName: 'Messages',
Item: {
threadId: 'thread-123', // Same partition key
messageId: 'msg-10001',
// ... this write would push the item collection past the 10GB limit
}
}).promise();
} catch (error) {
if (error.code === 'ItemCollectionSizeLimitExceededException') {
console.error('Partition size limit exceeded! Solutions:');
console.error('1. Use composite partition key with sharding');
console.error('2. Archive old items');
console.error('3. Split data across multiple partitions');
// Implement sharding
const shard = Math.floor(Math.random() * 10);
await dynamodb.put({
TableName: 'Messages',
Item: {
threadId: `thread-123#SHARD#${shard}`,
messageId: 'msg-10001'
}
}).promise();
}
}
};
Performance Debugging
```javascript
// Measure and optimize query performance
class PerformanceDebugger {
async debugQuery(params) {
const start = Date.now();
const startMemory = process.memoryUsage().heapUsed;
let result;
try {
result = await dynamodb.query(params).promise();
} catch (error) {
console.error('Query failed:', error);
throw error;
}
const duration = Date.now() - start;
const memoryUsed = process.memoryUsage().heapUsed - startMemory;
const analysis = {
duration: `${duration}ms`,
memoryUsed: `${(memoryUsed / 1024 / 1024).toFixed(2)}MB`,
itemCount: result.Items.length,
scannedCount: result.ScannedCount,
consumedCapacity: result.ConsumedCapacity,
lastEvaluatedKey: result.LastEvaluatedKey ? 'Yes (paginated)' : 'No',
efficiency: `${((result.Items.length / result.ScannedCount) * 100).toFixed(2)}%`
};
// Recommendations
const recommendations = [];
if (duration > 100) {
recommendations.push('High latency detected - consider caching');
}
if (result.ScannedCount > result.Items.length * 2) {
recommendations.push('Low efficiency - optimize FilterExpression or use GSI');
}
if (result.LastEvaluatedKey) {
recommendations.push('Result paginated - implement proper pagination handling');
}
if (result.ConsumedCapacity?.CapacityUnits > 10) {
recommendations.push('High capacity consumption - use ProjectionExpression');
}
console.log('Query Analysis:', analysis);
console.log('Recommendations:', recommendations);
return result;
}
}
// Usage ("debugger" is a reserved word in JavaScript, so use another name)
const perfDebugger = new PerformanceDebugger();
const result = await perfDebugger.debugQuery({
TableName: 'Orders',
KeyConditionExpression: 'userId = :uid',
ExpressionAttributeValues: { ':uid': '123' },
ReturnConsumedCapacity: 'TOTAL'
});
```
Deep Dive: Identifying Hot Keys with Contributor Insights
In production, performance issues are often caused by “hot” partition keys: specific keys that receive a disproportionate amount of traffic.
1. The Challenge of Scale
With millions of keys, identifying which specific one is causing throttling is difficult using standard metrics; ConsumedWriteCapacityUnits only shows the aggregate for the entire table.
2. Contributor Insights Mechanics
Contributor Insights is a diagnostic tool that provides a view of the “Top N” most accessed partition keys and sort keys in your table or index.
- Sampled Analysis: It uses sampling to identify top contributors with minimal impact on performance.
- Visual Mapping: It generates time-series graphs showing the traffic volume for each of the top keys.
- Granularity: You can see which keys are being throttled vs. which ones are consuming the most capacity.
[Figure: Contributor Insights hot-key detection. Time-series of per-key traffic: one partition key (USER#9999, marked HOT) climbs past the throttle threshold while USER#0001 and USER#0552 remain flat near the baseline.]
3. Operational Workflow
- Enable: Turn on Contributor Insights (small additional cost per rule).
- Observe: Look for “spikes” in specific key traffic in the CloudWatch console.
- Mitigate:
- App Layer: Add local caching for the hot key (see the sketch after this list).
- Data Layer: Implement write sharding or reconsider the partition key design.
- DynamoDB Layer: Rely on Adaptive Capacity (though it has limits).
Production Patterns
Circuit Breaker with Fallback
```javascript
class ResilientDynamoDBService {
constructor() {
this.circuitBreaker = new CircuitBreaker({
failureThreshold: 5,
resetTimeout: 60000
});
this.cache = new Redis();
}
async getUser(userId) {
try {
return await this.circuitBreaker.execute(async () => {
const result = await dynamodb.get({
TableName: 'Users',
Key: { userId: userId }
}).promise();
// Update cache
await this.cache.setex(
`user:${userId}`,
300,
JSON.stringify(result.Item)
);
return result.Item;
});
} catch (error) {
if (error.message === 'Circuit breaker is OPEN') {
// Fallback to cache
const cached = await this.cache.get(`user:${userId}`);
if (cached) {
return {
...JSON.parse(cached),
_fromCache: true,
_stale: true
};
}
// Last resort: return error
throw new Error('Service unavailable');
}
throw error;
}
}
}
Connection Pooling
```javascript
// Reuse DynamoDB client connections
const https = require('https');
class DynamoDBConnectionPool {
constructor() {
this.clients = new Map();
this.maxClients = 10;
}
getClient(region = 'us-east-1') {
const key = region;
if (!this.clients.has(key)) {
if (this.clients.size >= this.maxClients) {
// Remove oldest client
const firstKey = this.clients.keys().next().value;
this.clients.delete(firstKey);
}
const client = new AWS.DynamoDB.DocumentClient({
region: region,
maxRetries: 3,
httpOptions: {
timeout: 5000,
connectTimeout: 2000,
agent: new https.Agent({
keepAlive: true,
maxSockets: 50
})
}
});
this.clients.set(key, client);
}
return this.clients.get(key);
}
}
// Global instance
const pool = new DynamoDBConnectionPool();
// Usage across application
const dynamodb = pool.getClient('us-east-1');
await dynamodb.get({...}).promise();
```
Bulk Operations with Rate Limiting
```javascript
class BulkOperationService {
constructor(maxConcurrency = 10, rateLimit = 100) {
this.maxConcurrency = maxConcurrency;
this.rateLimit = rateLimit; // items per second
this.queue = [];
}
async bulkWrite(items) {
const batches = this.createBatches(items, 25); // DynamoDB batch size
const results = [];
for (let i = 0; i < batches.length; i += this.maxConcurrency) {
const batchGroup = batches.slice(i, i + this.maxConcurrency);
// Process batches in parallel (up to maxConcurrency)
const batchResults = await Promise.all(
batchGroup.map(batch => this.writeBatch(batch))
);
results.push(...batchResults);
// Rate limiting
if (i + this.maxConcurrency < batches.length) {
const delay = (1000 * this.maxConcurrency * 25) / this.rateLimit;
await new Promise(resolve => setTimeout(resolve, delay));
}
}
return results;
}
async writeBatch(items) {
// Production code should also retry any UnprocessedItems in the response
return await dynamodb.batchWrite({
RequestItems: {
'MyTable': items.map(item => ({
PutRequest: { Item: item }
}))
}
}).promise();
}
createBatches(items, batchSize) {
const batches = [];
for (let i = 0; i < items.length; i += batchSize) {
batches.push(items.slice(i, i + batchSize));
}
return batches;
}
}
// Usage
const bulkService = new BulkOperationService(10, 100);
await bulkService.bulkWrite(largeDataset);
```
Interview Questions and Answers
Question 1: How do you secure sensitive data in DynamoDB?
Answer: Use a multi-layered security approach.
1. Encryption at Rest:
```javascript
// Enable KMS encryption
SSESpecification: {
Enabled: true,
SSEType: 'KMS',
KMSMasterKeyId: 'key-id'
}
```
2. Encryption in Transit:
- All API calls use HTTPS/TLS
- VPC endpoints for private access
3. Client-Side Encryption (for highly sensitive fields):
```javascript
// Encrypt sensitive fields before storing
const encrypted = await kms.encrypt({
KeyId: 'key-id',
Plaintext: sensitiveData
}).promise();
```
4. Fine-Grained IAM Policies:
```javascript
// Least privilege access
Condition: {
'ForAllValues:StringEquals': {
'dynamodb:LeadingKeys': ['${aws:userid}']
}
}
```
5. Audit Logging:
- DynamoDB Streams for change tracking
- CloudWatch Logs for API calls
- CloudTrail for access logging
Question 2: How do you optimize DynamoDB costs in production?
Answer:
1. Capacity Mode Selection:
```javascript
// Analyze usage patterns
const variability = (peak - average) / average;
if (variability > 2) {
// High variability -> On-Demand
} else {
// Predictable -> Provisioned (60% cheaper)
}
```
2. Reserved Capacity:
- 1-year commitment: up to ~50% savings
- 3-year commitment: up to ~77% savings
3. Item Size Optimization:
```javascript
// Minimize item size
// Use shorter attribute names
{
uid: '123', // Not userId
nm: 'Alice', // Not name
em: '[email protected]' // Not email
}
```
4. Efficient Reads:
```javascript
// Use ProjectionExpression
ProjectionExpression: 'id, name' // Not full item
// Use BatchGetItem (not multiple GetItems)
// Use eventually consistent reads (50% cheaper)
```
5. TTL for Automatic Cleanup:
```javascript
// Auto-delete old items (no cost)
ttl: Math.floor(Date.now() / 1000) + (30 * 24 * 60 * 60)
```
6. Cost Monitoring:
```javascript
// AWS Budgets for cost alerts
// CloudWatch metrics for capacity utilization
// Cost Explorer for trend analysis
```
Question 3: How do you handle schema migrations in production?
Answer:
1. Additive Changes (Safe):
```javascript
// Just add new attributes
await dynamodb.update({
Key: { userId: '123' },
UpdateExpression: 'SET newField = :val',
ExpressionAttributeValues: { ':val': 'default' }
}).promise();
// Old code ignores new field
// New code uses new field
```
2. Schema Versioning:
```javascript
// Add schema version
{
userId: '123',
schemaVersion: '2.0',
// ... fields
}
// Handle multiple versions
function parseUser(item) {
switch (item.schemaVersion) {
case '1.0':
return migrateV1ToV2(item);
case '2.0':
return item;
default:
throw new Error('Unknown schema version');
}
}
```
3. Lazy Migration (migrate on read):
```javascript
// Migrate on read/write
async function getUser(userId) {
const item = await dynamodb.get({...}).promise();
if (needsMigration(item.Item)) {
const migrated = migrateSchema(item.Item);
await dynamodb.put({
TableName: 'Users',
Item: migrated
}).promise();
return migrated;
}
return item.Item;
}
```
4. Adding Indexes Without Downtime:
```javascript
// Add GSI without downtime
await dynamodb.updateTable({
TableName: 'Users',
GlobalSecondaryIndexUpdates: [{
Create: {
IndexName: 'EmailIndex',
KeySchema: [
{ AttributeName: 'email', KeyType: 'HASH' }
],
Projection: { ProjectionType: 'ALL' }
}
}]
}).promise();
// Backfill existing items
```
5. Breaking Changes (new table migration):
```javascript
// 1. Create new table
// 2. Dual-write to both tables
// 3. Backfill old data
// 4. Migrate reads to new table
// 5. Stop writes to old table
// 6. Delete old table
Question 4: What’s your debugging strategy for production DynamoDB issues?
Answer:
Step 1: Enable Comprehensive Logging:
```javascript
// X-Ray tracing
const AWSXRay = require('aws-xray-sdk');
const AWS = AWSXRay.captureAWS(require('aws-sdk'));
// CloudWatch Logs
AWS.config.logger = console;
```
Step 2: Check CloudWatch Metrics:
```javascript
// Key metrics to check:
// - ReadThrottleEvents / WriteThrottleEvents (throttling)
// - UserErrors (client-side 4xx), SystemErrors (service issues)
// - ConsumedReadCapacityUnits / ConsumedWriteCapacityUnits
// - SuccessfulRequestLatency
```
Step 3: Enable Contributor Insights:
```javascript
// Identify hot partitions
await dynamodb.updateContributorInsights({
TableName: 'Users',
ContributorInsightsAction: 'ENABLE'
}).promise();
```
Step 4: Instrument Slow Queries:
```javascript
// Add detailed logging
const start = Date.now();
const result = await dynamodb.query({
...params,
ReturnConsumedCapacity: 'TOTAL'
}).promise();
console.log({
duration: Date.now() - start,
itemCount: result.Items.length,
scannedCount: result.ScannedCount,
consumedCapacity: result.ConsumedCapacity,
efficiency: result.Items.length / result.ScannedCount
});
```
Step 5: Triage by Symptom:
```javascript
// Throttling?
//   - Check provisioned capacity vs consumed
//   - Look for hot partitions
//   - Implement exponential backoff
// High latency?
//   - Check query efficiency (scanned vs returned)
//   - Consider caching (DAX or Redis)
//   - Use ProjectionExpression
// Errors?
//   - Check error code and message
//   - Verify IAM permissions
//   - Validate attribute types
```
Question 5: Design a production-ready DynamoDB architecture for a high-traffic application.
Answer:
```javascript
// Multi-region, highly available architecture
const productionArchitecture = {
// 1. Data Layer
dataLayer: {
// Primary table with Global Tables
mainTable: {
name: 'Production',
billingMode: 'PAY_PER_REQUEST', // Auto-scales
globalTables: ['us-east-1', 'us-west-2', 'eu-west-1'],
pointInTimeRecovery: true, // 35-day backup
encryption: {
type: 'KMS',
keyId: 'app-key'
},
streams: {
enabled: true,
viewType: 'NEW_AND_OLD_IMAGES'
}
},
// Caching layer
cache: {
dax: {
clusterSize: 3, // Multi-AZ
nodeType: 'dax.r5.large',
ttl: 300
},
redis: {
clusterMode: true,
replicasPerShard: 2,
useCase: 'Aggregations, leaderboards'
}
}
},
// 2. Application Layer
applicationLayer: {
// Connection pool
connectionPool: {
maxConnections: 50,
keepAlive: true,
timeout: 5000
},
// Retry logic
retryPolicy: {
maxRetries: 5,
exponentialBackoff: true,
jitter: true
},
// Circuit breaker
circuitBreaker: {
failureThreshold: 5,
resetTimeout: 60000
}
},
// 3. Monitoring
monitoring: {
cloudWatch: {
alarms: [
'ReadThrottleEvents > 10',
'WriteThrottleEvents > 10',
'SystemErrors > 5',
'SuccessfulRequestLatency > 100ms'
],
dashboards: [
'Capacity utilization',
'Latency trends',
'Error rates'
]
},
xray: {
enabled: true,
samplingRate: 0.1 // 10% of requests
},
customMetrics: {
businessMetrics: [
'Orders per second',
'Active users',
'Transaction success rate'
]
}
},
// 4. Security
security: {
iam: {
leastPrivilege: true,
itemLevelAccess: true
},
encryption: {
atRest: 'KMS',
inTransit: 'TLS',
clientSide: 'Sensitive fields only'
},
vpcEndpoints: {
enabled: true,
privateAccess: true
},
auditLogging: {
streams: true,
cloudTrail: true,
retention: 90 // days
}
},
// 5. Disaster Recovery
disasterRecovery: {
rto: '< 2 minutes',
rpo: '< 1 second',
strategy: {
multiRegion: true,
automaticFailover: true,
healthChecks: {
interval: 30, // seconds
failureThreshold: 3
}
},
backups: {
pitr: true,
onDemand: 'daily',
s3Export: 'weekly'
},
testing: {
drDrills: 'quarterly',
chaosEngineering: true
}
},
// 6. Cost Optimization
costOptimization: {
capacityMode: 'Analyze and adjust monthly',
reservedCapacity: 'For baseline load',
ttl: 'Auto-cleanup old data',
itemSize: 'Minimized with compression',
budgets: {
monthly: 10000,
alerts: [80, 90, 100] // % of budget
}
}
};
// Implementation example
class ProductionDynamoDBService {
constructor() {
this.dynamodb = this.createClient();
this.dax = this.createDAXClient(); // assumes an amazon-dax-client setup (not shown)
this.redis = new Redis({ cluster: true });
this.circuitBreaker = new CircuitBreaker();
}
createClient() {
return new AWS.DynamoDB.DocumentClient({
region: process.env.AWS_REGION,
maxRetries: 5,
httpOptions: {
timeout: 5000,
agent: new https.Agent({
keepAlive: true,
maxSockets: 50
})
}
});
}
async getItem(key) {
// Layer 1: Check Redis
const cached = await this.redis.get(JSON.stringify(key));
if (cached) return JSON.parse(cached);
// Layer 2: Check DAX
try {
const result = await this.dax.get({
TableName: 'Production',
Key: key
}).promise();
if (result.Item) {
await this.redis.setex(JSON.stringify(key), 300, JSON.stringify(result.Item));
return result.Item;
}
} catch (error) {
// DAX failed, fall through to DynamoDB
}
// Layer 3: DynamoDB with circuit breaker
return await this.circuitBreaker.execute(async () => {
const result = await this.dynamodb.get({
TableName: 'Production',
Key: key
}).promise();
return result.Item;
});
}
}
```
Key elements:
- Multi-region Global Tables
- Multi-layer caching (DAX + Redis)
- Circuit breaker protection
- Comprehensive monitoring
- Automated failover
- Cost optimization
- Regular DR testing
Summary
Production Checklist:
- Security:
  - KMS encryption enabled
  - Least-privilege IAM policies
  - VPC endpoints configured
  - Audit logging enabled
  - Client-side encryption for sensitive data
- Cost Optimization:
  - Right-sized capacity mode
  - Reserved capacity for baseline
  - TTL for auto-cleanup
  - Cost monitoring and alerts
  - Regular capacity reviews
- Reliability:
  - Point-in-Time Recovery enabled
  - Global Tables for DR
  - Automated backups
  - Circuit breakers implemented
  - Graceful degradation
- Performance:
  - Caching layer (DAX/Redis)
  - Connection pooling
  - Parallel queries
  - Optimized data model
  - Projection expressions
- Monitoring:
  - CloudWatch alarms
  - X-Ray tracing
  - Custom metrics
  - Health checks
  - Regular reviews