---
layout: default
title: "Chapter 8: Production Deployment"
parent: LangChain Tutorial
nav_order: 8
---
Welcome to production deployment! In this chapter, we'll explore how to deploy LangChain applications at scale, implement robust monitoring systems, ensure security and compliance, and optimize performance for production workloads. You'll learn the essential practices for running LangChain applications in real-world environments.
We start with the application core: a configuration object plus a production wrapper around the chains.

```python
from typing import Dict, Any, List, Optional
from fastapi import FastAPI, Request, HTTPException
from pydantic import BaseModel
import uvicorn
import logging
from contextlib import asynccontextmanager
import asyncio
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import time
import psutil
import threading


class ProductionConfig:
    """Production configuration management"""

    def __init__(self):
        self.app_name = "LangChain-API"
        self.version = "1.0.0"
        self.environment = "production"
        self.max_concurrent_requests = 10
        self.request_timeout = 30
        self.rate_limit_requests = 100
        self.rate_limit_window = 60  # seconds

        # Model configuration
        self.model_name = "gpt-4"
        self.temperature = 0.7
        self.max_tokens = 1000

        # Monitoring
        self.enable_monitoring = True
        self.metrics_port = 9090

        # Security
        self.enable_auth = True
        self.api_keys = set()  # Load from secure storage

    def load_from_env(self):
        """Load configuration from environment variables"""
        import os
        self.max_concurrent_requests = int(os.getenv("MAX_CONCURRENT_REQUESTS", "10"))
        self.model_name = os.getenv("MODEL_NAME", "gpt-4")
        self.enable_auth = os.getenv("ENABLE_AUTH", "true").lower() == "true"


class ProductionLangChainApp:
    """Production-ready LangChain application"""

    def __init__(self, config: ProductionConfig):
        self.config = config
        self.llm = None
        self.chains = {}
        # These three classes are defined later in this chapter
        self.monitoring = ProductionMonitoring()
        self.security = SecurityManager(config)
        self.rate_limiter = RateLimiter(
            requests=config.rate_limit_requests,
            window=config.rate_limit_window
        )

        # Request tracking
        self.active_requests = 0
        self.request_queue = asyncio.Queue(maxsize=config.max_concurrent_requests)

        # Initialize components
        self._initialize_components()

    def _initialize_components(self):
        """Initialize application components"""
        # Initialize LLM
        self.llm = OpenAI(
            model_name=self.config.model_name,
            temperature=self.config.temperature,
            max_tokens=self.config.max_tokens,
            openai_api_key=self._get_api_key()
        )

        # Initialize chains
        self._initialize_chains()

        # Start background tasks
        self._start_background_tasks()

    def _get_api_key(self) -> str:
        """Securely retrieve the API key"""
        # In production, use a secret management service
        import os
        return os.environ.get("OPENAI_API_KEY", "")

    def _initialize_chains(self):
        """Initialize production chains"""
        # Basic Q&A chain
        qa_prompt = PromptTemplate(
            input_variables=["question"],
            template="""Answer the following question accurately and concisely:
Question: {question}
Answer:"""
        )
        self.chains["qa"] = LLMChain(llm=self.llm, prompt=qa_prompt)

        # Summarization chain
        summary_prompt = PromptTemplate(
            input_variables=["text"],
            template="""Summarize the following text in 2-3 sentences:
Text: {text}
Summary:"""
        )
        self.chains["summarize"] = LLMChain(llm=self.llm, prompt=summary_prompt)

    def _start_background_tasks(self):
        """Start background monitoring and maintenance tasks"""
        # Start monitoring thread
        monitoring_thread = threading.Thread(target=self._monitoring_worker, daemon=True)
        monitoring_thread.start()

    def _monitoring_worker(self):
        """Background monitoring worker"""
        while True:
            self.monitoring.record_system_metrics({
                "active_requests": self.active_requests,
                "cpu_usage": psutil.cpu_percent(),
                "memory_usage": psutil.virtual_memory().percent,
                "queue_size": self.request_queue.qsize()
            })
            time.sleep(5)  # Update every 5 seconds

    async def process_request(self, chain_name: str, inputs: Dict[str, Any],
                              api_key: Optional[str] = None) -> Dict[str, Any]:
        """Process a request through the application"""
        start_time = time.time()
        slot_acquired = False
        try:
            # Authentication
            if self.config.enable_auth:
                if not api_key or not self.security.validate_api_key(api_key):
                    raise HTTPException(status_code=401, detail="Invalid API key")

            # Rate limiting
            if not self.rate_limiter.allow_request(api_key or "anonymous"):
                raise HTTPException(status_code=429, detail="Rate limit exceeded")

            # Queue management
            await self.request_queue.put(None)  # Acquire slot
            slot_acquired = True
            self.active_requests += 1

            # Validate chain exists
            if chain_name not in self.chains:
                raise HTTPException(status_code=404, detail=f"Chain '{chain_name}' not found")

            # Execute chain with timeout
            chain = self.chains[chain_name]
            result = await asyncio.wait_for(
                self._execute_chain(chain, inputs),
                timeout=self.config.request_timeout
            )

            # Record metrics
            execution_time = time.time() - start_time
            self.monitoring.record_request_metrics(
                chain_name, execution_time, True, len(str(inputs)), len(str(result))
            )

            return {
                "success": True,
                "result": result,
                "execution_time": execution_time,
                "chain": chain_name
            }

        except asyncio.TimeoutError:
            self.monitoring.record_request_metrics(chain_name, time.time() - start_time, False, 0, 0)
            raise HTTPException(status_code=408, detail="Request timeout")
        except HTTPException:
            # Preserve deliberate HTTP errors (401, 404, 429, ...) instead of
            # letting the generic handler below turn them into 500s
            raise
        except Exception as e:
            execution_time = time.time() - start_time
            self.monitoring.record_request_metrics(chain_name, execution_time, False, 0, 0)
            raise HTTPException(status_code=500, detail=str(e))
        finally:
            # Release the queue slot only if we actually acquired one
            if slot_acquired:
                try:
                    self.request_queue.get_nowait()
                except asyncio.QueueEmpty:
                    pass
                else:
                    self.active_requests -= 1

    async def _execute_chain(self, chain, inputs: Dict[str, Any]) -> Any:
        """Execute a chain with proper error handling"""
        try:
            if hasattr(chain, 'arun'):
                return await chain.arun(**inputs)
            else:
                return chain.run(**inputs)
        except Exception as e:
            self.monitoring.record_error(str(e))
            raise

    def get_health_status(self) -> Dict[str, Any]:
        """Get application health status"""
        return {
            "status": "healthy",
            "version": self.config.version,
            "active_requests": self.active_requests,
            "queue_size": self.request_queue.qsize(),
            "chains_available": list(self.chains.keys()),
            "uptime": time.time() - getattr(self, '_start_time', time.time())
        }


# Global application instance
config = ProductionConfig()
app_instance = ProductionLangChainApp(config)
```
Next, we expose the application through a FastAPI service:

```python
from typing import Any, Dict, List, Optional
from contextlib import asynccontextmanager
import logging
import time

import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pydantic import BaseModel

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Startup and shutdown events
@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup
    app_instance._start_time = time.time()
    logger.info("Application startup complete")
    yield
    # Shutdown
    logger.info("Application shutting down")


# Create FastAPI app
app = FastAPI(
    title=config.app_name,
    version=config.version,
    description="Production LangChain API",
    lifespan=lifespan
)

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure appropriately for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Request/Response models
class ChainRequest(BaseModel):
    chain_name: str
    inputs: Dict[str, Any]
    api_key: Optional[str] = None


class ChainResponse(BaseModel):
    success: bool
    result: Optional[Any] = None
    execution_time: Optional[float] = None
    chain: Optional[str] = None
    error: Optional[str] = None


class HealthResponse(BaseModel):
    status: str
    version: str
    active_requests: int
    queue_size: int
    chains_available: List[str]
    uptime: float


# API endpoints
@app.post("/api/chain/execute", response_model=ChainResponse)
async def execute_chain(request: ChainRequest):
    """Execute a LangChain chain"""
    try:
        result = await app_instance.process_request(
            request.chain_name,
            request.inputs,
            request.api_key
        )
        return ChainResponse(**result)
    except HTTPException as e:
        return ChainResponse(
            success=False,
            error=e.detail,
            result=None
        )
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        return ChainResponse(
            success=False,
            error="Internal server error",
            result=None
        )


@app.get("/api/health", response_model=HealthResponse)
async def health_check():
    """Health check endpoint"""
    return HealthResponse(**app_instance.get_health_status())


@app.get("/api/chains")
async def list_chains():
    """List available chains"""
    return {
        "chains": list(app_instance.chains.keys()),
        "count": len(app_instance.chains)
    }


@app.get("/api/metrics")
async def get_metrics(api_key: Optional[str] = None):
    """Get application metrics"""
    if config.enable_auth and not app_instance.security.validate_api_key(api_key):
        raise HTTPException(status_code=401, detail="Invalid API key")
    return app_instance.monitoring.get_metrics()


# Error handlers
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    # Exception handlers must return an actual Response object
    return JSONResponse(
        status_code=exc.status_code,
        content=ChainResponse(success=False, error=exc.detail).dict()
    )


if __name__ == "__main__":
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=8000,
        workers=4,  # Multiple workers for production
        reload=False
    )
```
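A quick way to exercise the API is a minimal client sketch. It assumes the service is running locally on port 8000 with authentication disabled; the `requests` dependency and the example question are illustrative, not part of the chapter's code.

```python
# Hypothetical client call against the API above; assumes the service
# runs on localhost:8000 and ENABLE_AUTH=false.
import requests

response = requests.post(
    "http://localhost:8000/api/chain/execute",
    json={
        "chain_name": "qa",
        "inputs": {"question": "What is LangChain?"},
    },
    timeout=35,  # slightly above the server's 30s request timeout
)
payload = response.json()
if payload["success"]:
    print(payload["result"], f"({payload['execution_time']:.2f}s)")
else:
    print("Request failed:", payload["error"])
```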
The monitoring layer referenced above collects request, system, and error metrics:

```python
import json
import time
from collections import defaultdict, deque
from typing import Any, Dict, List, Optional


class ProductionMonitoring:
    """Production monitoring and metrics collection"""

    def __init__(self):
        self.request_metrics = defaultdict(list)
        self.system_metrics = deque(maxlen=1000)  # Keep last 1000 readings
        self.error_logs = deque(maxlen=100)
        self.start_time = time.time()

    def record_request_metrics(self, chain_name: str, execution_time: float,
                               success: bool, input_size: int, output_size: int):
        """Record request execution metrics"""
        metric = {
            "timestamp": time.time(),
            "chain_name": chain_name,
            "execution_time": execution_time,
            "success": success,
            "input_size": input_size,
            "output_size": output_size
        }
        self.request_metrics[chain_name].append(metric)

        # Keep only recent metrics (last 1000 per chain)
        if len(self.request_metrics[chain_name]) > 1000:
            self.request_metrics[chain_name].pop(0)

    def record_system_metrics(self, metrics: Dict[str, Any]):
        """Record system-level metrics"""
        metrics["timestamp"] = time.time()
        self.system_metrics.append(metrics)

    def record_error(self, error: str, context: Optional[Dict[str, Any]] = None):
        """Record application errors"""
        error_record = {
            "timestamp": time.time(),
            "error": error,
            "context": context or {}
        }
        self.error_logs.append(error_record)

    def get_metrics(self) -> Dict[str, Any]:
        """Get comprehensive metrics"""
        return {
            "uptime": time.time() - self.start_time,
            "request_metrics": self._aggregate_request_metrics(),
            "system_metrics": self._get_latest_system_metrics(),
            "error_summary": self._get_error_summary(),
            "performance_indicators": self._calculate_performance_indicators()
        }

    def _aggregate_request_metrics(self) -> Dict[str, Any]:
        """Aggregate request metrics by chain"""
        aggregated = {}
        for chain_name, metrics in self.request_metrics.items():
            if not metrics:
                continue
            successful_requests = len([m for m in metrics if m["success"]])
            total_requests = len(metrics)
            avg_execution_time = sum(m["execution_time"] for m in metrics) / total_requests
            aggregated[chain_name] = {
                "total_requests": total_requests,
                "successful_requests": successful_requests,
                "success_rate": successful_requests / total_requests,
                "average_execution_time": avg_execution_time,
                "requests_per_minute": self._calculate_rpm(metrics)
            }
        return aggregated

    def _calculate_rpm(self, metrics: List[Dict[str, Any]]) -> float:
        """Calculate requests per minute"""
        if not metrics:
            return 0
        # Use the last 10 minutes of data
        cutoff_time = time.time() - 600
        recent_metrics = [m for m in metrics if m["timestamp"] > cutoff_time]
        if not recent_metrics:
            return 0
        time_span = time.time() - recent_metrics[0]["timestamp"]
        if time_span == 0:
            return 0
        return len(recent_metrics) / (time_span / 60)

    def _get_latest_system_metrics(self) -> Dict[str, Any]:
        """Get latest system metrics"""
        if not self.system_metrics:
            return {}
        return self.system_metrics[-1]

    def _get_error_summary(self) -> Dict[str, Any]:
        """Get error summary"""
        recent_errors = [e for e in self.error_logs if time.time() - e["timestamp"] < 3600]  # Last hour
        return {
            "total_recent_errors": len(recent_errors),
            "error_rate": len(recent_errors) / 60,  # Errors per minute, averaged over the hour
            "latest_errors": recent_errors[-5:] if recent_errors else []
        }

    def _calculate_performance_indicators(self) -> Dict[str, Any]:
        """Calculate key performance indicators"""
        indicators = {
            "overall_success_rate": 0,
            "average_response_time": 0,
            "system_health_score": 0
        }

        # Calculate overall success rate
        total_requests = 0
        total_successful = 0
        for metrics in self.request_metrics.values():
            successful = len([m for m in metrics if m["success"]])
            total_requests += len(metrics)
            total_successful += successful
        if total_requests > 0:
            indicators["overall_success_rate"] = total_successful / total_requests

        # Calculate average response time
        if self.request_metrics:
            total_time = 0
            total_count = 0
            for metrics in self.request_metrics.values():
                total_time += sum(m["execution_time"] for m in metrics)
                total_count += len(metrics)
            if total_count > 0:
                indicators["average_response_time"] = total_time / total_count

        # System health score (0-100)
        success_score = indicators["overall_success_rate"] * 50
        response_time_score = max(0, 50 - (indicators["average_response_time"] * 10))
        indicators["system_health_score"] = success_score + response_time_score

        return indicators

    def export_metrics(self, format: str = "json") -> str:
        """Export metrics for external monitoring systems"""
        metrics = self.get_metrics()
        if format == "json":
            return json.dumps(metrics, indent=2)
        elif format == "prometheus":
            return self._format_prometheus(metrics)
        else:
            return str(metrics)

    def _format_prometheus(self, metrics: Dict[str, Any]) -> str:
        """Format metrics for Prometheus"""
        prometheus_output = []
        # Request metrics
        for chain_name, chain_metrics in metrics.get("request_metrics", {}).items():
            prometheus_output.append(f'# HELP langchain_requests_total Total requests for {chain_name}')
            prometheus_output.append('# TYPE langchain_requests_total counter')
            prometheus_output.append(f'langchain_requests_total{{chain="{chain_name}"}} {chain_metrics["total_requests"]}')
            prometheus_output.append(f'# HELP langchain_success_rate Success rate for {chain_name}')
            prometheus_output.append('# TYPE langchain_success_rate gauge')
            prometheus_output.append(f'langchain_success_rate{{chain="{chain_name}"}} {chain_metrics["success_rate"]}')
        return "\n".join(prometheus_output)
```
Security management covers API keys, IP blocking, and abuse detection:

```python
import hashlib
import time
from collections import defaultdict
from typing import Any, Dict, List, Optional, Set


class SecurityManager:
    """Security management for the application"""

    def __init__(self, config: ProductionConfig):
        self.config = config
        self.api_keys: Set[str] = set()
        self.blocked_ips: Set[str] = set()
        self.suspicious_activity: Dict[str, List] = defaultdict(list)

        # Load API keys from secure storage
        self._load_api_keys()

    def _load_api_keys(self):
        """Load API keys from secure storage"""
        # In production, load from a secret management service.
        # For this demo, we accept any key.
        pass

    def validate_api_key(self, api_key: Optional[str]) -> bool:
        """Validate an API key"""
        if not self.config.enable_auth:
            return True
        if not api_key:
            return False
        # Hash the API key for comparison (don't store plain text)
        key_hash = hashlib.sha256(api_key.encode()).hexdigest()
        # In production, compare key_hash against stored hashes
        return True  # Placeholder

    def register_api_key(self, api_key: str, owner: str):
        """Register a new API key"""
        key_hash = hashlib.sha256(api_key.encode()).hexdigest()
        # Store in secure storage
        self.api_keys.add(key_hash)

    def revoke_api_key(self, api_key: str):
        """Revoke an API key"""
        key_hash = hashlib.sha256(api_key.encode()).hexdigest()
        self.api_keys.discard(key_hash)

    def block_ip(self, ip_address: str):
        """Block an IP address"""
        self.blocked_ips.add(ip_address)

    def is_ip_blocked(self, ip_address: str) -> bool:
        """Check if an IP is blocked"""
        return ip_address in self.blocked_ips

    def record_suspicious_activity(self, ip_address: str, activity: str):
        """Record suspicious activity"""
        self.suspicious_activity[ip_address].append({
            "activity": activity,
            "timestamp": time.time()
        })
        # Auto-block if too many suspicious activities
        if len(self.suspicious_activity[ip_address]) > 5:
            self.block_ip(ip_address)

    def get_security_report(self) -> Dict[str, Any]:
        """Get security status report"""
        return {
            "authentication_enabled": self.config.enable_auth,
            "total_api_keys": len(self.api_keys),
            "blocked_ips": len(self.blocked_ips),
            "suspicious_activities": dict(self.suspicious_activity)
        }
```
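A brief usage sketch for `SecurityManager`, reusing the `config` object from the first listing; the key value, owner name, and IP address are made up for the example.

```python
# Illustrative use outside the request path; validate_api_key currently
# returns True as a placeholder, as noted in the class above.
security = SecurityManager(config)
security.register_api_key("demo-key-123", owner="team-alpha")

print(security.validate_api_key("demo-key-123"))  # True (placeholder logic)
security.record_suspicious_activity("203.0.113.7", "repeated 401 responses")
print(security.get_security_report())
```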
Rate limiting uses a sliding window of per-client request timestamps:

```python
import time
from collections import defaultdict, deque
from typing import Dict


class RateLimiter:
    """Rate limiting implementation"""

    def __init__(self, requests: int, window: int):
        self.requests = requests
        self.window = window
        self.request_history: Dict[str, deque] = defaultdict(lambda: deque(maxlen=requests))

    def allow_request(self, identifier: str) -> bool:
        """Check if a request is allowed"""
        now = time.time()
        request_times = self.request_history[identifier]

        # Remove old requests outside the window
        while request_times and now - request_times[0] > self.window:
            request_times.popleft()

        # Check if under limit
        if len(request_times) < self.requests:
            request_times.append(now)
            return True
        return False

    def get_remaining_requests(self, identifier: str) -> int:
        """Get remaining requests for the current window"""
        now = time.time()
        request_times = self.request_history[identifier]

        # Clean old requests
        while request_times and now - request_times[0] > self.window:
            request_times.popleft()

        return max(0, self.requests - len(request_times))

    def get_reset_time(self, identifier: str) -> float:
        """Get time until the rate limit resets"""
        request_times = self.request_history[identifier]
        if not request_times:
            return 0
        oldest_request = request_times[0]
        return max(0, self.window - (time.time() - oldest_request))
```
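A short standalone demonstration of the sliding window; the limits and the client identifier are arbitrary.

```python
# 3 requests per 10-second window for a single caller.
limiter = RateLimiter(requests=3, window=10)

for i in range(5):
    allowed = limiter.allow_request("client-a")
    remaining = limiter.get_remaining_requests("client-a")
    print(f"request {i + 1}: allowed={allowed}, remaining={remaining}")
# The 4th and 5th calls are rejected until the window slides past the
# oldest timestamp (see get_reset_time).
```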
A small cache layer avoids paying for repeated LLM calls on identical inputs:

```python
import hashlib
import json
import time
from typing import Any, Dict, Optional


class CacheManager:
    """Caching system for expensive operations"""

    def __init__(self, max_size: int = 1000, ttl: int = 3600):
        self.cache: Dict[str, Dict[str, Any]] = {}
        self.max_size = max_size
        self.ttl = ttl

    def get(self, key: str) -> Optional[Any]:
        """Get cached value"""
        if key in self.cache:
            entry = self.cache[key]
            if time.time() - entry["timestamp"] < self.ttl:
                return entry["value"]
            else:
                # Expired, remove it
                del self.cache[key]
        return None

    def set(self, key: str, value: Any):
        """Set cached value"""
        # Evict oldest entries if cache is full
        if len(self.cache) >= self.max_size:
            self._evict_oldest()
        self.cache[key] = {
            "value": value,
            "timestamp": time.time()
        }

    def generate_key(self, *args, **kwargs) -> str:
        """Generate cache key from arguments"""
        key_data = json.dumps({"args": args, "kwargs": kwargs}, sort_keys=True)
        return hashlib.md5(key_data.encode()).hexdigest()

    def _evict_oldest(self):
        """Evict the oldest cache entry"""
        if not self.cache:
            return
        oldest_key = min(self.cache.keys(),
                         key=lambda k: self.cache[k]["timestamp"])
        del self.cache[oldest_key]

    def clear_expired(self):
        """Clear expired cache entries"""
        current_time = time.time()
        expired_keys = [
            key for key, entry in self.cache.items()
            if current_time - entry["timestamp"] >= self.ttl
        ]
        for key in expired_keys:
            del self.cache[key]


# Integration with LangChain
class CachedLLMChain:
    """LLMChain with caching"""

    def __init__(self, chain, cache_manager: CacheManager):
        self.chain = chain
        self.cache = cache_manager

    async def arun(self, **kwargs) -> Any:
        """Run chain with caching"""
        cache_key = self.cache.generate_key(**kwargs)

        # Check cache first
        cached_result = self.cache.get(cache_key)
        if cached_result is not None:
            return cached_result

        # Execute chain
        result = await self.chain.arun(**kwargs)

        # Cache result
        self.cache.set(cache_key, result)
        return result
```
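Wiring the cache into one of the chains from the first listing might look like the sketch below; the TTL, cache size, and example question are assumptions, and a real run requires a valid `OPENAI_API_KEY`.

```python
# Hedged sketch: wrap the app's Q&A chain with the cache layer.
# Assumes app_instance from the first listing in this chapter.
import asyncio

cache = CacheManager(max_size=500, ttl=1800)  # 30-minute TTL (assumption)
cached_qa = CachedLLMChain(app_instance.chains["qa"], cache)

async def main():
    # The second call with the same question is served from the cache
    # instead of triggering another LLM call.
    first = await cached_qa.arun(question="What is LangChain?")
    second = await cached_qa.arun(question="What is LangChain?")
    assert first == second

asyncio.run(main())
```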
With the application complete, we can containerize it for deployment:

```dockerfile
# Dockerfile for production deployment
FROM python:3.9-slim

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1

# Create app directory
WORKDIR /app

# Install system dependencies (curl is needed for the health check below)
RUN apt-get update && apt-get install -y \
    gcc \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create non-root user
RUN useradd --create-home --shell /bin/bash app \
    && chown -R app:app /app
USER app

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/api/health || exit 1

# Expose port
EXPOSE 8000

# Run application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"]
```
A Docker Compose file ties the API together with Prometheus and Grafana:

```yaml
version: '3.8'

services:
  langchain-api:
    build: .
    ports:
      - "8000:8000"
    environment:
      - OPENAI_API_KEY=${OPENAI_API_KEY}
      - MAX_CONCURRENT_REQUESTS=20
      - MODEL_NAME=gpt-4
      - ENABLE_AUTH=true
    volumes:
      - ./logs:/app/logs
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 4G
        reservations:
          cpus: '1.0'
          memory: 2G

  prometheus:
    image: prom/prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'

  grafana:
    image: grafana/grafana
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana_data:/var/lib/grafana

volumes:
  grafana_data:
```

Congratulations! 🎉 You've successfully learned about:
- Production Architecture - Scalable application structure and configuration
- REST API Integration - FastAPI application with proper error handling
- Monitoring and Observability - Comprehensive metrics and health monitoring
- Security and Rate Limiting - Authentication, authorization, and rate limiting
- Performance Optimization - Caching and optimization techniques
- Deployment and Scaling - Docker configuration and orchestration
Before deploying to production, work through these checklists:

Configuration:
- Environment variables configured
- API keys secured
- Database connections tested
- Logging configured
- Monitoring set up

Security:
- Authentication enabled
- HTTPS configured
- Rate limiting active
- Input validation in place
- Secrets management configured

Performance:
- Load testing completed
- Caching configured
- Database queries optimized
- Memory usage monitored
- Response times within limits

Monitoring:
- Health checks implemented
- Metrics collection active
- Alerting configured
- Log aggregation set up
- Performance dashboards created

Deployment:
- Docker images built
- Container orchestration configured
- Rolling deployment strategy ready
- Rollback plan documented
- Backup strategy in place
Your LangChain journey is complete! You now have the knowledge and tools to build and deploy production-ready AI applications. Here are some next steps:
- Explore Advanced Topics - LangChain has many more advanced features
- Build Real Applications - Create applications that solve real problems
- Contribute to the Community - Share your knowledge and help others
- Stay Updated - Follow LangChain developments and best practices
- Experiment and Innovate - Try new combinations of chains and tools
What production LangChain application will you build first? 🚀
As a recap, here is the end-to-end deployment flow this chapter builds toward:

```mermaid
flowchart TD
    A[LangChain app] --> B[LangServe REST API]
    B --> C[POST /chain/invoke]
    C --> D[Chain executes]
    D --> E[LangSmith tracing]
    E --> F[Trace stored]
    D --> G[Response]
    A --> H[Docker container]
    H --> I[Cloud deployment]
```
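This chapter wired the API by hand with FastAPI; the flowchart also mentions LangServe, which generates `/invoke`, `/batch`, and `/stream` endpoints for any runnable. A minimal sketch, assuming the `langserve` and `langchain-openai` packages are installed and `OPENAI_API_KEY` is set:

```python
# Minimal LangServe deployment matching the flow above.
# LangSmith tracing is enabled via env vars rather than code:
#   LANGCHAIN_TRACING_V2=true, LANGCHAIN_API_KEY=<your key>
from fastapi import FastAPI
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langserve import add_routes

prompt = ChatPromptTemplate.from_template("Answer concisely: {question}")
chain = prompt | ChatOpenAI(model="gpt-4")

app = FastAPI(title="LangServe deployment")
# Creates POST /chain/invoke, /chain/batch, /chain/stream
add_routes(app, chain, path="/chain")

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
```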
Most teams struggle here not because the code is hard to write, but because it is hard to draw clear boundaries between configuration, request handling, and metrics so behavior stays predictable as complexity grows.
In practical terms, this chapter helps you avoid three common failures:
- coupling core logic too tightly to one implementation path
- missing the handoff boundaries between setup, execution, and validation
- shipping changes without a clear rollback or observability strategy
After working through this chapter, you should be able to reason about Chapter 8: Production Deployment as an operating subsystem inside LangChain Tutorial: Building AI Applications with Large Language Models, with explicit contracts for inputs, state transitions, and outputs.
Use the implementation notes around `config`, `chain`, and `chain_name` as your checklist when adapting these patterns to your own repository.
Under the hood, Chapter 8: Production Deployment usually follows a repeatable control path:
- Context bootstrap: initialize runtime config and prerequisites for the application instance.
- Input normalization: shape incoming data so each chain receives stable contracts.
- Core execution: run the main logic branch and propagate intermediate state through the monitoring layer.
- Policy and safety checks: enforce limits, auth scopes, and failure boundaries.
- Output composition: return canonical result payloads for downstream consumers.
- Operational telemetry: emit the logs and metrics needed for debugging and performance tuning.

When debugging, walk this sequence in order and confirm each stage has explicit success and failure conditions; the sketch below maps the stages onto this chapter's classes.
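A schematic sketch of that control path, using the classes from this chapter; the `handle` function and its wiring are illustrative assumptions, not a fixed API.

```python
# Assumes ProductionConfig and ProductionLangChainApp from this chapter
# are importable; the stage numbers refer to the list above.
async def handle(app: ProductionLangChainApp, raw_request: dict) -> dict:
    # 2. Input normalization: shape the payload into stable contracts
    chain_name = raw_request.get("chain_name", "qa")
    inputs = raw_request.get("inputs", {})
    api_key = raw_request.get("api_key")

    # 3-5. Core execution, policy checks, and output composition are all
    # enforced inside process_request (auth, rate limit, queue, timeout)
    return await app.process_request(chain_name, inputs, api_key=api_key)

# 1. Context bootstrap happens once at startup, not per request:
#    config = ProductionConfig(); config.load_from_env()
#    app = ProductionLangChainApp(config)
# 6. Operational telemetry is emitted by the monitoring worker thread
#    and the per-request metrics recorded in process_request.
```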
Suggested trace strategy:
- search the upstream code for the classes used in this chapter (for example `ProductionLangChainApp`, `RateLimiter`, and `CacheManager`) to map concrete implementation paths
- compare claims in the docs against the actual runtime and configuration code before reusing these patterns in production