Deploying ML models to production requires careful attention to scalability, monitoring, and versioning. This guide walks through a production-ready deployment pipeline: first a container image for the model server, then the FastAPI application that loads models from the MLflow registry and serves predictions.
```dockerfile
# Dockerfile for ML model serving
FROM python:3.12-slim

# curl is used by the HEALTHCHECK below and is not included in the slim base image
RUN apt-get update && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .

# Health check: probe the /health endpoint so the orchestrator can restart an unresponsive container
HEALTHCHECK --interval=30s --timeout=10s \
    CMD curl -f http://localhost:8000/health || exit 1

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
```
The serving application referenced by the CMD above (`main.py`) is a small FastAPI app that loads the requested model version from the MLflow registry:

```python
from fastapi import FastAPI
from pydantic import BaseModel
import mlflow
import numpy as np

app = FastAPI()


class PredictionRequest(BaseModel):
    features: list[float]
    model_version: str = "latest"


class PredictionResponse(BaseModel):
    prediction: float
    confidence: float
    model_version: str


@app.post("/predict", response_model=PredictionResponse)
async def predict(request: PredictionRequest):
    # Loading from the registry on every request always serves the requested version,
    # but adds latency; in practice the model is usually cached or loaded at startup.
    model = mlflow.pyfunc.load_model(
        f"models:/production-model/{request.model_version}"
    )
    features = np.array([request.features])
    prediction = model.predict(features)
    return PredictionResponse(
        prediction=float(prediction[0]),
        confidence=0.95,  # placeholder; substitute a model-derived score where available
        model_version=request.model_version,
    )
```
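The Dockerfile's HEALTHCHECK polls a /health route that the application above does not define. A minimal sketch of such an endpoint follows; the route name comes from the HEALTHCHECK URL, and the response body is an assumption:

```python
from fastapi import FastAPI

app = FastAPI()  # in main.py this would be the same app instance defined above


@app.get("/health")
async def health() -> dict[str, str]:
    # Liveness-style check: the process is up and able to answer requests.
    # A readiness check could also verify that the MLflow registry is reachable.
    return {"status": "ok"}
```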

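Because the /predict handler loads the model from the registry on every call, a common optimization is to memoize loads per version; this is a sketch of that idea (the helper name and cache size are assumptions, not part of the original pipeline):

```python
from functools import lru_cache

import mlflow


@lru_cache(maxsize=4)
def get_model(version: str):
    # Repeated requests for the same version reuse the in-memory model
    # instead of hitting the MLflow registry each time.
    return mlflow.pyfunc.load_model(f"models:/production-model/{version}")
```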

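Once the container is running, the endpoint can be exercised with a short client script; a sketch using the requests library, with illustrative feature values:

```python
import requests

# Assumes the service is reachable on localhost:8000, matching the Dockerfile's CMD.
response = requests.post(
    "http://localhost:8000/predict",
    json={"features": [5.1, 3.5, 1.4, 0.2], "model_version": "latest"},
    timeout=10,
)
response.raise_for_status()
print(response.json())  # {"prediction": ..., "confidence": ..., "model_version": "latest"}
```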