Docker Deployment Guide
This document provides comprehensive guidance for deploying the Federated Learning Platform using Docker and Docker Compose across different environments.
Docker Architecture Overview
%% Deployment topology: Compose profiles, the shared bridge network, and the
%% named volumes that persist data across container restarts.
graph TB
    subgraph "Docker Compose Profiles"
        subgraph "Orchestrator Profile"
            FRONTEND[frontend<br/>Next.js Container]
            BACKEND[backend-fastapi<br/>FastAPI Container]
            MONGO[mongodb<br/>Database Container]
            OTEL[otel-collector<br/>Telemetry Container]
            TEMPO[tempo<br/>Tracing Container]
            GRAFANA[grafana<br/>Monitoring Container]
        end
        subgraph "Aggregator Profile"
            SUPERLINK[superlink<br/>Flower Hub]
            AGGREGATOR[aggregator<br/>FL Server]
        end
        subgraph "Client Profile"
            SUPERNODE[supernode<br/>FL Node]
            CLIENTAPP[clientapp<br/>FL Client]
            INFERENCE[inference<br/>Model Inference]
        end
    end
    subgraph "Docker Networks"
        FL_NETWORK[federated-learning-network<br/>Bridge Network]
    end
    subgraph "Docker Volumes"
        MONGO_DATA[mongodb_data]
        GRAFANA_DATA[grafana_data]
        TEMPO_DATA[tempo_data]
        FL_DATA[fl_data]
    end

    %% Network membership (matches the `networks:` entries in docker-compose.yml)
    FRONTEND --> FL_NETWORK
    BACKEND --> FL_NETWORK
    MONGO --> FL_NETWORK
    OTEL --> FL_NETWORK
    TEMPO --> FL_NETWORK
    GRAFANA --> FL_NETWORK
    SUPERLINK --> FL_NETWORK
    AGGREGATOR --> FL_NETWORK
    CLIENTAPP --> FL_NETWORK
    %% NOTE(review): supernode and inference have no network edge here; their
    %% service definitions are not part of the compose excerpt in this guide.

    %% Volume mounts
    MONGO --> MONGO_DATA
    GRAFANA --> GRAFANA_DATA
    TEMPO --> TEMPO_DATA
    AGGREGATOR --> FL_DATA
Container Specifications
Frontend Container (Next.js)
# frontend/Dockerfile
# Multi-stage build for the Next.js frontend:
#   development - installs dependencies and runs `npm run dev`
#   deps        - clean dependency install reused by the builder
#   builder     - runs `npm run build` with the public URLs baked in
#   runner      - runtime image that runs the standalone server as a non-root user
FROM node:18-alpine AS base

# Development stage
FROM base AS development
WORKDIR /app
# Copy manifests first so the dependency layer is cached until they change
COPY package.json package-lock.json* ./
RUN npm ci
COPY . .
EXPOSE 4000
CMD ["npm", "run", "dev"]

# Dependencies stage
FROM base AS deps
WORKDIR /app
COPY package.json package-lock.json* ./
RUN npm ci

# Builder stage
FROM base AS builder
WORKDIR /app
# NEXT_PUBLIC_* values are passed as build args and exported before the build runs
# NOTE(review): NODE_ENV defaults to "development" here even though this stage
# produces the production bundle - confirm production builds override it.
ARG NODE_ENV=development
ARG NEXT_PUBLIC_API_URL=http://localhost:8000
ARG NEXT_PUBLIC_WS_URL=ws://localhost:8000
ENV NODE_ENV=${NODE_ENV}
ENV NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
ENV NEXT_PUBLIC_WS_URL=${NEXT_PUBLIC_WS_URL}
ENV NEXT_TELEMETRY_DISABLED=1
COPY --from=deps /app/node_modules ./node_modules
COPY . .
RUN npm run build

# Production stage
FROM base AS runner
WORKDIR /app
ENV NODE_ENV=production
ENV NEXT_TELEMETRY_DISABLED=1
# Dedicated unprivileged account for the server process
RUN addgroup --system --gid 1001 nodejs
RUN adduser --system --uid 1001 nextjs
# Give every copied artifact the same owner (public/ was previously root-owned)
COPY --from=builder --chown=nextjs:nodejs /app/public ./public
# .next/standalone is only produced when next.config sets output: 'standalone'
COPY --from=builder --chown=nextjs:nodejs /app/.next/standalone ./
COPY --from=builder --chown=nextjs:nodejs /app/.next/static ./.next/static
USER nextjs
EXPOSE 4000
# key=value form; the space-separated `ENV PORT 4000` form is legacy
ENV PORT=4000
# Bind to all interfaces so the published port is reachable from outside the container
ENV HOSTNAME=0.0.0.0
CMD ["node", "server.js"]
Backend Container (FastAPI)
# backend/Dockerfile
# FastAPI backend image. Besides the API it ships the Docker CLI, the Docker
# Python SDK and Ansible.
FROM python:3.10-slim AS base

# Set environment variables
# No .pyc files, unbuffered stdout/stderr, and /app on the import path
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONPATH=/app

# Install system dependencies
# curl is required by the HEALTHCHECK below
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    docker.io \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies
# requirements.txt is copied on its own so this layer is cached until it changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install additional tools
# --no-cache-dir keeps the pip download cache out of the image, as above
RUN pip install --no-cache-dir ansible docker

# Copy application code
COPY . .

# Copy FL core
COPY fl-core/ /app/fl-core/

# Remove and recreate mlproject directory
RUN rm -rf /app/fl-core/mlproject && mkdir /app/fl-core/mlproject

# Create SSH directory for Ansible
RUN mkdir -p /root/.ssh

# Make scripts executable
RUN chmod +x start.sh

# Create non-root user for security
# NOTE(review): no USER instruction follows, so the container still runs as
# root. Presumably intentional, since docker-compose.yml mounts
# /var/run/docker.sock and /root/.ssh - confirm before adding `USER appuser`.
RUN groupadd -r appuser && useradd -r -g appuser appuser
RUN chown -R appuser:appuser /app

# Expose ports
EXPOSE 8000 8001

# Health check
# Marks the container unhealthy after 3 consecutive failed probes of /health
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Start application
CMD ["./start.sh"]
Federated Learning Containers
Server App Container
# fl-core/Dockerfile.serverapp
# Flower ServerApp image: the upstream flwr/serverapp base plus monitoring
# libraries and the project package from ./mlproject.
FROM flwr/serverapp:1.15.2

# Install monitoring dependencies
# NOTE(review): the image stays on root after this point; no USER instruction
# switches back - confirm this is intended.
USER root
# --no-cache-dir keeps pip's download cache out of the image layer
RUN pip install --no-cache-dir psutil \
    opentelemetry-api==1.20.0 \
    opentelemetry-sdk==1.20.0 \
    opentelemetry-exporter-otlp==1.20.0 \
    opentelemetry-instrumentation==0.41b0

WORKDIR /app
COPY --chown=app:app mlproject /app/mlproject

# Remove simulation dependencies and install project
# The sed expression blanks every pyproject.toml line that mentions flwr[simulation]
RUN sed -i 's/.*flwr\[simulation\].*//' /app/mlproject/pyproject.toml \
    && python -m pip install -U --no-cache-dir /app/mlproject

# Copy startup script
COPY --chown=app:app entrypoint-server.sh /app/entrypoint-server.sh
RUN chmod +x /app/entrypoint-server.sh

# Health check
# Healthy while a process whose command line matches "flwr-serverapp" exists
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD pgrep -f "flwr-serverapp" || exit 1

ENTRYPOINT ["/app/entrypoint-server.sh"]
Client App Container
# fl-core/Dockerfile.clientapp
# Flower ClientApp image: the upstream flwr/clientapp base plus a compiler
# toolchain, monitoring libraries and the project package from ./mlproject.
FROM flwr/clientapp:1.15.2

# Install build dependencies and monitoring tools
# NOTE(review): the image stays on root after this point; no USER instruction
# switches back - confirm this is intended.
USER root
# The apt lists are removed in the same layer; --no-cache-dir does the same for pip
RUN apt-get update && apt-get -y --no-install-recommends install \
    build-essential \
    && rm -rf /var/lib/apt/lists/* \
    && pip install --no-cache-dir psutil \
    opentelemetry-api==1.20.0 \
    opentelemetry-sdk==1.20.0 \
    opentelemetry-exporter-otlp==1.20.0 \
    opentelemetry-instrumentation==0.41b0

WORKDIR /app
COPY --chown=app:app mlproject /app/mlproject

# Install project dependencies
# The sed expression blanks every pyproject.toml line that mentions flwr[simulation]
RUN sed -i 's/.*flwr\[simulation\].*//' /app/mlproject/pyproject.toml \
    && python -m pip install -U --no-cache-dir /app/mlproject

# Copy startup script
COPY --chown=app:app entrypoint-client.sh /app/entrypoint-client.sh
RUN chmod +x /app/entrypoint-client.sh

# Health check
# Healthy while a process whose command line matches "flwr-clientapp" exists
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD pgrep -f "flwr-clientapp" || exit 1

ENTRYPOINT ["/app/entrypoint-client.sh"]
Docker Compose Configuration
Main Docker Compose File
# docker-compose.yml
# Every service is gated behind a Compose profile (orchestrator or aggregator),
# so a plain `up` without --profile starts none of them.
version: '3.8'

services:
  # Frontend Service
  frontend:
    build:
      context: ./frontend
      dockerfile: Dockerfile
      target: development
    ports:
      - "4000:4000"
    environment:
      - NODE_ENV=development
      - NEXT_PUBLIC_API_URL=http://localhost:8000
      - NEXT_PUBLIC_WS_URL=ws://localhost:8000
    volumes:
      # Bind-mount the sources; the anonymous volume keeps the image's
      # node_modules from being hidden by the bind mount
      - ./frontend:/app
      - /app/node_modules
    networks:
      - federated-learning-network
    profiles:
      - orchestrator
    depends_on:
      - backend-fastapi

  # Backend Service
  backend-fastapi:
    build:
      context: .
      dockerfile: backend/Dockerfile
    ports:
      - "8000:8000"
    environment:
      - MONGODB_URL=mongodb://mongodb:27017
      - DATABASE_NAME=federated_learning
      # Falls back to a development key when SECRET_KEY is unset or empty;
      # always set SECRET_KEY outside local development
      - SECRET_KEY=${SECRET_KEY:-dev-secret-key}
      - DEBUG=true
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
    volumes:
      - ./backend:/app
      # Gives the container access to the host Docker daemon
      - /var/run/docker.sock:/var/run/docker.sock
      # Host SSH keys, mounted read-only
      - ~/.ssh:/root/.ssh:ro
    networks:
      - federated-learning-network
    profiles:
      - orchestrator
    depends_on:
      - mongodb
      - otel-collector

  # Database Service
  mongodb:
    image: mongo:latest
    ports:
      - "27017:27017"
    environment:
      - MONGO_INITDB_DATABASE=federated_learning
    volumes:
      - mongodb_data:/data/db
    networks:
      - federated-learning-network
    profiles:
      - orchestrator
    healthcheck:
      test: ["CMD", "mongosh", "--eval", "db.adminCommand('ping')"]
      interval: 30s
      timeout: 10s
      retries: 3

  # OpenTelemetry Collector
  otel-collector:
    image: otel/opentelemetry-collector-contrib:latest
    command: ["--config=/etc/otel-collector-config.yaml"]
    volumes:
      - ./backend/otel-collector-config.yaml:/etc/otel-collector-config.yaml
    ports:
      - "4317:4317"  # OTLP gRPC receiver
      - "4318:4318"  # OTLP HTTP receiver
    networks:
      - federated-learning-network
    profiles:
      - orchestrator
    depends_on:
      - tempo

  # Tempo Tracing Backend
  tempo:
    image: grafana/tempo:latest
    command: ["-config.file=/etc/tempo.yaml"]
    volumes:
      - ./backend/tempo.yaml:/etc/tempo.yaml
      - tempo_data:/tmp/tempo
    ports:
      - "3200:3200"
    networks:
      - federated-learning-network
    profiles:
      - orchestrator

  # Grafana Monitoring
  grafana:
    image: grafana/grafana:latest
    ports:
      # Published on host port 3001; Grafana listens on 3000 in the container
      - "3001:3000"
    environment:
      # NOTE(review): default admin password - change outside local development
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana_data:/var/lib/grafana
      - ./backend/grafana/provisioning:/etc/grafana/provisioning
      - ./backend/grafana/dashboards:/var/lib/grafana/dashboards
    networks:
      - federated-learning-network
    profiles:
      - orchestrator
    depends_on:
      - tempo

  # Flower Superlink
  superlink:
    image: flwr/superlink:1.15.2
    command: ["--insecure"]
    ports:
      - "9091:9091"
      - "9093:9093"
    networks:
      - federated-learning-network
    profiles:
      - aggregator
    healthcheck:
      test: ["CMD", "nc", "-z", "localhost", "9091"]
      interval: 30s
      timeout: 10s
      retries: 3

  # FL Aggregator
  aggregator:
    build:
      context: ./fl-core
      dockerfile: Dockerfile.serverapp
    environment:
      # NOTE(review): superlink publishes 9091 and 9093 while this address
      # targets 9092 - confirm the port against entrypoint-server.sh
      - SUPERLINK_ADDRESS=superlink:9092
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
    ports:
      - "9092:9092"
    volumes:
      - fl_data:/app/data
    networks:
      - federated-learning-network
    profiles:
      - aggregator
    depends_on:
      - superlink

# Named volumes that persist data across container re-creation
volumes:
  mongodb_data:
  grafana_data:
  tempo_data:
  fl_data:

networks:
  federated-learning-network:
    driver: bridge
    ipam:
      config:
        - subnet: 172.16.0.0/16
Deployment Strategies
Development Deployment
# Start development environment
./setup-local-training.sh
# Or manually
docker-compose --profile orchestrator up -d
# View logs
docker-compose logs -f
# Scale services
# clientapp belongs to the client profile; without --profile the service is
# not enabled and --scale has nothing to start
docker-compose --profile client up -d --scale clientapp=3
Production Deployment
# Set production environment
export NODE_ENV=production
# Placeholder value - substitute a real secret and never commit it
export SECRET_KEY="your-production-secret-key"
export MONGODB_URL="mongodb://prod-mongo:27017"
# Build production images
docker-compose -f docker-compose.prod.yml build
# Deploy with production profile
docker-compose -f docker-compose.prod.yml --profile orchestrator up -d
# Health check
# Pass the same -f file, otherwise these commands read the default docker-compose.yml
docker-compose -f docker-compose.prod.yml ps
docker-compose -f docker-compose.prod.yml logs --tail=50
Multi-Node Deployment
# Orchestrator node: run the helper script with the "production" argument
./setup-and-run-orchestrator.sh production
# Aggregator node (separate machine): start only the aggregator-profile services, detached
docker-compose --profile aggregator up -d
# Client nodes (multiple machines): run on every client host to start the client-profile services
docker-compose --profile client up -d
Container Management
Health Monitoring
# Check container health
docker-compose ps
# View health check logs
# Prints the recorded health state of one container as JSON
docker inspect --format='{{json .State.Health}}' container_name
# Custom health check script
# The quoted 'EOF' delimiter writes the script body literally, with no expansion
cat > health-check.sh << 'EOF'
#!/bin/bash
# Probes the frontend, backend and MongoDB; exits 1 at the first failing
# probe and prints "All services healthy" when all of them pass.
set -e
# Check frontend
curl -f http://localhost:4000/health || exit 1
# Check backend
curl -f http://localhost:8000/health || exit 1
# Check MongoDB
# Address the Compose service instead of a container literally named "mongodb";
# -T disables TTY allocation so this also works non-interactively
docker-compose exec -T mongodb mongosh --eval "db.adminCommand('ping')" || exit 1
echo "All services healthy"
EOF
chmod +x health-check.sh
./health-check.sh
Log Management
# Follow logs from all services, prefixing each line with a timestamp (-t)
docker-compose logs -f -t
# Follow logs for a single service (here: the backend)
docker-compose logs -f backend-fastapi
# Show only the last 100 lines of the frontend's log
docker-compose logs --tail=100 frontend
# Export logs to a file; --no-color keeps ANSI colour codes out of it
docker-compose logs --no-color > deployment.log
Resource Monitoring
# Monitor resource usage
docker stats
# Monitor specific containers
# `docker stats` expects container names or IDs, not Compose service names, so
# resolve the services to IDs first. The substitution is deliberately unquoted:
# each ID must become its own argument.
docker stats $(docker-compose ps -q frontend backend-fastapi mongodb)
# Get detailed container info
docker inspect container_name
# View container processes
docker exec container_name ps aux
Performance Optimization
Container Resource Limits
# docker-compose.override.yml
# Per-service CPU and memory caps (limits) and guaranteed minimums (reservations).
version: '3.8'

services:
  backend-fastapi:
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
        reservations:
          cpus: '1.0'
          memory: 1G
    # ulimits is a service-level key, a sibling of deploy
    ulimits:
      nofile:
        soft: 65536
        hard: 65536

  mongodb:
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.5'
          memory: 512M

  frontend:
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 512M
        reservations:
          cpus: '0.5'
          memory: 256M
Image Optimization
# Multi-stage build optimization
# The builder stage installs dependencies; the runtime stage receives only the
# installed packages and the application code.
FROM python:3.10-slim AS builder
WORKDIR /app
COPY requirements.txt .
# --user installs into /root/.local, a single directory that is easy to copy out
RUN pip install --no-cache-dir --user -r requirements.txt

FROM python:3.10-slim AS runtime
# Use non-root user
# Created before the COPY steps so --chown can reference it
RUN useradd --create-home --shell /bin/bash app
WORKDIR /app
# Copy only necessary files
# Place the packages in the app user's own ~/.local: the process runs as
# `app`, which cannot read /root/.local and whose user site-packages lives
# under /home/app/.local
COPY --from=builder --chown=app:app /root/.local /home/app/.local
COPY --chown=app:app app/ ./app/
# Make sure scripts in .local are usable
ENV PATH=/home/app/.local/bin:$PATH
USER app
CMD ["python", "-m", "app.main"]
Network Optimization
# Custom network configuration
# Names the host-side bridge interface, sets its MTU, and fixes the address range.
networks:
  federated-learning-network:
    driver: bridge
    driver_opts:
      com.docker.network.bridge.name: fl-bridge
      com.docker.network.driver.mtu: 1500
    ipam:
      driver: default
      config:
        - subnet: 172.16.0.0/16
          gateway: 172.16.0.1
Security Considerations
Container Security
# Security best practices
FROM python:3.10-slim

# Create non-root user
RUN groupadd -r appuser && useradd -r -g appuser appuser

# Install security updates and remove unnecessary packages
# All apt-get work happens here, while the build still runs as root: after
# `USER appuser` these commands would fail for lack of privileges.
RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get autoremove -y \
    && apt-get autoclean \
    && rm -rf /var/lib/apt/lists/*

# Set secure permissions
COPY --chown=appuser:appuser . /app
WORKDIR /app

# Drop privileges
# Keep this as the last privilege-related step; everything after it runs unprivileged
USER appuser
Secrets Management
# docker-compose.secrets.yml
# Secrets are passed as files instead of plain environment values.
version: '3.8'

services:
  backend-fastapi:
    # Service-level list: which of the secrets declared below this service receives
    secrets:
      - db_password
      - jwt_secret
    environment:
      # The application gets the file path and reads the secret from it
      - DB_PASSWORD_FILE=/run/secrets/db_password
      - JWT_SECRET_FILE=/run/secrets/jwt_secret

# Top-level declaration: where each secret's content comes from on the host
secrets:
  db_password:
    file: ./secrets/db_password.txt
  jwt_secret:
    file: ./secrets/jwt_secret.txt
Network Security
# Isolated networks
# Three tiers: only frontend-network has external connectivity. A service can
# reach another only if they share a network, so frontend cannot reach mongodb.
networks:
  frontend-network:
    driver: bridge
    internal: false
  backend-network:
    driver: bridge
    # internal: true creates an externally isolated network
    internal: true
  database-network:
    driver: bridge
    internal: true

services:
  frontend:
    networks:
      - frontend-network
      - backend-network
  backend-fastapi:
    networks:
      - backend-network
      - database-network
  mongodb:
    networks:
      - database-network
Troubleshooting
Common Issues
Container Won't Start
# Check container logs
# NOTE: docker-compose logs takes the Compose service name (e.g. backend-fastapi)
docker-compose logs container_name
# Check container status (state and published ports of the project's containers)
docker-compose ps
# Inspect container configuration (full JSON description of one container)
docker inspect container_name
# Check resource usage (live CPU / memory / network / block I/O figures)
docker stats
Network Connectivity Issues
# Test network connectivity from inside one container to another
# (requires the ping binary to be present in the image)
docker exec container_name ping other_container
# Check network configuration: list all networks, then show the project
# network's settings and attached containers
docker network ls
docker network inspect federated-learning-network
# Test port connectivity: -z probes without sending data, -v prints the result
# (requires nc to be present in the image)
docker exec container_name nc -zv hostname port
Volume Mount Issues
# Check volume mounts
# Ask Docker for the Mounts section directly instead of grepping the full JSON,
# which cuts off after a fixed number of lines
docker inspect --format '{{json .Mounts}}' container_name
# Check volume permissions
docker exec container_name ls -la /mounted/path
# Fix permissions
# Hand the host directory to the current user; the substitution is quoted so
# the owner:group pair is always passed as a single argument
sudo chown -R "$(id -u):$(id -g)" ./local/path
Debugging Commands
# Enter container shell
# Alpine-based images (e.g. the node:18-alpine frontend) ship no bash; use /bin/sh there
docker exec -it container_name /bin/bash
# View container filesystem
# Lists every *.log file; errors from unreadable directories are discarded
docker exec container_name find / -name "*.log" 2>/dev/null
# Check environment variables
docker exec container_name env
# Monitor container in real-time
# top is interactive and needs a terminal, hence -it
docker exec -it container_name top
Next: Continue to Security Overview for comprehensive security documentation.