Skip to content

Docker Deployment Guide

This document provides comprehensive guidance for deploying the Federated Learning Platform using Docker and Docker Compose across different environments.

Docker Architecture Overview

graph TB
    %% Services are grouped by the Compose profile that starts them.
    subgraph "Docker Compose Profiles"
        subgraph "Orchestrator Profile"
            FRONTEND[frontend<br/>Next.js Container]
            BACKEND[backend-fastapi<br/>FastAPI Container]
            MONGO[mongodb<br/>Database Container]
            OTEL[otel-collector<br/>Telemetry Container]
            TEMPO[tempo<br/>Tracing Container]
            GRAFANA[grafana<br/>Monitoring Container]
        end

        subgraph "Aggregator Profile"
            SUPERLINK[superlink<br/>Flower Hub]
            AGGREGATOR[aggregator<br/>FL Server]
        end

        subgraph "Client Profile"
            SUPERNODE[supernode<br/>FL Node]
            CLIENTAPP[clientapp<br/>FL Client]
            INFERENCE[inference<br/>Model Inference]
        end
    end

    %% Single bridge network shared by the services.
    subgraph "Docker Networks"
        FL_NETWORK[federated-learning-network<br/>Bridge Network]
    end

    %% Named volumes used for persistent data.
    subgraph "Docker Volumes"
        MONGO_DATA[mongodb_data]
        GRAFANA_DATA[grafana_data]
        TEMPO_DATA[tempo_data]
        FL_DATA[fl_data]
    end

    %% Network attachments. Only these six services are drawn with an edge;
    %% OTEL, TEMPO, GRAFANA, SUPERNODE and INFERENCE have none in this diagram.
    FRONTEND --> FL_NETWORK
    BACKEND --> FL_NETWORK
    MONGO --> FL_NETWORK
    SUPERLINK --> FL_NETWORK
    AGGREGATOR --> FL_NETWORK
    CLIENTAPP --> FL_NETWORK

    %% Volume usage: each edge points from a service to the volume it mounts.
    MONGO --> MONGO_DATA
    GRAFANA --> GRAFANA_DATA
    TEMPO --> TEMPO_DATA
    AGGREGATOR --> FL_DATA

Container Specifications

Frontend Container (Next.js)

# frontend/Dockerfile
# Multi-stage build for the Next.js frontend:
#   development - dev server with the full dependency tree (used by docker-compose.yml)
#   deps        - lockfile-only install, cached until package*.json changes
#   builder     - runs `npm run build` with the public URLs baked in
#   runner      - minimal runtime image that runs the standalone server as non-root
FROM node:18-alpine AS base

# Development stage
FROM base AS development
WORKDIR /app
COPY package.json package-lock.json* ./
RUN npm ci
COPY . .
EXPOSE 4000
CMD ["npm", "run", "dev"]

# Dependencies stage
FROM base AS deps
WORKDIR /app
COPY package.json package-lock.json* ./
RUN npm ci

# Builder stage
FROM base AS builder
WORKDIR /app

# `next build` expects NODE_ENV=production; any other value is treated as
# non-standard by Next.js. Default to production here (the development target
# above does not go through this stage); override with --build-arg if needed.
ARG NODE_ENV=production
# NEXT_PUBLIC_* values are inlined into the client bundle at build time, so
# they must be supplied as build args rather than only at container start.
ARG NEXT_PUBLIC_API_URL=http://localhost:8000
ARG NEXT_PUBLIC_WS_URL=ws://localhost:8000

ENV NODE_ENV=${NODE_ENV} \
    NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL} \
    NEXT_PUBLIC_WS_URL=${NEXT_PUBLIC_WS_URL} \
    NEXT_TELEMETRY_DISABLED=1

COPY --from=deps /app/node_modules ./node_modules
COPY . .
RUN npm run build

# Production stage
FROM base AS runner
WORKDIR /app

ENV NODE_ENV=production \
    NEXT_TELEMETRY_DISABLED=1

# Dedicated unprivileged account with fixed IDs for the runtime process.
RUN addgroup --system --gid 1001 nodejs \
    && adduser --system --uid 1001 nextjs

# NOTE(review): /app/.next/standalone only exists when next.config sets
# `output: 'standalone'` - confirm the project config.
COPY --from=builder /app/public ./public
COPY --from=builder --chown=nextjs:nodejs /app/.next/standalone ./
COPY --from=builder --chown=nextjs:nodejs /app/.next/static ./.next/static

USER nextjs
EXPOSE 4000
# key=value form (the space-separated `ENV PORT 4000` form is deprecated).
# HOSTNAME is set explicitly because Docker sets it to the container hostname,
# and the standalone server uses it as its bind address in recent Next.js versions.
ENV PORT=4000 \
    HOSTNAME=0.0.0.0

CMD ["node", "server.js"]

Backend Container (FastAPI)

# backend/Dockerfile
# FastAPI backend image. It also carries the Docker CLI and Ansible, which the
# Compose setup pairs with a mounted /var/run/docker.sock and ~/.ssh.
FROM python:3.10-slim AS base

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app

# Install system dependencies.
# --no-install-recommends keeps optional packages out of the image; the apt
# lists are removed in the same layer so they never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    docker.io \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies (copied before the source so this layer stays
# cached until requirements.txt changes)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install additional tools (no pip cache left in the layer)
RUN pip install --no-cache-dir ansible docker

# Copy application code
COPY . .

# Copy FL core
COPY fl-core/ /app/fl-core/

# Remove and recreate mlproject directory so the image ships it empty
RUN rm -rf /app/fl-core/mlproject && mkdir /app/fl-core/mlproject

# Create SSH directory for Ansible
RUN mkdir -p /root/.ssh

# Make scripts executable
RUN chmod +x start.sh

# Create non-root user and hand /app over to it in a single layer.
# NOTE(review): no USER instruction follows, so the container still runs as
# root. That looks intentional given the /root/.ssh directory above and the
# docker.sock mount in docker-compose.yml - confirm before adding `USER appuser`.
RUN groupadd -r appuser && useradd -r -g appuser appuser \
    && chown -R appuser:appuser /app

# Expose ports
EXPOSE 8000 8001

# Health check: probes the HTTP /health endpoint on port 8000
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Start application
CMD ["./start.sh"]

Federated Learning Containers

Server App Container

# fl-core/Dockerfile.serverapp
# Flower ServerApp image extended with monitoring libraries and the ML project.
FROM flwr/serverapp:1.15.2

# Install monitoring dependencies as root; privileges are dropped again below.
USER root
RUN pip install --no-cache-dir psutil \
    opentelemetry-api==1.20.0 \
    opentelemetry-sdk==1.20.0 \
    opentelemetry-exporter-otlp==1.20.0 \
    opentelemetry-instrumentation==0.41b0

WORKDIR /app
COPY --chown=app:app mlproject /app/mlproject

# Remove simulation dependencies and install project
RUN sed -i 's/.*flwr\[simulation\].*//' /app/mlproject/pyproject.toml \
    && python -m pip install -U --no-cache-dir /app/mlproject

# Copy startup script
COPY --chown=app:app entrypoint-server.sh /app/entrypoint-server.sh
RUN chmod +x /app/entrypoint-server.sh

# docker-compose.yml mounts the fl_data volume at /app/data. Pre-create it
# owned by `app` so a fresh named volume inherits that ownership and stays
# writable once the process no longer runs as root.
RUN mkdir -p /app/data && chown app:app /app/data

# Drop back to the unprivileged user the files above are chowned to, so the
# container does not run as root.
# NOTE(review): confirm entrypoint-server.sh needs no root-only operations.
USER app

# Health check: healthy while a process matching "flwr-serverapp" exists
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD pgrep -f "flwr-serverapp" || exit 1

ENTRYPOINT ["/app/entrypoint-server.sh"]

Client App Container

# fl-core/Dockerfile.clientapp
# Flower ClientApp image extended with a build toolchain, monitoring libraries
# and the ML project.
FROM flwr/clientapp:1.15.2

# Install build dependencies and monitoring tools as root; privileges are
# dropped again below. The apt lists are removed in the same layer and pip
# keeps no download cache.
USER root
RUN apt-get update && apt-get -y --no-install-recommends install \
    build-essential \
    && rm -rf /var/lib/apt/lists/* \
    && pip install --no-cache-dir psutil \
    opentelemetry-api==1.20.0 \
    opentelemetry-sdk==1.20.0 \
    opentelemetry-exporter-otlp==1.20.0 \
    opentelemetry-instrumentation==0.41b0

WORKDIR /app
COPY --chown=app:app mlproject /app/mlproject

# Install project dependencies (the flwr[simulation] line is stripped from
# pyproject.toml first)
RUN sed -i 's/.*flwr\[simulation\].*//' /app/mlproject/pyproject.toml \
    && python -m pip install -U --no-cache-dir /app/mlproject

# Copy startup script
COPY --chown=app:app entrypoint-client.sh /app/entrypoint-client.sh
RUN chmod +x /app/entrypoint-client.sh

# Drop back to the unprivileged user the files above are chowned to, so the
# container does not run as root.
# NOTE(review): confirm entrypoint-client.sh needs no root-only operations.
USER app

# Health check: healthy while a process matching "flwr-clientapp" exists
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD pgrep -f "flwr-clientapp" || exit 1

ENTRYPOINT ["/app/entrypoint-client.sh"]

Docker Compose Configuration

Main Docker Compose File

# docker-compose.yml
# Development stack. Services are grouped into Compose profiles:
#   orchestrator - frontend, backend-fastapi, mongodb, otel-collector, tempo, grafana
#   aggregator   - superlink, aggregator
# NOTE(review): several images use the `latest` tag; pin them to tested
# versions for reproducible deployments.
version: '3.8'

services:
  # Frontend Service (builds the `development` target and bind-mounts the source)
  frontend:
    build:
      context: ./frontend
      dockerfile: Dockerfile
      target: development
    ports:
      - "4000:4000"
    environment:
      - NODE_ENV=development
      - NEXT_PUBLIC_API_URL=http://localhost:8000
      - NEXT_PUBLIC_WS_URL=ws://localhost:8000
    volumes:
      - ./frontend:/app
      # Anonymous volume so the bind mount above does not hide the image's node_modules
      - /app/node_modules
    networks:
      - federated-learning-network
    profiles:
      - orchestrator
    depends_on:
      - backend-fastapi

  # Backend Service
  backend-fastapi:
    build:
      context: .
      dockerfile: backend/Dockerfile
    ports:
      - "8000:8000"
    environment:
      - MONGODB_URL=mongodb://mongodb:27017
      - DATABASE_NAME=federated_learning
      # Falls back to a development-only key when SECRET_KEY is not exported
      - SECRET_KEY=${SECRET_KEY:-dev-secret-key}
      - DEBUG=true
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
    volumes:
      - ./backend:/app
      # Host Docker socket: gives this container control over the host's Docker daemon
      - /var/run/docker.sock:/var/run/docker.sock
      - ~/.ssh:/root/.ssh:ro
    networks:
      - federated-learning-network
    profiles:
      - orchestrator
    depends_on:
      - mongodb
      - otel-collector

  # Database Service
  mongodb:
    image: mongo:latest
    ports:
      - "27017:27017"
    environment:
      - MONGO_INITDB_DATABASE=federated_learning
    volumes:
      - mongodb_data:/data/db
    networks:
      - federated-learning-network
    profiles:
      - orchestrator
    healthcheck:
      test: ["CMD", "mongosh", "--eval", "db.adminCommand('ping')"]
      interval: 30s
      timeout: 10s
      retries: 3

  # OpenTelemetry Collector
  otel-collector:
    image: otel/opentelemetry-collector-contrib:latest
    command: ["--config=/etc/otel-collector-config.yaml"]
    volumes:
      # Configuration is only read by the collector, so mount it read-only
      - ./backend/otel-collector-config.yaml:/etc/otel-collector-config.yaml:ro
    ports:
      - "4317:4317"   # OTLP gRPC receiver
      - "4318:4318"   # OTLP HTTP receiver
    networks:
      - federated-learning-network
    profiles:
      - orchestrator
    depends_on:
      - tempo

  # Tempo Tracing Backend
  tempo:
    image: grafana/tempo:latest
    command: ["-config.file=/etc/tempo.yaml"]
    volumes:
      - ./backend/tempo.yaml:/etc/tempo.yaml:ro
      - tempo_data:/tmp/tempo
    ports:
      - "3200:3200"
    networks:
      - federated-learning-network
    profiles:
      - orchestrator

  # Grafana Monitoring (container port 3000 is published on host port 3001)
  grafana:
    image: grafana/grafana:latest
    ports:
      - "3001:3000"
    environment:
      # Overridable via GRAFANA_ADMIN_PASSWORD; "admin" remains the dev default
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
    volumes:
      - grafana_data:/var/lib/grafana
      - ./backend/grafana/provisioning:/etc/grafana/provisioning
      - ./backend/grafana/dashboards:/var/lib/grafana/dashboards
    networks:
      - federated-learning-network
    profiles:
      - orchestrator
    depends_on:
      - tempo

  # Flower Superlink
  superlink:
    image: flwr/superlink:1.15.2
    command: ["--insecure"]
    ports:
      - "9091:9091"
      - "9093:9093"
    networks:
      - federated-learning-network
    profiles:
      - aggregator
    healthcheck:
      test: ["CMD", "nc", "-z", "localhost", "9091"]
      interval: 30s
      timeout: 10s
      retries: 3

  # FL Aggregator
  # NOTE(review): this service publishes 9092 and SUPERLINK_ADDRESS targets
  # superlink:9092, while superlink itself publishes only 9091 and 9093.
  # Confirm against the Flower SuperLink port assignments and the way
  # entrypoint-server.sh uses SUPERLINK_ADDRESS.
  aggregator:
    build:
      context: ./fl-core
      dockerfile: Dockerfile.serverapp
    environment:
      - SUPERLINK_ADDRESS=superlink:9092
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
    ports:
      - "9092:9092"
    volumes:
      - fl_data:/app/data
    networks:
      - federated-learning-network
    profiles:
      - aggregator
    depends_on:
      - superlink

volumes:
  mongodb_data:
  grafana_data:
  tempo_data:
  fl_data:

networks:
  federated-learning-network:
    driver: bridge
    ipam:
      config:
        - subnet: 172.16.0.0/16

Deployment Strategies

Development Deployment

# Start development environment
./setup-local-training.sh

# Or manually
docker-compose --profile orchestrator up -d

# View logs
docker-compose logs -f

# Scale services. clientapp belongs to the "client" profile, and services in a
# profile are only started when that profile is enabled.
docker-compose --profile client up -d --scale clientapp=3

Production Deployment

# Set production environment
export NODE_ENV=production
export SECRET_KEY="your-production-secret-key"
export MONGODB_URL="mongodb://prod-mongo:27017"

# Build production images
docker-compose -f docker-compose.prod.yml build

# Deploy with production profile
docker-compose -f docker-compose.prod.yml --profile orchestrator up -d

# Health check. Pass the same compose file and profile as the deployment;
# without -f these commands would read the default docker-compose.yml instead.
docker-compose -f docker-compose.prod.yml --profile orchestrator ps
docker-compose -f docker-compose.prod.yml --profile orchestrator logs --tail=50

Multi-Node Deployment

# Orchestrator machine: bring up the orchestrator stack via the helper script
./setup-and-run-orchestrator.sh production

# Aggregator machine (separate host): start the "aggregator" profile in the background
docker-compose --profile aggregator up --detach

# Each client machine: start the "client" profile in the background
docker-compose --profile client up --detach

Container Management

Health Monitoring

# Check container health
docker-compose ps

# View health check logs
docker inspect --format='{{json .State.Health}}' container_name

# Custom health check script
cat > health-check.sh << 'EOF'
#!/bin/bash
set -e

# Check frontend (assumes the frontend serves /health - confirm)
curl -f http://localhost:4000/health || exit 1

# Check backend
curl -f http://localhost:8000/health || exit 1

# Check MongoDB. Go through Compose so the service name "mongodb" resolves to
# the actual container (Compose prefixes container names with the project
# name). -T disables TTY allocation so the script also works non-interactively.
docker-compose exec -T mongodb mongosh --eval "db.adminCommand('ping')" || exit 1

echo "All services healthy"
EOF

chmod +x health-check.sh
./health-check.sh

Log Management

# Stream logs from all services, prefixing each line with its timestamp
docker-compose logs --follow --timestamps

# Stream logs from the backend service only
docker-compose logs --follow backend-fastapi

# Show only the last 100 lines of the frontend logs
docker-compose logs --tail 100 frontend

# Write uncolored logs to a file
docker-compose logs --no-color > deployment.log

Resource Monitoring

# Monitor resource usage
docker stats

# Monitor specific containers. `docker stats` needs container names or IDs,
# and Compose prefixes container names with the project name, so resolve the
# IDs from the service names first.
docker stats $(docker-compose ps -q frontend backend-fastapi mongodb)

# Get detailed container info
docker inspect container_name

# View container processes
docker exec container_name ps aux

Performance Optimization

Container Resource Limits

# docker-compose.override.yml
# CPU and memory limits (hard caps) and reservations per service.
# NOTE(review): whether `deploy.resources` is applied outside Swarm depends on
# the Compose version in use - confirm for your installation.
version: '3.8'

services:
  frontend:
    deploy:
      resources:
        reservations:
          memory: 256M
          cpus: "0.5"
        limits:
          memory: 512M
          cpus: "1.0"

  mongodb:
    deploy:
      resources:
        reservations:
          memory: 512M
          cpus: "0.5"
        limits:
          memory: 1G
          cpus: "1.0"

  backend-fastapi:
    # Raise the open-file limit for the backend
    ulimits:
      nofile:
        hard: 65536
        soft: 65536
    deploy:
      resources:
        reservations:
          memory: 1G
          cpus: "1.0"
        limits:
          memory: 2G
          cpus: "2.0"

Image Optimization

# Multi-stage build optimization
# The builder stage installs dependencies into root's user site; the runtime
# stage receives only those installed packages plus the application code.
FROM python:3.10-slim AS builder
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir --user -r requirements.txt

FROM python:3.10-slim AS runtime

# Create the non-root user first so files can be copied straight into its home
RUN useradd --create-home --shell /bin/bash app
WORKDIR /app

# Copy only necessary files.
# The packages must live in the *runtime user's* ~/.local: /root/.local is not
# readable by `app`, and Python looks up the user site relative to the home
# directory of the user the process runs as.
COPY --from=builder --chown=app:app /root/.local /home/app/.local
COPY --chown=app:app app/ ./app/

# Make sure scripts in .local are usable
ENV PATH=/home/app/.local/bin:$PATH

# Use non-root user
USER app

CMD ["python", "-m", "app.main"]

Network Optimization

# Bridge network with an explicit host bridge name, MTU, subnet and gateway
networks:
  federated-learning-network:
    driver: bridge
    ipam:
      driver: default
      config:
        - gateway: 172.16.0.1
          subnet: 172.16.0.0/16
    driver_opts:
      com.docker.network.driver.mtu: 1500
      com.docker.network.bridge.name: fl-bridge

Security Considerations

Container Security

# Security best practices
FROM python:3.10-slim

# Install security updates and remove unnecessary packages.
# This must run as root, before USER below, and in a single layer: files
# removed in a later layer still occupy space in the image.
RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get autoremove -y \
    && apt-get autoclean \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user
RUN groupadd -r appuser && useradd -r -g appuser appuser

# Set secure permissions
COPY --chown=appuser:appuser . /app
WORKDIR /app

# Drop privileges - keep this after every step that requires root
USER appuser

Secrets Management

# docker-compose.secrets.yml
# File-based secrets: each secret is read from a local file and mounted into
# the backend under /run/secrets/<name>; the *_FILE variables carry those paths.
version: '3.8'

secrets:
  jwt_secret:
    file: ./secrets/jwt_secret.txt
  db_password:
    file: ./secrets/db_password.txt

services:
  backend-fastapi:
    environment:
      DB_PASSWORD_FILE: /run/secrets/db_password
      JWT_SECRET_FILE: /run/secrets/jwt_secret
    secrets:
      - db_password
      - jwt_secret

Network Security

# Tiered networks: the frontend reaches the backend, the backend reaches the
# database, and the frontend shares no network with the database.
# NOTE(review): backend-fastapi is attached only to `internal: true` networks;
# confirm it does not also need to be reachable from outside the Docker host.
services:
  mongodb:
    networks:
      - database-network

  backend-fastapi:
    networks:
      - backend-network
      - database-network

  frontend:
    networks:
      - frontend-network
      - backend-network

networks:
  database-network:
    internal: true
    driver: bridge
  backend-network:
    internal: true
    driver: bridge
  frontend-network:
    internal: false
    driver: bridge

Troubleshooting

Common Issues

Container Won't Start

# Check the logs of a service. `docker-compose logs` takes the Compose
# *service* name (e.g. backend-fastapi), not the container name.
docker-compose logs service_name

# Check container status
docker-compose ps

# Inspect container configuration (container name or ID, as shown by `docker ps`)
docker inspect container_name

# Check resource usage
docker stats

Network Connectivity Issues

# Test network connectivity from inside one container to another
# (requires `ping` to be present in the source container's image)
docker exec container_name ping other_container

# Check network configuration: list all networks, then show the details of
# the project network
docker network ls
docker network inspect federated-learning-network

# Test port connectivity from inside a container
# (requires `nc` to be present in the container's image)
docker exec container_name nc -zv hostname port

Volume Mount Issues

# Check volume mounts. Print the full Mounts section as JSON rather than
# grepping a fixed number of lines, which cuts off longer mount lists.
docker inspect --format '{{json .Mounts}}' container_name

# Check volume permissions
docker exec container_name ls -la /mounted/path

# Fix permissions: give the current host user and group ownership of the path
sudo chown -R "$(id -u):$(id -g)" ./local/path

Debugging Commands

# Enter container shell
docker exec -it container_name /bin/bash
# Alpine-based images (e.g. the node:18-alpine frontend) do not include bash
# by default; use sh there:
docker exec -it container_name /bin/sh

# Find log files in the container filesystem
docker exec container_name find / -name "*.log" 2>/dev/null

# Check environment variables
docker exec container_name env

# Monitor container in real-time. `top` is interactive, so allocate a TTY.
docker exec -it container_name top

Next: Continue to Security Overview for comprehensive security documentation.