Skip to content

Troubleshooting Guide

This comprehensive troubleshooting guide provides solutions for common issues encountered in the Federated Learning Platform, organized by component and severity level.

Quick Diagnostic Commands

System Health Check

#!/bin/bash
# health-check.sh - Quick system health verification
#
# Runs every probe even when an earlier one fails, prints "<name>: FAILED"
# for each failing probe, and exits with status 1 if at least one probe
# failed (0 otherwise) so callers such as cron jobs can react to the result.

failures=0

# check NAME CMD [ARG...]
# Runs CMD; on a non-zero exit status prints "NAME: FAILED" and counts it.
check() {
    local name=$1
    shift
    if ! "$@"; then
        echo "${name}: FAILED"
        failures=$((failures + 1))
    fi
}

echo "=== Federated Learning Platform Health Check ==="

# Check Docker services
echo "1. Checking Docker services..."
check "Docker Compose" docker-compose ps

# Check service endpoints
# -f: fail on HTTP errors, -sS: no progress meter but keep error messages,
# -o /dev/null: discard the response body, --max-time: never hang on a
# service that accepts the connection but does not answer.
echo "2. Checking service endpoints..."
check "Frontend" curl -fsS -o /dev/null --max-time 5 http://localhost:4000/health
check "Backend" curl -fsS -o /dev/null --max-time 5 http://localhost:8000/health
check "Grafana" curl -fsS -o /dev/null --max-time 5 http://localhost:3001/api/health

# Check database connectivity
echo "3. Checking database..."
check "MongoDB" docker exec mongodb mongosh --quiet --eval "db.adminCommand('ping')"

# Check federated learning services
# -z: only test whether the port accepts connections, -w 3: 3 second timeout
echo "4. Checking FL services..."
check "Superlink" nc -z -v -w 3 localhost 9091
check "Aggregator" nc -z -v -w 3 localhost 9092

# Check resource usage
echo "5. System resources..."
check "Docker stats" docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}"

echo "=== Health Check Complete ==="

if [ "$failures" -gt 0 ]; then
    exit 1
fi

Log Collection Script

#!/bin/bash
# collect-logs.sh - Comprehensive log collection
#
# Gathers Docker Compose logs, host logs and basic system information into a
# timestamped directory (logs_YYYYmmdd_HHMMSS) below the current directory
# and packs that directory into logs_YYYYmmdd_HHMMSS.tar.gz.
#
# Collection is best-effort: a source that fails is reported on stderr and
# the remaining sources are still collected. The script exits non-zero only
# when the output directory or the archive cannot be created.

TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_DIR="logs_${TIMESTAMP}"

if ! mkdir -p "$LOG_DIR"; then
    echo "Cannot create log directory: $LOG_DIR" >&2
    exit 1
fi

# capture FILE CMD [ARG...]
# Writes CMD's standard output to $LOG_DIR/FILE and warns if CMD fails.
capture() {
    local outfile=$1
    shift
    if ! "$@" > "$LOG_DIR/$outfile"; then
        echo "Warning: '$*' failed; $LOG_DIR/$outfile may be incomplete" >&2
    fi
}

echo "Collecting logs to $LOG_DIR..."

# Docker container logs
capture docker-compose.log docker-compose logs --no-color
capture frontend.log docker-compose logs --no-color frontend
capture backend.log docker-compose logs --no-color backend-fastapi
capture mongodb.log docker-compose logs --no-color mongodb
capture grafana.log docker-compose logs --no-color grafana

# System logs
capture docker-system.log journalctl -u docker --since "1 hour ago"
# Only the most recent 100 kernel messages are kept.
dmesg | tail -100 > "$LOG_DIR/kernel.log"

# Application-specific logs
if [ -d "/var/log/federated-learning" ]; then
    cp -r /var/log/federated-learning "$LOG_DIR/"
fi

# System information
capture docker-version.txt docker version
capture compose-version.txt docker-compose version
capture disk-usage.txt df -h
capture memory-usage.txt free -h
capture processes.txt ps aux

echo "Logs collected in $LOG_DIR"

# Announce the archive only if tar actually produced it.
if tar -czf "${LOG_DIR}.tar.gz" "$LOG_DIR"; then
    echo "Archive created: ${LOG_DIR}.tar.gz"
else
    echo "Failed to create archive: ${LOG_DIR}.tar.gz" >&2
    exit 1
fi

Common Issues and Solutions

1. Container Startup Issues

Issue: Containers fail to start

Symptoms:
- docker-compose up fails
- Services show "Exited" status
- Port binding errors

Diagnosis:

# NOTE: [service-name] and [port-number] below are placeholders - substitute
# the real Compose service name / port number before running a command.
# Check container status
docker-compose ps

# View container logs
docker-compose logs [service-name]

# Check port conflicts
# (both commands list what is bound to the port; either one is enough)
netstat -tulpn | grep [port-number]
lsof -i :[port-number]

Solutions:

# Kill process using the port
# Replace [port-number] with the real port. Plain `kill` sends SIGTERM so the
# process can shut down cleanly; fall back to `kill -9` (SIGKILL) only if the
# process refuses to exit.
sudo kill $(lsof -t -i:[port-number])

# Or change port in docker-compose.yml
# (the next two lines are YAML for docker-compose.yml, not shell commands)
ports:
  - "4001:4000"  # Use different host port
# Fix Docker permissions
# Adds the current user to the docker group; `newgrp` starts a shell in which
# the new group membership is already active.
sudo usermod -aG docker "$USER"
newgrp docker

# Fix file permissions
sudo chown -R "$USER:$USER" .
chmod +x setup-*.sh
# Check available resources
docker system df
# WARNING: with -a, prune also deletes every image that is not used by a
# container, in addition to stopped containers, unused networks and build
# cache. Docker asks for confirmation before deleting.
docker system prune -a

# Increase Docker memory limit
# Docker Desktop: Settings > Resources > Memory

Issue: Database connection failures

Symptoms:
- Backend cannot connect to MongoDB
- Connection timeout errors
- Authentication failures

Diagnosis:

# Test MongoDB connectivity
# (runs the ping admin command with mongosh inside the mongodb container)
docker exec mongodb mongosh --eval "db.adminCommand('ping')"

# Check MongoDB logs
docker-compose logs mongodb

# Test network connectivity
# -c 3 stops after three probes; without a count ping keeps running until it
# is interrupted, which blocks scripts and copy-pasted command sequences.
# NOTE(review): assumes the ping binary is installed in the backend image.
docker exec backend-fastapi ping -c 3 mongodb

Solutions:

# Correct connection string format
# (application configuration value, not a shell command)
MONGODB_URL = "mongodb://mongodb:27017/federated_learning"

# For authentication
# (username and password are placeholders for the real credentials)
MONGODB_URL = "mongodb://username:password@mongodb:27017/federated_learning"
# Recreate Docker network
# `docker network prune` removes every unused network on the host and asks
# for confirmation first.
docker-compose down
docker network prune
docker-compose up -d
# Reset MongoDB data
# WARNING: removing the volume discards the data stored in it.
docker-compose down
docker volume rm flip_mongodb_data
docker-compose up -d mongodb

2. Federated Learning Issues

Issue: Training jobs fail to start

Symptoms:
- Jobs stuck in "initializing" status
- Flower connection errors
- Client deployment failures

Diagnosis:

# Check Flower services
docker-compose logs superlink
docker-compose logs aggregator

# Test Flower connectivity
# (-z: only probe whether the port accepts a connection, -v: print the result)
nc -zv localhost 9091
nc -zv localhost 9092
nc -zv localhost 9093

# Check Ansible deployment
# (filters the backend log for lines containing "ansible")
docker-compose logs backend-fastapi | grep ansible

Solutions:

# Restart Flower services
docker-compose restart superlink aggregator

# Check Flower configuration
docker exec aggregator cat /app/mlproject/pyproject.toml
# Check SSH connectivity to clients
# (user@client-ip is a placeholder for the real login and client address)
ssh -i ~/.ssh/id_ansible user@client-ip "echo 'Connection OK'"

# Verify Ansible inventory
cat backend/ansible/inventory/devices.ini

# Test Ansible connectivity
# (the inventory path below is relative to backend/ansible, hence the cd)
cd backend/ansible
ansible -i inventory/devices.ini all -m ping
# Validate ML project structure
docker exec backend-fastapi ls -la /app/fl-core/mlproject/

# Check required files
# (lists every Python file below the ML project directory)
docker exec backend-fastapi find /app/fl-core/mlproject/ -name "*.py"

Issue: Clients disconnect during training

Symptoms:
- Clients drop out mid-training
- Inconsistent participation rates
- Network timeout errors

Diagnosis:

# NOTE: client-ip is a placeholder for the address of the affected client.
# Check client logs
# (reads the logs of the container named clientapp on the client host)
ssh client-ip "docker logs clientapp"

# Monitor network connectivity
# (-c 10: send ten probes, then print the loss/latency summary)
ping -c 10 client-ip

# Check client resource usage
# (--no-stream: print one snapshot instead of a live view)
ssh client-ip "docker stats --no-stream"

Solutions:

# Increase timeout values
# In server_app.py
# (Python; NOTE(review): round_timeout is presumably in seconds - confirm
# against the ServerConfig implementation in use)
config = ServerConfig(
    num_rounds=10,
    round_timeout=600,  # Increase timeout
)

# Configure network retry
# In client configuration
# NOTE(review): retry_delay is presumably in seconds - confirm with the code
# that consumes this dictionary.
retry_config = {
    "max_retries": 3,
    "retry_delay": 5
}
# Monitor client resources
# (shell command; client-ip is a placeholder for the client address)
ssh client-ip "free -h && df -h"

# Adjust batch size and model complexity
# In training configuration
config = {
    "batch_size": 16,  # Reduce if memory issues
    "local_epochs": 1  # Reduce computation load
}

3. Performance Issues

Issue: Slow API response times

Symptoms:
- High response latencies
- Timeout errors
- Poor user experience

Diagnosis:

# Check API performance
# The timing format is passed inline so the command works without a separate
# curl-format.txt file. -o /dev/null discards the body, -s hides the progress
# meter; all times are reported in seconds.
curl -o /dev/null -s \
    -w 'dns: %{time_namelookup}s\nconnect: %{time_connect}s\nttfb: %{time_starttransfer}s\ntotal: %{time_total}s\n' \
    http://localhost:8000/health

# Monitor resource usage
# (live view; stop it with Ctrl-C)
docker stats backend-fastapi

# Check database performance
docker exec mongodb mongosh --eval "db.runCommand({serverStatus: 1}).metrics"

Solutions:

# Add database indexes
# (Python; the awaits must run inside an async function, e.g. at startup)
await db.users.create_index("username")
# Compound index: owner_id ascending (1), created_at descending (-1)
await db.projects.create_index([("owner_id", 1), ("created_at", -1)])
await db.training_jobs.create_index("status")

# Use connection pooling
# (connection_string is expected to be defined by the surrounding code)
client = AsyncIOMotorClient(
    connection_string,
    maxPoolSize=50,
    minPoolSize=10
)
# Add Redis caching
import json

import redis.asyncio as redis

cache = redis.from_url("redis://localhost:6379")

async def get_user_projects(user_id: str):
    """Return the projects owned by ``user_id``, cached in Redis for 300 s.

    The result is always the JSON round-tripped form of the documents:
    values JSON cannot represent are converted with ``str``. A cache miss
    therefore returns exactly the same shape as a later cache hit.
    ``db`` is expected to be provided by the surrounding module.
    """
    cache_key = f"user_projects:{user_id}"
    cached = await cache.get(cache_key)

    if cached:
        return json.loads(cached)

    projects = await db.projects.find({"owner_id": user_id}).to_list(None)
    serialized = json.dumps(projects, default=str)
    await cache.setex(cache_key, 300, serialized)
    # Decode what was just cached instead of returning the raw documents, so
    # callers see the same value types on a miss as on a hit.
    return json.loads(serialized)
# Increase container resources
# (YAML for docker-compose.yml)
# limits: hard upper bound for the container;
# reservations: amount set aside for the container.
# NOTE(review): legacy docker-compose v1 may ignore the deploy section outside
# swarm mode unless started with --compatibility - confirm for your version.
services:
  backend-fastapi:
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 2G
        reservations:
          cpus: '1.0'
          memory: 1G

Issue: High memory usage

Symptoms:
- Out of memory errors
- Container restarts
- System slowdown

Diagnosis:

# Monitor memory usage
# (live view; add --no-stream for a single snapshot)
docker stats --format "table {{.Container}}\t{{.MemUsage}}\t{{.MemPerc}}"

# Check memory leaks
# (processes inside the backend container, largest memory consumer first)
docker exec backend-fastapi ps aux --sort=-%mem

# Analyze heap usage (Python)
# psutil.Process() without a PID would describe this one-off `python -c`
# interpreter itself, so the resident set size is summed over the container's
# main process (PID 1) and all of its children instead.
# NOTE(review): assumes the application is the container's PID 1 - confirm.
docker exec backend-fastapi python -c "
import psutil
main = psutil.Process(1)
rss = sum(p.memory_info().rss for p in [main] + main.children(recursive=True))
print(f'Memory: {rss / 1024 / 1024:.2f} MB')
"

Solutions:

# Implement pagination
async def get_training_jobs(page: int = 1, limit: int = 20):
    """Return one page of training jobs.

    Args:
        page: 1-based page number.
        limit: maximum number of jobs per page.

    Raises:
        ValueError: if ``page`` or ``limit`` is smaller than 1. A page below
            1 would produce a negative skip, and MongoDB treats a limit of 0
            as "no limit", which would defeat the pagination.
    """
    if page < 1:
        raise ValueError("page must be >= 1")
    if limit < 1:
        raise ValueError("limit must be >= 1")
    skip = (page - 1) * limit
    jobs = await db.training_jobs.find().skip(skip).limit(limit).to_list(length=limit)
    return jobs

# Use generators for large datasets
async def process_large_dataset():
    """Yield processed documents one at a time instead of building a list."""
    async for document in db.large_collection.find():
        yield process_document(document)

# Clear unused variables
import gc
gc.collect()
# Set memory limits
# (YAML for docker-compose.yml)
# memswap_limit is the combined memory + swap ceiling; setting it equal to
# mem_limit leaves the container no swap.
# oom_kill_disable: false keeps the kernel OOM killer enabled for the
# container.
services:
  backend-fastapi:
    mem_limit: 1g
    memswap_limit: 1g
    oom_kill_disable: false

4. Authentication and Authorization Issues

Issue: JWT token validation failures

Symptoms:
- 401 Unauthorized errors
- Token expired messages
- Login failures

Diagnosis:

# Check JWT configuration
docker exec backend-fastapi env | grep JWT

# Validate token format
# A JWT is three base64url segments joined by dots: header.payload.signature.
# `base64 -d` cannot decode the token as a whole: the dots are not base64,
# base64url uses '-' and '_' instead of '+' and '/', and the '=' padding is
# omitted. Decode the header and payload segments individually instead.
TOKEN="eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9..."  # paste the full token here

# decode_jwt_part SEGMENT
# Prints the decoded content of one base64url-encoded JWT segment.
decode_jwt_part() {
    local part=$1
    # Translate the base64url alphabet to the standard base64 alphabet.
    part=$(printf '%s' "$part" | tr '_-' '/+')
    # Restore the padding so the length is a multiple of four.
    case $(( ${#part} % 4 )) in
        2) part="${part}==" ;;
        3) part="${part}=" ;;
    esac
    printf '%s' "$part" | base64 -d
    echo
}

# The signature segment is binary and is not decoded.
IFS=. read -r jwt_header jwt_payload _ <<< "$TOKEN"
decode_jwt_part "$jwt_header"
decode_jwt_part "$jwt_payload"

# Check system time synchronization
# (host clock vs. container clock; skew makes tokens look expired or not yet valid)
date
docker exec backend-fastapi date

Solutions:

# Verify JWT settings
SECRET_KEY = os.getenv("SECRET_KEY")  # Must be consistent
ALGORITHM = "HS256"
ACCESS_TOKEN_EXPIRE_MINUTES = 30

# Add token refresh mechanism
def create_refresh_token(data: dict):
    """Return a signed refresh token carrying ``data`` that expires in 7 days.

    ``data`` is copied, so the caller's dictionary is not modified. The copy
    gets an ``exp`` claim and ``type`` set to ``"refresh"`` before it is
    signed with SECRET_KEY using ALGORITHM.
    """
    from datetime import datetime, timedelta, timezone

    to_encode = data.copy()
    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated since
    # Python 3.12 and returns a naive value that is easy to misinterpret.
    expire = datetime.now(timezone.utc) + timedelta(days=7)
    to_encode.update({"exp": expire, "type": "refresh"})
    return jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
# Synchronize system time
# NOTE(review): ntpdate and the ntp service may not be installed on hosts that
# use chrony or systemd-timesyncd - confirm which time service the host runs.
sudo ntpdate -s time.nist.gov

# Configure NTP
# (enable: start the service at boot; start: start it now)
sudo systemctl enable ntp
sudo systemctl start ntp

Issue: Permission denied errors

Symptoms:
- 403 Forbidden responses
- Access denied messages
- Role-based access failures

Diagnosis:

# Check user roles
# (lists only the username and roles fields of every document in the users
# collection of the federated_learning database)
docker exec mongodb mongosh federated_learning --eval "db.users.find({}, {username: 1, roles: 1})"

# Verify permission configuration
# (prints the ROLE_PERMISSIONS object imported from app.core.auth inside the
# backend container)
docker exec backend-fastapi python -c "
from app.core.auth import ROLE_PERMISSIONS
print(ROLE_PERMISSIONS)
"

Solutions:

# Update user roles
async def update_user_role(user_id: str, new_role: str):
    """Replace the roles of the user identified by ``user_id`` with ``[new_role]``."""
    users = db.users
    await users.update_one(
        {"_id": ObjectId(user_id)},
        {"$set": {"roles": [new_role]}},
    )

# Check permissions
def has_permission(user_roles: List[str], required_permission: str) -> bool:
    """Return True if at least one of ``user_roles`` grants ``required_permission``.

    Roles that have no entry in ROLE_PERMISSIONS grant nothing.
    """
    return any(
        required_permission in ROLE_PERMISSIONS.get(role_name, [])
        for role_name in user_roles
    )

5. Monitoring and Observability Issues

Issue: Missing metrics or traces

Symptoms:
- Empty Grafana dashboards
- No trace data in Tempo
- Missing telemetry data

Diagnosis:

# Check OpenTelemetry collector
docker-compose logs otel-collector

# Verify endpoints
# 4317 is the OTLP/gRPC port: it does not serve HTTP paths such as
# /v1/traces, so only check that it accepts connections.
nc -zv localhost 4317
# 4318 is the OTLP/HTTP port. /v1/traces and /v1/metrics accept POST only, so
# send an empty export request; -i prints the status line (expect HTTP 200).
curl -i -X POST -H "Content-Type: application/json" \
    -d '{"resourceSpans":[]}' http://localhost:4318/v1/traces
curl -i -X POST -H "Content-Type: application/json" \
    -d '{"resourceMetrics":[]}' http://localhost:4318/v1/metrics

# Check Grafana data sources
# (admin:admin are the credentials used throughout this guide; use the real
# ones if they were changed)
curl -u admin:admin http://localhost:3001/api/datasources

Solutions:

# Verify OTEL setup
# (Python; run it inside the application process or an equivalent shell)
from opentelemetry import trace, metrics

# Check if providers are set
tracer_provider = trace.get_tracer_provider()
meter_provider = metrics.get_meter_provider()

# Printing the objects shows which provider classes are currently installed.
print(f"Tracer: {tracer_provider}")
print(f"Meter: {meter_provider}")
# Test collector connectivity
# (shell commands; they probe ports 4317 and 4318 of the otel-collector
# service from inside the backend container)
docker exec backend-fastapi nc -zv otel-collector 4317
docker exec backend-fastapi nc -zv otel-collector 4318

# Check collector configuration
docker exec otel-collector cat /etc/otel-collector-config.yaml

Emergency Procedures

System Recovery

Complete System Reset

#!/bin/bash
# emergency-reset.sh - Complete system reset
#
# Asks for confirmation, then stops every Compose service, deletes the
# project's volumes (all stored data), prunes unused volumes, networks and
# images, rebuilds the images without cache and starts the "orchestrator"
# profile. Exits non-zero if the rebuild or the restart fails.
#
# NOTE: the prune commands act on the whole Docker host, so unused volumes,
# networks and images that belong to other projects are removed as well.

echo "WARNING: This will destroy all data. Continue? (y/N)"
read -r response

if [[ "$response" =~ ^[Yy]$ ]]; then
    echo "Stopping all services..."
    # --volumes also deletes the named volumes declared in the Compose file.
    # On Docker Engine 23.0 and later `docker volume prune` removes only
    # anonymous volumes, so without this flag the data would survive the reset.
    docker-compose --profile "*" down --volumes --remove-orphans

    echo "Removing volumes..."
    docker volume prune -f

    echo "Removing networks..."
    docker network prune -f

    echo "Removing images..."
    docker image prune -a -f

    echo "Rebuilding system..."
    if ! docker-compose build --no-cache; then
        echo "Build failed; services were not started" >&2
        exit 1
    fi
    if ! docker-compose --profile orchestrator up -d; then
        echo "Failed to start services" >&2
        exit 1
    fi

    echo "System reset complete"
else
    echo "Reset cancelled"
fi

Data Backup and Restore

#!/bin/bash
# backup-restore.sh - Data backup and restore procedures
#
# Usage:
#   backup-restore.sh backup
#   backup-restore.sh restore <backup-file>
#
# Run it from the project root: configuration paths (backend/ansible,
# fl-core) are relative to the current directory, and archives are created
# in / extracted into the current directory.

# die MESSAGE - print MESSAGE to stderr and abort with status 1.
die() {
    echo "$*" >&2
    exit 1
}

# Dump MongoDB and copy the configuration into backup_<timestamp>.tar.gz.
# Aborts without producing an archive if the MongoDB dump fails.
backup_data() {
    local backup_dir
    backup_dir="backup_$(date +%Y%m%d_%H%M%S)"
    mkdir -p "$backup_dir" || die "Cannot create directory: $backup_dir"

    echo "Backing up MongoDB..."
    # Start from an empty dump directory so files left by an earlier run do
    # not end up in this backup.
    docker exec mongodb rm -rf /tmp/backup
    if ! docker exec mongodb mongodump --out /tmp/backup; then
        rm -rf -- "${backup_dir:?}"
        die "mongodump failed; no backup created"
    fi
    if ! docker cp mongodb:/tmp/backup "$backup_dir/mongodb"; then
        rm -rf -- "${backup_dir:?}"
        die "Copying the dump out of the container failed; no backup created"
    fi

    echo "Backing up configuration files..."
    cp -r backend/ansible/inventory "$backup_dir/" \
        || echo "Warning: backend/ansible/inventory was not backed up" >&2
    # The ML project is optional, so its absence is not an error.
    cp -r fl-core/mlproject "$backup_dir/" 2>/dev/null || true

    echo "Creating archive..."
    # Keep the working directory if the archive could not be written.
    tar -czf "${backup_dir}.tar.gz" "$backup_dir" \
        || die "Failed to create ${backup_dir}.tar.gz; data left in $backup_dir"
    rm -rf -- "${backup_dir:?}"

    echo "Backup created: ${backup_dir}.tar.gz"
}

# Restore MongoDB and the configuration from the archive given as $1.
restore_data() {
    local backup_file=$1
    local archive_name backup_dir

    if [ ! -f "$backup_file" ]; then
        echo "Backup file not found: $backup_file"
        exit 1
    fi

    echo "Extracting backup..."
    tar -xzf "$backup_file" || die "Failed to extract: $backup_file"
    # The archive unpacks into the current directory as backup_<timestamp>/,
    # wherever the archive itself is stored, so drop any leading directories
    # before stripping the extension.
    archive_name=${backup_file##*/}
    backup_dir=${archive_name%.tar.gz}
    [ -d "$backup_dir/mongodb" ] \
        || die "Archive does not contain $backup_dir/mongodb"

    echo "Restoring MongoDB..."
    # Remove leftovers so docker cp creates /tmp/restore instead of nesting
    # the dump inside an existing directory.
    docker exec mongodb rm -rf /tmp/restore
    docker cp "$backup_dir/mongodb" mongodb:/tmp/restore \
        || die "Copying the dump into the container failed"
    docker exec mongodb mongorestore /tmp/restore \
        || die "mongorestore failed; extracted data left in $backup_dir"

    echo "Restoring configuration..."
    # Both parts are optional in a backup, so missing directories are ignored.
    cp -r "$backup_dir/inventory" backend/ansible/ 2>/dev/null || true
    cp -r "$backup_dir/mlproject" fl-core/ 2>/dev/null || true

    echo "Restore complete"
    rm -rf -- "${backup_dir:?}"
}

case "$1" in
    backup)
        backup_data
        ;;
    restore)
        restore_data "$2"
        ;;
    *)
        echo "Usage: $0 {backup|restore <backup-file>}"
        exit 1
        ;;
esac

Incident Response

Security Incident Response

#!/bin/bash
# security-incident.sh - Security incident response
#
# Pauses the backend, blocks its port, collects evidence into ./evidence,
# prints possible indicators of compromise and flags every user for a
# password reset.
# NOTE(review): iptables and reading /var/log/auth.log normally require root;
# run the script with sufficient privileges.

echo "=== SECURITY INCIDENT RESPONSE ==="

# 1. Isolate affected systems
echo "1. Isolating systems..."
docker-compose pause backend-fastapi
iptables -A INPUT -p tcp --dport 8000 -j DROP
# Traffic to a port published by Docker is forwarded to the container and
# does not pass the INPUT chain; DOCKER-USER is the chain Docker reserves for
# user rules that must apply to such traffic.
iptables -I DOCKER-USER -p tcp -m conntrack --ctorigdstport 8000 -j DROP

# 2. Collect evidence
echo "2. Collecting evidence..."
# The evidence directory must exist before anything is written into it.
if ! mkdir -p evidence; then
    echo "Cannot create evidence directory" >&2
    exit 1
fi
./collect-logs.sh
cp -r /var/log/auth.log evidence/
netstat -tulpn > evidence/network-connections.txt

# 3. Check for indicators of compromise
echo "3. Checking for IOCs..."
# Lists Python files modified more recently than the reference file.
# NOTE(review): /tmp/last-known-good must already exist inside the backend
# container - confirm how and when it is created.
docker exec backend-fastapi find /app -name "*.py" -newer /tmp/last-known-good
# Query the application database explicitly; without a database name mongosh
# would search the users collection of its default database.
# Adjust the date to the start of the suspected incident window.
docker exec mongodb mongosh federated_learning --eval "db.users.find({created_at: {\$gte: new Date('2024-01-01')}})"

# 4. Reset credentials
echo "4. Resetting credentials..."
docker exec mongodb mongosh federated_learning --eval "
db.users.updateMany({}, {\$set: {password_reset_required: true}})
"

# 5. Update security measures
echo "5. Updating security..."
# Force password reset, enable MFA, update firewall rules

echo "Incident response complete. Review evidence/ directory."

Maintenance Procedures

Regular Maintenance Tasks

Daily Maintenance

#!/bin/bash
# daily-maintenance.sh
#
# Daily housekeeping: health check, pruning of old application logs, package
# updates, disk-space warning and a data backup. Each step runs even if an
# earlier one fails.

# Check system health
./health-check.sh

# Clean up old logs
# Only the platform's own log directory is pruned; the rest of /var/log
# belongs to other services and to the system's log rotation.
if [ -d /var/log/federated-learning ]; then
    find /var/log/federated-learning -type f -name "*.log" -mtime +7 -delete
fi

# Update system packages
# apt-get is used because apt is intended for interactive use and warns that
# its command-line interface is not stable when used in scripts.
sudo apt-get update && sudo apt-get upgrade -y

# Check disk space
# -P keeps every filesystem on a single line. NR > 1 skips the header and
# "$5 + 0" turns a value such as "85%" into the number 85; comparing the raw
# field would be a string comparison and give wrong results.
df -hP | awk 'NR > 1 && $5 + 0 > 80 {print "WARNING: " $0}'

# Backup critical data
./backup-restore.sh backup

Weekly Maintenance

#!/bin/bash
# weekly-maintenance.sh
#
# Weekly housekeeping: Docker cleanup, MongoDB maintenance, image refresh and
# a resource usage snapshot.

# Docker cleanup
# NOTE(review): both prune commands act on the whole Docker host. Volumes
# that no container uses at that moment can be deleted - make sure all
# services are running before this script is executed.
docker system prune -f
docker volume prune -f

# Database maintenance
# NOTE(review): the reIndex command is deprecated in recent MongoDB releases
# and restricted to standalone instances - confirm for your server version.
docker exec mongodb mongosh federated_learning --eval "
db.runCommand({compact: 'training_jobs'});
db.runCommand({reIndex: 'users'});
"

# Security updates
# NOTE(review): pulling and rebuilding does not replace running containers;
# presumably a `docker-compose up -d` is needed afterwards - confirm.
docker-compose pull
docker-compose build --no-cache

# Performance analysis
# (one snapshot of container resource usage, written to a dated report file)
docker stats --no-stream > performance-report-$(date +%Y%m%d).txt

Next: Continue to API Reference for comprehensive API documentation.