Commit 89fefc11 authored by Sören Wacker's avatar Sören Wacker
Browse files

fix multi-GPU: add NCCL_P2P_DISABLE=1 for cross-NUMA GPUs, use CIFAR-10/ResNet18

parent bc722063
Loading
Loading
Loading
Loading
Loading
+28 −0
Original line number Diff line number Diff line
@@ -57,3 +57,31 @@ quota -s
1. Check error file: `cat <jobname>_<jobid>.err`
2. Verify modules load correctly: `module load 2025/gpu cuda/12.9`
3. Check working directory in job script uses `$SLURM_SUBMIT_DIR`

## Multi-GPU training issues

### Training hangs with multiple GPUs

**Symptoms:** Training hangs after "Initializing distributed" or "All distributed processes registered". NCCL all_reduce operations never complete.

**Cause:** DAIC GPU nodes have GPUs on different NUMA nodes (CPU sockets). NCCL P2P (peer-to-peer) communication fails between GPUs that aren't directly connected.

**Solution:** Add this to your job script:

```bash
export NCCL_P2P_DISABLE=1
```

This forces NCCL to use its shared-memory transport instead of P2P; unlike P2P, shared memory works across NUMA boundaries.

### Verify GPU topology

Check how GPUs are connected:

```bash
nvidia-smi topo -m
```

If you see `SYS` between GPUs (not `NV#` for NVLink), you need `NCCL_P2P_DISABLE=1`.

See [Multi-GPU Training](/tutorials/multi-gpu/#nccl-configuration-on-daic) for details.
+4 −0
Original line number Diff line number Diff line
@@ -35,6 +35,10 @@ echo "CPUs: $SLURM_CPUS_PER_TASK"
echo "Start: $(date)"
echo "========================================"

# Fix for cross-NUMA GPU communication on DAIC
# P2P doesn't work between GPUs on different CPU sockets
export NCCL_P2P_DISABLE=1

# Calculate number of GPUs and workers
NUM_GPUS=$(echo $CUDA_VISIBLE_DEVICES | tr ',' '\n' | wc -l)
WORKERS_PER_GPU=$((SLURM_CPUS_PER_TASK / NUM_GPUS))
+4 −0
Original line number Diff line number Diff line
@@ -36,6 +36,10 @@ echo "CPUs per task: $SLURM_CPUS_PER_TASK"
echo "Start: $(date)"
echo "========================================"

# Fix for cross-NUMA GPU communication on DAIC
# P2P doesn't work between GPUs on different CPU sockets
export NCCL_P2P_DISABLE=1

# Set master address and port
export MASTER_ADDR=$(hostname)
export MASTER_PORT=${MASTER_PORT:-29500}
+30 −24
Original line number Diff line number Diff line
#!/bin/bash
#SBATCH --job-name=lightning-multi-gpu
#SBATCH --job-name=lightning-cifar10
# SBATCH --account=<your-account>  # Uncomment and set if required
#SBATCH --partition=all
#SBATCH --time=00:30:00
#SBATCH --time=01:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH --cpus-per-task=4
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=32G
#SBATCH --gres=gpu:2
#SBATCH --output=lightning_%j.out
@@ -18,12 +18,21 @@ set -e
module purge
module load 2025/gpu cuda/12.9

# Set up UV paths (adjust if UV is installed elsewhere)
# Set up UV paths
export PATH="$HOME/linuxhome/.local/bin:$PATH"
export UV_CACHE_DIR="$HOME/linuxhome/.cache/uv"
export UV_PYTHON_INSTALL_DIR="$HOME/linuxhome/.local/share/uv/python"

# Navigate to script directory (SLURM_SUBMIT_DIR is where sbatch was called)
# Disable SLURM auto-detection in Lightning
# This lets Lightning spawn processes itself
unset SLURM_NTASKS
unset SLURM_JOB_NAME

# Fix for cross-NUMA GPU communication
# P2P doesn't work between GPUs on different CPU sockets
export NCCL_P2P_DISABLE=1

# Navigate to script directory
cd "$SLURM_SUBMIT_DIR"

# Print job info
@@ -31,34 +40,31 @@ echo "========================================"
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $(hostname)"
echo "GPUs: $CUDA_VISIBLE_DEVICES"
echo "CPUs: $SLURM_CPUS_PER_TASK"
echo "Start: $(date)"
echo "========================================"

# Calculate workers per GPU
NUM_GPUS=$(echo $CUDA_VISIBLE_DEVICES | tr ',' '\n' | wc -l)
WORKERS_PER_GPU=$((SLURM_CPUS_PER_TASK / NUM_GPUS))

echo "Number of GPUs: $NUM_GPUS"
echo "Workers per GPU: $WORKERS_PER_GPU"
echo "========================================"

# Install dependencies if needed
if [ ! -d ".venv" ]; then
    echo "Installing dependencies..."
    uv sync
fi

# Pre-download data on rank 0 only to avoid race conditions
uv run python -c "from torchvision import datasets; datasets.MNIST('${TMPDIR:-/tmp}/mnist', train=True, download=True); datasets.MNIST('${TMPDIR:-/tmp}/mnist', train=False, download=True)"
# Pre-download CIFAR-10 data
DATA_DIR="${TMPDIR:-/tmp}/cifar10"
echo "Downloading CIFAR-10 to $DATA_DIR..."
uv run python -c "from torchvision import datasets; datasets.CIFAR10('$DATA_DIR', train=True, download=True); datasets.CIFAR10('$DATA_DIR', train=False, download=True)"

# Get number of GPUs
NUM_GPUS=$(echo $CUDA_VISIBLE_DEVICES | tr ',' '\n' | wc -l)
echo "Launching training on $NUM_GPUS GPUs..."

# Run training with srun (SLURM launches one process per GPU)
srun uv run python train.py \
    --devices 1 \
    --epochs 5 \
    --batch-size 64 \
    --num-workers "$WORKERS_PER_GPU" \
    --data-dir "${TMPDIR:-/tmp}/mnist"
# Let Lightning handle multi-GPU via DDP
uv run python train.py \
    --devices $NUM_GPUS \
    --epochs 10 \
    --batch-size 128 \
    --num-workers 4 \
    --data-dir "$DATA_DIR"

echo "========================================"
echo "End: $(date)"
+59 −0
Original line number Diff line number Diff line
#!/bin/bash
# Single-GPU PyTorch Lightning baseline: trains ResNet on CIFAR-10 with one GPU.
# Submit from the project directory (the one containing train.py) via sbatch.
#SBATCH --job-name=lightning-1gpu
# SBATCH --account=<your-account>  # Uncomment and set if required
#SBATCH --partition=all
#SBATCH --time=01:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=4
#SBATCH --mem=16G
#SBATCH --gres=gpu:1
#SBATCH --output=lightning_1gpu_%j.out
#SBATCH --error=lightning_1gpu_%j.err

# Strict mode: exit on error, on unset variables, and on pipeline failures.
# 'set -u' also makes the script fail fast if run outside of SLURM (the
# SLURM_* variables below would be unset).
set -euo pipefail

# Load modules
module purge
module load 2025/gpu cuda/12.9

# Set up UV paths (adjust if UV is installed elsewhere)
export PATH="$HOME/linuxhome/.local/bin:$PATH"
export UV_CACHE_DIR="$HOME/linuxhome/.cache/uv"
export UV_PYTHON_INSTALL_DIR="$HOME/linuxhome/.local/share/uv/python"

# Navigate to script directory (SLURM_SUBMIT_DIR is where sbatch was called)
cd "$SLURM_SUBMIT_DIR"

# Print job info
echo "========================================"
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $(hostname)"
echo "GPUs: $CUDA_VISIBLE_DEVICES"
echo "CPUs: $SLURM_CPUS_PER_TASK"
echo "Start: $(date)"
echo "========================================"

# Install dependencies on first run only ('uv sync' creates .venv)
if [[ ! -d .venv ]]; then
    echo "Installing dependencies..."
    uv sync
fi

# Pre-download CIFAR-10 data so the training dataloader workers never
# race each other on the download
DATA_DIR="${TMPDIR:-/tmp}/cifar10"
echo "Downloading CIFAR-10 to $DATA_DIR..."
uv run python -c "from torchvision import datasets; datasets.CIFAR10('$DATA_DIR', train=True, download=True); datasets.CIFAR10('$DATA_DIR', train=False, download=True)"

# Run training - single GPU baseline
uv run python train.py \
    --devices 1 \
    --epochs 10 \
    --batch-size 128 \
    --num-workers "$SLURM_CPUS_PER_TASK" \
    --data-dir "$DATA_DIR"

echo "========================================"
echo "End: $(date)"
echo "========================================"
Loading