Commit 89fefc11 authored by Sören Wacker's avatar Sören Wacker
Browse files

fix multi-GPU: add NCCL_P2P_DISABLE=1 for cross-NUMA GPUs, use CIFAR-10/ResNet18

parent bc722063
Loading
Loading
Loading
Loading
Loading
+28 −0
Original line number Diff line number Diff line
@@ -57,3 +57,31 @@ quota -s
1. Check error file: `cat <jobname>_<jobid>.err`
2. Verify modules load correctly: `module load 2025/gpu cuda/12.9`
3. Check working directory in job script uses `$SLURM_SUBMIT_DIR`

## Multi-GPU training issues

### Training hangs with multiple GPUs

**Symptoms:** Training hangs after "Initializing distributed" or "All distributed processes registered". NCCL all_reduce operations never complete.

**Cause:** DAIC GPU nodes have GPUs on different NUMA nodes (CPU sockets). NCCL P2P (peer-to-peer) communication fails between GPUs that aren't directly connected.

**Solution:** Add this to your job script:

```bash
export NCCL_P2P_DISABLE=1
```

This forces NCCL to use its shared-memory transport instead of P2P; unlike P2P, shared memory works across NUMA boundaries.

### Verify GPU topology

Check how GPUs are connected:

```bash
nvidia-smi topo -m
```

If you see `SYS` between GPUs (not `NV#` for NVLink), you need `NCCL_P2P_DISABLE=1`.

See [Multi-GPU Training](/tutorials/multi-gpu/#nccl-configuration-on-daic) for details.
+4 −0
Original line number Diff line number Diff line
@@ -35,6 +35,10 @@ echo "CPUs: $SLURM_CPUS_PER_TASK"
echo "Start: $(date)"
echo "========================================"

# Fix for cross-NUMA GPU communication on DAIC
# P2P doesn't work between GPUs on different CPU sockets
export NCCL_P2P_DISABLE=1

# Calculate number of GPUs and workers
NUM_GPUS=$(echo $CUDA_VISIBLE_DEVICES | tr ',' '\n' | wc -l)
WORKERS_PER_GPU=$((SLURM_CPUS_PER_TASK / NUM_GPUS))
+4 −0
Original line number Diff line number Diff line
@@ -36,6 +36,10 @@ echo "CPUs per task: $SLURM_CPUS_PER_TASK"
echo "Start: $(date)"
echo "========================================"

# Fix for cross-NUMA GPU communication on DAIC
# P2P doesn't work between GPUs on different CPU sockets
export NCCL_P2P_DISABLE=1

# Set master address and port
export MASTER_ADDR=$(hostname)
export MASTER_PORT=${MASTER_PORT:-29500}
+30 −24
Original line number Diff line number Diff line
#!/bin/bash
#SBATCH --job-name=lightning-multi-gpu
#SBATCH --job-name=lightning-cifar10
# SBATCH --account=<your-account>  # Uncomment and set if required
#SBATCH --partition=all
#SBATCH --time=00:30:00
#SBATCH --time=01:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH --cpus-per-task=4
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=32G
#SBATCH --gres=gpu:2
#SBATCH --output=lightning_%j.out
@@ -18,12 +18,21 @@ set -e
module purge
module load 2025/gpu cuda/12.9

# Set up UV paths (adjust if UV is installed elsewhere)
# Set up UV paths
export PATH="$HOME/linuxhome/.local/bin:$PATH"
export UV_CACHE_DIR="$HOME/linuxhome/.cache/uv"
export UV_PYTHON_INSTALL_DIR="$HOME/linuxhome/.local/share/uv/python"

# Navigate to script directory (SLURM_SUBMIT_DIR is where sbatch was called)
# Disable SLURM auto-detection in Lightning
# This lets Lightning spawn processes itself
unset SLURM_NTASKS
unset SLURM_JOB_NAME

# Fix for cross-NUMA GPU communication
# P2P doesn't work between GPUs on different CPU sockets
export NCCL_P2P_DISABLE=1

# Navigate to script directory
cd "$SLURM_SUBMIT_DIR"

# Print job info
@@ -31,34 +40,31 @@ echo "========================================"
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $(hostname)"
echo "GPUs: $CUDA_VISIBLE_DEVICES"
echo "CPUs: $SLURM_CPUS_PER_TASK"
echo "Start: $(date)"
echo "========================================"

# Calculate workers per GPU
NUM_GPUS=$(echo $CUDA_VISIBLE_DEVICES | tr ',' '\n' | wc -l)
WORKERS_PER_GPU=$((SLURM_CPUS_PER_TASK / NUM_GPUS))

echo "Number of GPUs: $NUM_GPUS"
echo "Workers per GPU: $WORKERS_PER_GPU"
echo "========================================"

# Install dependencies if needed
if [ ! -d ".venv" ]; then
    echo "Installing dependencies..."
    uv sync
fi

# Pre-download data on rank 0 only to avoid race conditions
uv run python -c "from torchvision import datasets; datasets.MNIST('${TMPDIR:-/tmp}/mnist', train=True, download=True); datasets.MNIST('${TMPDIR:-/tmp}/mnist', train=False, download=True)"
# Pre-download CIFAR-10 data
DATA_DIR="${TMPDIR:-/tmp}/cifar10"
echo "Downloading CIFAR-10 to $DATA_DIR..."
uv run python -c "from torchvision import datasets; datasets.CIFAR10('$DATA_DIR', train=True, download=True); datasets.CIFAR10('$DATA_DIR', train=False, download=True)"

# Get number of GPUs
NUM_GPUS=$(echo $CUDA_VISIBLE_DEVICES | tr ',' '\n' | wc -l)
echo "Launching training on $NUM_GPUS GPUs..."

# Run training with srun (SLURM launches one process per GPU)
srun uv run python train.py \
    --devices 1 \
    --epochs 5 \
    --batch-size 64 \
    --num-workers "$WORKERS_PER_GPU" \
    --data-dir "${TMPDIR:-/tmp}/mnist"
# Let Lightning handle multi-GPU via DDP
uv run python train.py \
    --devices $NUM_GPUS \
    --epochs 10 \
    --batch-size 128 \
    --num-workers 4 \
    --data-dir "$DATA_DIR"

echo "========================================"
echo "End: $(date)"
+59 −0
Original line number Diff line number Diff line
#!/bin/bash
# Single-GPU PyTorch Lightning baseline: trains ResNet on CIFAR-10 with one GPU.
# Submit from the project directory (the one containing train.py) via sbatch.
#SBATCH --job-name=lightning-1gpu
# SBATCH --account=<your-account>  # Uncomment and set if required
#SBATCH --partition=all
#SBATCH --time=01:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=4
#SBATCH --mem=16G
#SBATCH --gres=gpu:1
#SBATCH --output=lightning_1gpu_%j.out
#SBATCH --error=lightning_1gpu_%j.err

# Strict mode: exit on error, on unset variables, and on pipeline failures.
# 'set -u' also makes the script fail fast if run outside of SLURM (the
# SLURM_* variables below would be unset).
set -euo pipefail

# Load modules
module purge
module load 2025/gpu cuda/12.9

# Set up UV paths (adjust if UV is installed elsewhere)
export PATH="$HOME/linuxhome/.local/bin:$PATH"
export UV_CACHE_DIR="$HOME/linuxhome/.cache/uv"
export UV_PYTHON_INSTALL_DIR="$HOME/linuxhome/.local/share/uv/python"

# Navigate to script directory (SLURM_SUBMIT_DIR is where sbatch was called)
cd "$SLURM_SUBMIT_DIR"

# Print job info
echo "========================================"
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $(hostname)"
echo "GPUs: $CUDA_VISIBLE_DEVICES"
echo "CPUs: $SLURM_CPUS_PER_TASK"
echo "Start: $(date)"
echo "========================================"

# Install dependencies on first run only ('uv sync' creates .venv)
if [[ ! -d .venv ]]; then
    echo "Installing dependencies..."
    uv sync
fi

# Pre-download CIFAR-10 data so the training dataloader workers never
# race each other on the download
DATA_DIR="${TMPDIR:-/tmp}/cifar10"
echo "Downloading CIFAR-10 to $DATA_DIR..."
uv run python -c "from torchvision import datasets; datasets.CIFAR10('$DATA_DIR', train=True, download=True); datasets.CIFAR10('$DATA_DIR', train=False, download=True)"

# Run training - single GPU baseline
uv run python train.py \
    --devices 1 \
    --epochs 10 \
    --batch-size 128 \
    --num-workers "$SLURM_CPUS_PER_TASK" \
    --data-dir "$DATA_DIR"

echo "========================================"
echo "End: $(date)"
echo "========================================"
Loading