Day: October 9, 2025

admin | October 9, 2025

Slurm Job: Cluster Sampler & Diagnostics (One-Click)

This job collects GPU/CPU, memory, NUMA, PCIe/NVLink, NIC/IB, and optional Nsight/NCCL/iperf3 telemetry across all allocated nodes while your workload runs, then bundles everything into a single .tgz.

Usage: Save as profile_env.slurm and submit:

sbatch --export=ALL,WORKLOAD="torchrun --nproc_per_node=8 train.py --cfg config.yaml",ENABLE_NSYS=1,RUN_NCCL_TESTS=1,DURATION=1800 profile_env.slurm

#!/usr/bin/env bash
#
# profile_env.slurm — cluster-wide performance sampler & diagnostics
#
#SBATCH -J prof-playbook
#SBATCH -o prof-%x-%j.out
#SBATCH -e prof-%x-%j.err
#SBATCH –time=01:00:00
#SBATCH –nodes=1
#SBATCH –ntasks-per-node=1
#SBATCH –gres=gpu:1
#SBATCH –cpus-per-task=8
## Uncomment/adjust for your site:
## #SBATCH –partition=gpu
## #SBATCH –qos=normal

set -euo pipefail

#############################
# Tunables (override via: sbatch –export=ALL,WORKLOAD=”python train.py”,DURATION=900 …)
#############################
WORKLOAD=”${WORKLOAD:-}”                 # e.g., “python train.py”; if empty, uses a tiny CUDA sample
DURATION=”${DURATION:-600}”              # seconds to sample (upper bound)
SAMPLE_INT=”${SAMPLE_INT:-1}”            # sampler interval (seconds)
ENABLE_NSYS=”${ENABLE_NSYS:-0}”          # 1 to record short Nsight Systems traces per node
NSYS_SECONDS=”${NSYS_SECONDS:-45}”       # Nsight trace duration
RUN_NCCL_TESTS=”${RUN_NCCL_TESTS:-0}”    # 1 to run nccl-tests all_reduce_perf
NCCL_TEST_BIN=”${NCCL_TEST_BIN:-all_reduce_perf}”  # path or in $PATH
IPERF_SERVER=”${IPERF_SERVER:-}”         # host/IP to test NIC TCP throughput (iperf3 server required)
OUTROOT=”${OUTROOT:-$PWD}”
TAG=”${TAG:-$(date +%Y%m%d-%H%M%S)}”
OUTDIR=”$OUTROOT/prof-${SLURM_JOB_ID:-nojob}-$TAG”

mkdir -p “$OUTDIR”

echo “==[JOB]==========================================================”
echo ” JOBID        : ${SLURM_JOB_ID:-local}”
echo ” NNODES/NTASKS: ${SLURM_NNODES:-1} / ${SLURM_NTASKS:-1}”
echo ” GPUS         : ${SLURM_GPUS:-unknown} (CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES)”
echo ” WORKLOAD     : ${WORKLOAD:-<built-in demo>}”
echo ” DURATION     : $DURATION  s”
echo ” SAMPLE_INT   : $SAMPLE_INT s”
echo ” ENABLE_NSYS  : $ENABLE_NSYS  (for $NSYS_SECONDS s)”
echo ” RUN_NCCL     : $RUN_NCCL_TESTS”
echo ” IPERF_SERVER : ${IPERF_SERVER:-<none>}”
echo ” OUTDIR       : $OUTDIR”
echo “==================================================================”

# Helper to run a command on every allocated node
node_run() { local cmd=”$1″; srun –ntasks-per-node=1 –nodes=”${SLURM_NNODES:-1}” –label bash -lc “$cmd”; }

# ——– Per-node init: inventory & topology ——–
node_run ‘bash -lc ”
set -euo pipefail
NDIR=\”‘$OUTDIR’/\${HOSTNAME}\”
mkdir -p \”\$NDIR\”
{
  echo \”# Slurm/Env\”
  env | egrep \”SLURM|CUDA|NCCL\” || true
  echo
  echo \”# System\”
  uname -a
  lsb_release -a 2>/dev/null || cat /etc/os-release || true
  date -Iseconds
  echo
  echo \”# CPU/NUMA\”
  lscpu || true
  numactl –hardware || true
} > \”\$NDIR/env.txt\”

if command -v nvidia-smi >/dev/null 2>&1; then
  nvidia-smi -L > \”\$NDIR/gpu_list.txt\” || true
  nvidia-smi topo -m > \”\$NDIR/gpu_topo.txt\” || true
  nvidia-smi -q -x > \”\$NDIR/nvidia_smi.xml\” || nvidia-smi -q > \”\$NDIR/nvidia_smi.txt\” || true
  nvidia-smi pmon -c 1 -s um > \”\$NDIR/pmon_header.txt\” || true
fi

(lspci -nn | egrep -i \”nvidia|mellanox|ethernet|infiniband|network\” || true) > \”\$NDIR/lspci.txt\”
if command -v ibstat >/dev/null 2>&1; then ibstat > \”\$NDIR/ibstat.txt\” || true; fi
if command -v ibv_devinfo >/dev/null 2>&1; then ibv_devinfo > \”\$NDIR/ibv_devinfo.txt\” || true; fi

ip -br link show up | awk ‘\”$1!=\”lo\”{print $1}’\” > \”\$NDIR/ifaces.txt\” || true
while read -r IFACE; do
  (ethtool \”\$IFACE\” && ethtool -k \”\$IFACE\” && ethtool -S \”\$IFACE\” | egrep -i \”err|drop|disc|pause|fcs|crc\” || true) > \”\$NDIR/ethtool_\${IFACE}.txt\” 2>&1 || true
done < \"\$NDIR/ifaces.txt\"
"'

# -------- Start background samplers on each node --------
node_run 'bash -lc "
set -euo pipefail
NDIR=\"'$OUTDIR'/\${HOSTNAME}\"
mkdir -p \"\$NDIR\"
echo $$ > \”\$NDIR/sampler_parent.pid\”

if command -v nvidia-smi >/dev/null 2>&1; then
  (nvidia-smi dmon -s pucvmet -d ‘$SAMPLE_INT’ > \”\$NDIR/gpu_dmon.log\”) &
  echo $! > \”\$NDIR/gpu_dmon.pid\”
  ( (nvidia-smi nvlink -s; while true; do nvidia-smi nvlink -s; sleep 10; done) > \”\$NDIR/nvlink_watch.log\” 2>&1 ) &
  echo $! > \”\$NDIR/nvlink_watch.pid\”
fi

(command -v mpstat >/dev/null 2>&1 && mpstat -P ALL ‘$SAMPLE_INT’ > \”\$NDIR/mpstat.log\”) & echo $! > \”\$NDIR/mpstat.pid\” || true
(command -v pidstat >/dev/null 2>&1 && pidstat -u -r -d ‘$SAMPLE_INT’ > \”\$NDIR/pidstat.log\”) & echo $! > \”\$NDIR/pidstat.pid\” || true
(command -v vmstat  >/dev/null 2>&1 && vmstat ‘$SAMPLE_INT’ > \”\$NDIR/vmstat.log\”)  & echo $! > \”\$NDIR/vmstat.pid\”  || true
(command -v iostat  >/dev/null 2>&1 && iostat -xy ‘$SAMPLE_INT’ > \”\$NDIR/iostat.log\”)  & echo $! > \”\$NDIR/iostat.pid\”  || true

if command -v dcgmi >/dev/null 2>&1; then
  (dcgmi dmon -e 100 -d ‘$SAMPLE_INT’ > \”\$NDIR/dcgm_dmon.log\”) &
  echo $! > \”\$NDIR/dcgm_dmon.pid\”
fi

if command -v numastat >/dev/null 2>&1; then
  (while true; do
     for P in $(pgrep -f -n python || true); do numastat -p \$P; done
     sleep 10
   done > \”\$NDIR/numastat_watch.log\” 2>&1) &
  echo $! > \”\$NDIR/numastat_watch.pid\”
fi
“‘

cleanup() {
  node_run ‘bash -lc ”
    set -euo pipefail
    NDIR=\”‘$OUTDIR’/\${HOSTNAME}\”
    for f in gpu_dmon pidstat mpstat vmstat iostat dcgm_dmon nvlink_watch numastat_watch; do
      if [[ -f \”\$NDIR/\${f}.pid\” ]]; then kill \$(cat \”\$NDIR/\${f}.pid\”) 2>/dev/null || true; fi
    done
  “‘ || true
}
trap cleanup EXIT

# ——– Optional Nsight Systems short trace ——–
if [[ “$ENABLE_NSYS” -eq 1 ]] && command -v nsys >/dev/null 2>&1; then
  node_run ‘bash -lc ”
    set -euo pipefail
    NDIR=\”‘$OUTDIR’/\${HOSTNAME}\”
    mkdir -p \”\$NDIR\”
    nsys profile -t cuda,osrt,nvtx -o \”\$NDIR/nsys_\${HOSTNAME}\” –duration ‘$NSYS_SECONDS’ –stop-on-exit true –capture-range=none sleep ‘$NSYS_SECONDS’
  “‘
fi

# ——– Run workload (or a tiny CUDA demo) ——–
echo “== Running workload ==”
if [[ -z “$WORKLOAD” ]]; then
  node_run ‘bash -lc ”
    set -euo pipefail
    NDIR=\”‘$OUTDIR’/\${HOSTNAME}\”
    echo \”No WORKLOAD provided; running a small CUDA loop…\” | tee -a \”\$NDIR/workload.log\”
    python – <<'PY' || sleep 30
import torch, time
if torch.cuda.is_available():
    a=torch.randn((8192,8192),device='cuda'); b=torch.randn((8192,8192),device='cuda')
    for i in range(50):
        c=a@b; torch.cuda.synchronize()
        time.sleep(0.1)
else:
    time.sleep(30)
PY
  "'
else
  RUNWRK=$(printf '%q ' $WORKLOAD)
  node_run "bash -lc 'set -euo pipefail; NDIR=$OUTDIR/\${HOSTNAME}; echo Running: $RUNWRK | tee -a \"\$NDIR/workload.log\"; $RUNWRK |& tee -a \"\$NDIR/workload.log\"'"
fi
echo "== Workload section complete =="

# -------- Optional network & NCCL checks --------
if [[ -n "$IPERF_SERVER" ]] && command -v iperf3 >/dev/null 2>&1; then
  node_run “iperf3 -c $IPERF_SERVER -P 8 -t 30 | tee ‘$OUTDIR/\${HOSTNAME}/iperf3_\${HOSTNAME}.log'”
fi

if [[ “$RUN_NCCL_TESTS” -eq 1 ]]; then
  node_run “bash -lc ‘set -euo pipefail; NDIR=$OUTDIR/\${HOSTNAME}; if command -v $NCCL_TEST_BIN >/dev/null 2>&1; then $NCCL_TEST_BIN -b 8M -e 512M -f 2 -g \${SLURM_GPUS_PER_NODE:-1} | tee \”\$NDIR/nccl_all_reduce.log\”; else echo \”$NCCL_TEST_BIN not found\” | tee \”\$NDIR/nccl_all_reduce.log\”; fi'”
fi

# ——– Final snapshots & packaging ——–
node_run ‘bash -lc ”
set -euo pipefail
NDIR=\”‘$OUTDIR’/\${HOSTNAME}\”
if command -v nvidia-smi >/dev/null 2>&1; then
  nvidia-smi –query-gpu=index,utilization.gpu,utilization.memory,clocks.sm,clocks.mem,power.draw,temperature.gpu –format=csv -l 1 -f \”\$NDIR/nvidia_smi_final.csv\” -c 3 || true
fi
free -h > \”\$NDIR/free.txt\” || true
df -h   > \”\$NDIR/df.txt\” || true
{
  echo \”=== tail gpu_dmon ===\”; tail -n 30 \”\$NDIR/gpu_dmon.log\” 2>/dev/null || true
  echo
  echo \”=== tail pidstat ===\”; tail -n 30 \”\$NDIR/pidstat.log\” 2>/dev/null || true
} > \”\$NDIR/quick_summary.txt\” || true
“‘

cleanup

tar -C “$OUTROOT” -czf “$OUTDIR.tgz” “$(basename “$OUTDIR”)”
echo “Artifacts packaged at: $OUTDIR.tgz”
echo “Per-node logs under   : $OUTDIR/<hostname>/”
echo “Done.”
    </hostname></none></built-in>

Prefer a direct file? You can also grab the ready-made script: Download profile_env.slurm

admin October 9, 2025 HPCNo Comments »

admin | October 9, 2025

A practical, repeatable workflow for NVIDIA-GPU Linux clusters (Slurm/K8s or bare-metal) to pinpoint whether your bottleneck is GPU, CPU, memory bandwidth, or network

Profiling Playbook: Detect GPU/CPU, Memory Bandwidth, and Network Bottlenecks

A practical, repeatable workflow for NVIDIA-GPU Linux clusters (Slurm/K8s or bare-metal) to pinpoint whether your bottleneck is GPU, CPU, memory bandwidth, or network.

0) Prep: Make the Test Reproducible

Choose a workload: (a) your real training/inference job, plus (b) a couple of microbenchmarks.
Pin placement/affinity: match production (same container, CUDA/cuDNN, drivers, env vars, GPU/CPU affinity).
Record node info: driver, CUDA, GPU model, CPU model, NUMA, NIC, topology.

nvidia-smi; nvidia-smi topo -m
lscpu; numactl --hardware

1) GPU Profiling (Utilization, Kernels, Memory, Interconnect)

Quick Live View (low overhead)

# 1s sampling: Power (p) Util (u) Clocks (c) Mem util (v) Enc/Dec (e) PCIe/NVLink (t)
nvidia-smi dmon -s pucvmet

# More fields, CSV:
nvidia-smi --query-gpu=index,name,utilization.gpu,utilization.memory,clocks.sm,clocks.mem,power.draw,temperature.gpu,pcie.link.gen.current,pcie.link.width.current,clocks_throttle_reasons.active --format=csv -l 1

What to notice

utilization.gpu ~ 0–40% while job is “busy” → likely CPU or input (I/O) bound.
High memory util + low SM util → global memory bandwidth bound.
Power below expected / throttling active → power/thermal cap or app clocks.
PCIe gen/width lower than expected → host-device transfer bottleneck.

Deep Timeline (Nsight Systems → find where time is spent)

nsys profile -t cuda,osrt,nvtx,mpi --sample=process-tree -o /tmp/trace \
    --export=sqlite python train.py
# Open /tmp/trace.qdrep in Nsight Systems GUI, or analyze the sqlite export

Look for:

Long CPU gaps before kernels → dataloader/CPU stall.
CUDA memcpy / NCCL all-reduce dominating → I/O or network bottleneck.
Many short kernels with gaps → kernel launch overhead (try CUDA Graphs).

Kernel Efficiency (Nsight Compute → why GPU is slow)

ncu --set full --target-processes all -o /tmp/ncu python train.py
# Then: ncu --import /tmp/ncu.ncu-rep --csv --page summary

Signals:

Low/achieved SM occupancy & high dram__throughput vs arithmetic intensity → memory-bound kernels.
High barrier/serialization → reformulate kernels or change backend.

NVLink / PCIe Health

# NVLink counters (A100+/NVSwitch)
nvidia-smi nvlink -s
# Topology sanity:
nvidia-smi topo -m

If inter-GPU traffic stalls or retry errors climb, expect intra-node comms bottlenecks.

2) CPU & Memory-Bandwidth Profiling (Host Side)

Fast CPU View

mpstat -P ALL 1
pidstat -u -r -d 1 -p $(pgrep -n python)   # CPU, RSS, I/O per PID

High CPU% & run queue + GPU idle → CPU compute bound (augmentations, tokenization).
Low CPU% & waiting on I/O + GPU idle → storage or network input bottleneck.

NUMA Locality (critical for feeders/data loaders)

numactl -s
numastat -p $(pgrep -n python)  # remote vs local memory hits

Many remote hits → pin processes to closest NUMA node; bind NIC/GPU affinity.

Hardware Counters (perf) & Memory Bandwidth

# Whole process counters
perf stat -d -p $(pgrep -n python) -- sleep 30

# Hotspots (then open interactive report)
perf record -F 99 -g -p $(pgrep -n python) -- sleep 30
perf report

Low IPC + many L3/mem stalls → memory bandwidth bound on CPU. Validate with STREAM / Intel PCM:

# STREAM (approximate host RAM BW)
stream
# Intel PCM memory (Intel CPUs)
pcm-memory 1

3) Network Throughput/Latency (Intra & Inter-node)

Raw NIC Performance

# TCP test (adjust -P for parallel flows)
iperf3 -s   # on server
iperf3 -c <server> -P 8 -t 30
# For UDP or specific MTU/Jumbo: use -u and set mtu via ip link/ethtool

Compare results to NIC line-rate (e.g., 100/200/400GbE).

RDMA / InfiniBand (if applicable)

ibstat; ibv_devinfo
ib_write_bw -d mlx5_0 -F -q 4 -l 512 -s 8388608 -D 30
ib_send_bw  -d mlx5_0 -F -q 4 -l 512 -s 8388608 -D 30

If RDMA BW/latency is poor, check PFC/ECN, RoCE config, and mtu 9000 end-to-end.

Collective (NCCL) Reality Check

# From nccl-tests (build once)
./build/all_reduce_perf -b 8M -e 1G -f 2 -g 8   # intra-node
# Multi-node (via mpirun or torchrun)

Throughput far below expectation → network path/topology, or NCCL env (e.g., NCCL_IB, NCCL_NET_GDR_LEVEL, CollNet/NVLS).

NIC Counters / Driver

ethtool -S <iface> | egrep "err|drop|disc|pause"
ethtool -k <iface>   # offloads; ensure GRO/LRO settings suit your stack

Growing errors/pause frames → congestion, bad optics, or flow-control tuning.

4) Tie It Together with a Roofline View

Compute intensity (FLOPs/byte) vs achieved bandwidth quickly classifies memory-bound vs compute-bound. Use Nsight Compute’s roofline page for kernels; for end-to-end, annotate steps with NVTX and view in Nsight Systems.

5) Microbenchmarks to Isolate Layers

GPU math: HPL/HPL-AI, cuBLAS GEMM runner, nvidia/cuda-samples (matrixMulCUBLAS).
Host RAM BW: STREAM.
Disk I/O: fio (sequential vs random, queue depth).
Network: iperf3, ib_*_bw, NCCL tests.

If microbenchmarks are fine but the real job isn’t, the issue is software pipeline (dataloader, preprocessing, small batch, Python GIL, etc.).

6) Common Bottlenecks → Fixes

Symptom	Likely Bottleneck	Quick Fixes
GPU util low, CPU busy	CPU pipeline	Increase workers/prefetch, move aug to GPU (DALI), compile ops, pin threads/NUMA.
High GPU mem util, SM low	GPU mem-bound	Fuse kernels, better tensor layouts, mixed precision (bf16/fp16), larger batch if headroom.
NCCL all-reduce dominates	Network	Enable RDMA, tune NCCL env, jumbo MTU 9000, keep same switch tier, test CollNet/NVLS.
memcpy HtoD heavy	PCIe/host I/O	Page-locked buffers, async prefetch, increase batch queue, ensure max PCIe Gen/width.
Frequent GPU throttling	Power/Thermal	Raise power limit (if safe), fix cooling, set application clocks, check throttling reasons.
Remote NUMA hits high	NUMA	Bind processes to local NUMA of GPU/NIC, interleave wisely.

7) Optional: One-Node Sampler Script

Paste into profile.sh and run bash profile.sh python train.py.

#!/usr/bin/env bash
set -euo pipefail
APP="$@"  # e.g., python train.py

echo "== System =="
nvidia-smi --query-gpu=name,uuid,driver_version,pstate,pcie.link.gen.current,pcie.link.width.current --format=csv
lscpu | egrep 'Model name|Socket|NUMA|Thread|MHz'
echo

echo "== Start background samplers =="
(nvidia-smi dmon -s pucvmet -d 1 > /tmp/gpu_dmon.log) &
GPU_DMON_PID=$!
(pidstat -u -r -d 1 > /tmp/pidstat.log) &
PIDSTAT_PID=$!

echo "== Run workload =="
$APP || true

echo "== Cleanup =="
kill $GPU_DMON_PID $PIDSTAT_PID 2>/dev/null || true

echo "== Summaries =="
tail -n +1 /tmp/gpu_dmon.log | head
tail -n 20 /tmp/gpu_dmon.log
tail -n 20 /tmp/pidstat.log

8) HPE-Specific Checks (If Relevant)

HPE iLO/OneView: check thermal/power capping, fan curves, PSU headroom.
HPE Performance Cluster Manager / Cray: use built-in telemetry and fabric diagnostics.
BIOS: Performance power profile, NUMA exposed, deterministic turbo, PCIe Gen4/Gen5, Above 4G decoding on, SR-IOV/ATS if virtualized.

Need a tailored version? Tell me your GPU model(s), CPUs, NIC/fabric, batch size/model, and orchestration (Slurm/K8s). I can generate a vendor-ready checklist and a Slurm job that auto-collects Nsight & NCCL traces.

admin October 9, 2025 HPC, LinuxNo Comments »

Nick Tailor's Technical Blog

A detail-minded individual, combining strong technical understanding and communication skills with experiences in Systems Administration, Engineering, Automation, AI Automation and Solutions; a proven methodical problem solver.