Category: HPC

Slurm Job: Cluster Sampler & Diagnostics (One-Click)

This job collects GPU/CPU, memory, NUMA, PCIe/NVLink, NIC/IB, and optional Nsight/NCCL/iperf3 telemetry across all allocated nodes while your workload runs, then bundles everything into a single .tgz.

Usage: Save as profile_env.slurm and submit:
sbatch --export=ALL,WORKLOAD="torchrun --nproc_per_node=8 train.py --cfg config.yaml",ENABLE_NSYS=1,RUN_NCCL_TESTS=1,DURATION=1800 profile_env.slurm

Prefer a direct file? You can also grab the ready-made script: Download profile_env.slurm

Profiling Playbook: Detect GPU/CPU, Memory Bandwidth, and Network Bottlenecks

A practical, repeatable workflow for NVIDIA-GPU Linux clusters (Slurm/K8s or bare-metal) to pinpoint whether your bottleneck is GPU, CPU, memory bandwidth, or network.

0) Prep: Make the Test Reproducible

  • Choose a workload: (a) your real training/inference job, plus (b) a couple of microbenchmarks.
  • Pin placement/affinity: match production (same container, CUDA/cuDNN, drivers, env vars, GPU/CPU affinity).
  • Record node info: driver, CUDA, GPU model, CPU model, NUMA, NIC, topology.
nvidia-smi; nvidia-smi topo -m
lscpu; numactl --hardware
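
To keep that inventory alongside your profiling artifacts, a minimal capture sketch (the output path and chosen fields are illustrative):

# Snapshot node details into one timestamped log
OUT=/tmp/node_snapshot_$(hostname)_$(date +%Y%m%d_%H%M%S).log
{
  echo "== GPU ==";   nvidia-smi; nvidia-smi topo -m
  echo "== CPU ==";   lscpu
  echo "== NUMA ==";  numactl --hardware
  echo "== Kernel/driver =="; uname -r; cat /proc/driver/nvidia/version 2>/dev/null
} > "$OUT" 2>&1
echo "Wrote $OUT"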

1) GPU Profiling (Utilization, Kernels, Memory, Interconnect)

Quick Live View (low overhead)

# 1 s sampling: power/temp (p), utilization (u), clocks (c), violations (v), memory usage (m), ECC/PCIe-replay errors (e), PCIe throughput (t)
nvidia-smi dmon -s pucvmet

# More fields, CSV:
nvidia-smi --query-gpu=index,name,utilization.gpu,utilization.memory,clocks.sm,clocks.mem,power.draw,temperature.gpu,pcie.link.gen.current,pcie.link.width.current,clocks_throttle_reasons.active --format=csv -l 1
What to notice
  • utilization.gpu ~ 0–40% while job is “busy” → likely CPU or input (I/O) bound.
  • High memory util + low SM util → global memory bandwidth bound.
  • Power below expected / throttling active → power/thermal cap or app clocks.
  • PCIe gen/width lower than expected → host-device transfer bottleneck.
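
If you want a quick pass/fail on those heuristics, a small sketch that samples for about a minute and counts low-utilization intervals (the 40% threshold is arbitrary):

# Sample GPU utilization for ~60 s, then count samples under 40%
timeout 60 nvidia-smi --query-gpu=index,utilization.gpu,utilization.memory \
  --format=csv,noheader,nounits -l 1 > /tmp/gpu_util.csv || true
awk -F', ' '$2 < 40 {low++} END {printf "low-util samples: %d of %d\n", low, NR}' /tmp/gpu_util.csv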

Deep Timeline (Nsight Systems → find where time is spent)

nsys profile -t cuda,osrt,nvtx,mpi --sample=process-tree -o /tmp/trace \
    --export=sqlite python train.py
# Open /tmp/trace.nsys-rep (older nsys versions: .qdrep) in the Nsight Systems GUI, or analyze the sqlite export
Look for:
  • Long CPU gaps before kernels → dataloader/CPU stall.
  • CUDA memcpy / NCCL all-reduce dominating → I/O or network bottleneck.
  • Many short kernels with gaps → kernel launch overhead (try CUDA Graphs).
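
You can also summarize the capture from the CLI without opening the GUI; a sketch, noting that report names vary across nsys versions and the .sqlite file comes from the --export flag above:

# Per-kernel and memcpy/memset time summaries straight from the report file
nsys stats --report cuda_gpu_kern_sum,cuda_gpu_mem_time_sum /tmp/trace.nsys-rep
# The sqlite export can be queried directly for custom analysis
sqlite3 /tmp/trace.sqlite "SELECT name FROM sqlite_master WHERE type='table' LIMIT 20;"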

Kernel Efficiency (Nsight Compute → why GPU is slow)

ncu --set full --target-processes all -o /tmp/ncu python train.py
# Then: ncu --import /tmp/ncu.ncu-rep --csv --page summary
Signals:
  • Low achieved SM occupancy plus high dram__throughput relative to arithmetic intensity → memory-bound kernels.
  • High barrier/serialization → reformulate kernels or change backend.
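
A lighter-weight alternative to --set full is to collect only the speed-of-light throughput metrics and compare them; a sketch (metric names are taken from recent Nsight Compute versions):

# DRAM% >> SM%  -> memory-bound; SM% >> DRAM%  -> compute-bound
ncu --metrics sm__throughput.avg.pct_of_peak_sustained_elapsed,dram__throughput.avg.pct_of_peak_sustained_elapsed \
    --target-processes all --csv python train.py > /tmp/ncu_sol.csv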

NVLink / PCIe Health

# NVLink counters (A100+/NVSwitch)
nvidia-smi nvlink -s
# Topology sanity:
nvidia-smi topo -m

If inter-GPU traffic stalls or retry errors climb, expect intra-node comms bottlenecks.

2) CPU & Memory-Bandwidth Profiling (Host Side)

Fast CPU View

mpstat -P ALL 1
pidstat -u -r -d 1 -p $(pgrep -n python)   # CPU, RSS, I/O per PID

High CPU% & run queue + GPU idle → CPU compute bound (augmentations, tokenization).
Low CPU% & waiting on I/O + GPU idle → storage or network input bottleneck.

NUMA Locality (critical for feeders/data loaders)

numactl -s
numastat -p $(pgrep -n python)  # remote vs local memory hits

Many remote hits → pin processes to closest NUMA node; bind NIC/GPU affinity.
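
A minimal pinning sketch, assuming the GPU and NIC you feed sit on NUMA node 0 (check nvidia-smi topo -m and lscpu for the real mapping):

# Run the feeder/training process with CPU and memory bound to the local NUMA node
numactl --cpunodebind=0 --membind=0 python train.py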

Hardware Counters (perf) & Memory Bandwidth

# Whole process counters
perf stat -d -p $(pgrep -n python) -- sleep 30

# Hotspots (then open interactive report)
perf record -F 99 -g -p $(pgrep -n python) -- sleep 30
perf report

Low IPC + many L3/mem stalls → memory bandwidth bound on CPU. Validate with STREAM / Intel PCM:

# STREAM (approximate host RAM BW)
stream
# Intel PCM memory (Intel CPUs)
pcm-memory 1
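
If your site does not ship a STREAM binary, a build-and-run sketch (array size should comfortably exceed the last-level cache; URL and size are illustrative):

wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c
gcc -O3 -fopenmp -DSTREAM_ARRAY_SIZE=80000000 stream.c -o stream
OMP_NUM_THREADS=$(nproc) ./stream   # compare Triad GB/s against the platform's rated bandwidth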

3) Network Throughput/Latency (Intra & Inter-node)

Raw NIC Performance

# TCP test (adjust -P for parallel flows)
iperf3 -s   # on server
iperf3 -c <server> -P 8 -t 30
# For UDP or specific MTU/Jumbo: use -u and set mtu via ip link/ethtool

Compare results to NIC line-rate (e.g., 100/200/400GbE).

RDMA / InfiniBand (if applicable)

ibstat; ibv_devinfo
ib_write_bw -d mlx5_0 -F -q 4 -l 512 -s 8388608 -D 30
ib_send_bw  -d mlx5_0 -F -q 4 -l 512 -s 8388608 -D 30

If RDMA BW/latency is poor, check PFC/ECN, RoCE config, and mtu 9000 end-to-end.
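
Latency matters as much as bandwidth for small collectives; a companion sketch using the same perftest suite (the device name is an example):

ib_write_lat -d mlx5_0 -F             # on server
ib_write_lat -d mlx5_0 -F <server>    # on client; expect single-digit microseconds on healthy IB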

Collective (NCCL) Reality Check

# From nccl-tests (build once)
./build/all_reduce_perf -b 8M -e 1G -f 2 -g 8   # intra-node
# Multi-node (via mpirun or torchrun)

Throughput far below expectation → network path/topology, or NCCL env (e.g., NCCL_IB, NCCL_NET_GDR_LEVEL, CollNet/NVLS).
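
A sketch of the NCCL environment knobs referenced above, exported before launching the benchmark or the real job (interface and HCA names are examples; the right values depend on your fabric):

export NCCL_DEBUG=INFO            # log chosen transports/rings at startup
export NCCL_SOCKET_IFNAME=ib0     # keep bootstrap/socket traffic on the IB interface
export NCCL_IB_HCA=mlx5_0         # select the HCA(s) NCCL may use
export NCCL_NET_GDR_LEVEL=PHB     # allow GPUDirect RDMA up to the PCIe host bridge
mpirun -np 16 -H node1:8,node2:8 ./build/all_reduce_perf -b 8M -e 1G -f 2 -g 1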

NIC Counters / Driver

ethtool -S <iface> | egrep "err|drop|disc|pause"
ethtool -k <iface>   # offloads; ensure GRO/LRO settings suit your stack

Growing errors/pause frames → congestion, bad optics, or flow-control tuning.
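
To see whether those counters are still climbing while the job runs, a quick watch sketch:

watch -d -n 2 "ethtool -S <iface> | egrep 'err|drop|disc|pause'"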

4) Tie It Together with a Roofline View

Compute intensity (FLOPs/byte) vs achieved bandwidth quickly classifies memory-bound vs compute-bound. Use Nsight Compute’s roofline page for kernels; for end-to-end, annotate steps with NVTX and view in Nsight Systems.

5) Microbenchmarks to Isolate Layers

  • GPU math: HPL/HPL-AI, cuBLAS GEMM runner, nvidia/cuda-samples (matrixMulCUBLAS).
  • Host RAM BW: STREAM.
  • Disk I/O: fio (sequential vs random, queue depth).
  • Network: iperf3, ib_*_bw, NCCL tests.
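
For the disk layer, an fio sketch that roughly mimics a sequential dataloader read pattern (paths, sizes, and depths are illustrative):

fio --name=seqread --rw=read --bs=1M --iodepth=32 --numjobs=4 \
    --direct=1 --size=4G --runtime=30 --time_based --group_reporting \
    --directory=/path/to/dataset_volume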

If microbenchmarks are fine but the real job isn’t, the issue is software pipeline (dataloader, preprocessing, small batch, Python GIL, etc.).

6) Common Bottlenecks → Fixes

Symptom                     | Likely Bottleneck | Quick Fixes
GPU util low, CPU busy      | CPU pipeline      | Increase workers/prefetch, move aug to GPU (DALI), compile ops, pin threads/NUMA.
High GPU mem util, SM low   | GPU memory-bound  | Fuse kernels, better tensor layouts, mixed precision (bf16/fp16), larger batch if headroom.
NCCL all-reduce dominates   | Network           | Enable RDMA, tune NCCL env, jumbo MTU 9000, keep same switch tier, test CollNet/NVLS.
memcpy HtoD heavy           | PCIe/host I/O     | Page-locked buffers, async prefetch, larger batch queue, ensure max PCIe gen/width.
Frequent GPU throttling     | Power/thermal     | Raise power limit (if safe), fix cooling, set application clocks, check throttle reasons.
Remote NUMA hits high       | NUMA              | Bind processes to the NUMA node local to the GPU/NIC, interleave wisely.

7) Optional: One-Node Sampler Script

Paste into profile.sh and run bash profile.sh python train.py.

#!/usr/bin/env bash
set -euo pipefail
APP="$@"  # e.g., python train.py

echo "== System =="
nvidia-smi --query-gpu=name,uuid,driver_version,pstate,pcie.link.gen.current,pcie.link.width.current --format=csv
lscpu | egrep 'Model name|Socket|NUMA|Thread|MHz'
echo

echo "== Start background samplers =="
(nvidia-smi dmon -s pucvmet -d 1 > /tmp/gpu_dmon.log) &
GPU_DMON_PID=$!
(pidstat -u -r -d 1 > /tmp/pidstat.log) &
PIDSTAT_PID=$!

echo "== Run workload =="
"${APP[@]}" || true

echo "== Cleanup =="
kill $GPU_DMON_PID $PIDSTAT_PID 2>/dev/null || true

echo "== Summaries =="
tail -n +1 /tmp/gpu_dmon.log | head
tail -n 20 /tmp/gpu_dmon.log
tail -n 20 /tmp/pidstat.log

8) HPE-Specific Checks (If Relevant)

  • HPE iLO/OneView: check thermal/power capping, fan curves, PSU headroom.
  • HPE Performance Cluster Manager / Cray: use built-in telemetry and fabric diagnostics.
  • BIOS: Performance power profile, NUMA exposed, deterministic turbo, PCIe Gen4/Gen5, Above 4G decoding on, SR-IOV/ATS if virtualized.
Need a tailored version? Tell me your GPU model(s), CPUs, NIC/fabric, batch size/model, and orchestration (Slurm/K8s). I can generate a vendor-ready checklist and a Slurm job that auto-collects Nsight & NCCL traces.

Deploying SLURM with Slinky: Bridging HPC and Kubernetes for Container Workloads

High-Performance Computing (HPC) environments are evolving rapidly, and the need to integrate traditional HPC job schedulers with modern containerized infrastructure has never been greater. Enter Slinky – SchedMD’s official project that seamlessly integrates SLURM with Kubernetes, enabling you to run containerized workloads through SLURM’s powerful scheduling capabilities.

In this comprehensive guide, we’ll walk through deploying SLURM using Slinky with Docker container support, bringing together the best of both HPC and cloud-native worlds.

What is Slinky?

Slinky is a toolbox of components developed by SchedMD (the creators of SLURM) to integrate SLURM with Kubernetes. Unlike traditional approaches that force users to change how they interact with SLURM, Slinky preserves the familiar SLURM user experience while adding powerful container orchestration capabilities.

Key Components:

  • Slurm Operator – Manages SLURM clusters as Kubernetes resources
  • Container Support – Native OCI container execution through SLURM
  • Auto-scaling – Dynamic resource allocation based on workload demand
  • Slurm Bridge – Converged workload scheduling and prioritization
Why Slinky Matters: Slinky enables simultaneous management of HPC workloads using SLURM and containerized applications via Kubernetes on the same infrastructure, making it ideal for organizations running AI/ML training, scientific simulations, and cloud-native applications.

Prerequisites and Environment Setup

Before we begin, ensure you have a working Kubernetes cluster with the following requirements:

  • Kubernetes 1.24+ cluster with admin access
  • Helm 3.x installed
  • kubectl configured and connected to your cluster
  • Sufficient cluster resources (minimum 4 CPU cores, 8GB RAM)

Step 1: Install Required Dependencies

Slinky requires several prerequisite components. Let’s install them using Helm:

# Add required Helm repositories
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo add metrics-server https://kubernetes-sigs.github.io/metrics-server/
helm repo add bitnami https://charts.bitnami.com/bitnami
helm repo add jetstack https://charts.jetstack.io
helm repo update

# Install cert-manager for TLS certificate management
helm install cert-manager jetstack/cert-manager \
  --namespace cert-manager --create-namespace --set crds.enabled=true

# Install Prometheus stack for monitoring
helm install prometheus prometheus-community/kube-prometheus-stack \
  --namespace prometheus --create-namespace --set installCRDs=true

Wait for all pods to be running before proceeding:

# Verify installations
kubectl get pods -n cert-manager
kubectl get pods -n prometheus

Step 2: Deploy the Slinky SLURM Operator

Now we’ll install the core Slinky operator that manages SLURM clusters within Kubernetes:

# Download the default configuration
curl -L https://raw.githubusercontent.com/SlinkyProject/slurm-operator/refs/tags/v0.2.1/helm/slurm-operator/values.yaml \
  -o values-operator.yaml

# Install the Slurm Operator
helm install slurm-operator oci://ghcr.io/slinkyproject/charts/slurm-operator \
  --values=values-operator.yaml --version=0.2.1 \
  --namespace=slinky --create-namespace

Verify the operator is running:

kubectl get pods -n slinky
# Expected output: slurm-operator pod in Running status

Step 3: Configure Container Support

Before deploying the SLURM cluster, let’s configure it for container support. Download and modify the SLURM configuration:

# Download SLURM cluster configuration
curl -L https://raw.githubusercontent.com/SlinkyProject/slurm-operator/refs/tags/v0.2.1/helm/slurm/values.yaml \
  -o values-slurm.yaml

Edit values-slurm.yaml to enable container support:

# Add container configuration to values-slurm.yaml
controller:
  config:
    slurm.conf: |
      # Basic cluster configuration
      ClusterName=slinky-cluster
      ControlMachine=slurm-controller-0
      
      # Enable container support
      ProctrackType=proctrack/cgroup
      TaskPlugin=task/cgroup,task/affinity
      PluginDir=/usr/lib64/slurm
      
      # Authentication
      AuthType=auth/munge
      
      # Node configuration
      NodeName=slurm-compute-debug-[0-9] CPUs=4 Boards=1 SocketsPerBoard=1 CoresPerSocket=2 ThreadsPerCore=2 State=UNKNOWN
      PartitionName=debug Nodes=slurm-compute-debug-[0-9] Default=YES MaxTime=INFINITE State=UP
      
      # Accounting
      AccountingStorageType=accounting_storage/slurmdbd
      AccountingStorageHost=slurm-accounting-0

compute:
  config:
    oci.conf: |
      # OCI container runtime configuration
      RunTimeQuery="runc --version"
      RunTimeCreate="runc create %n.%u %b"
      RunTimeStart="runc start %n.%u"
      RunTimeKill="runc kill --all %n.%u SIGTERM"
      RunTimeDelete="runc delete --force %n.%u"
      
      # Security and patterns
      OCIPattern="^[a-zA-Z0-9][a-zA-Z0-9_.-]*$"
      CreateEnvFile="/tmp/slurm-oci-create-env-%j.%u.%t.tmp"
      RunTimeEnvExclude="HOME,PATH,LD_LIBRARY_PATH"

Step 4: Deploy the SLURM Cluster

Now deploy the SLURM cluster with container support enabled:

# Deploy SLURM cluster
helm install slurm oci://ghcr.io/slinkyproject/charts/slurm \
  --values=values-slurm.yaml --version=0.2.1 \
  --namespace=slurm --create-namespace

Monitor the deployment progress:

# Watch pods come online
kubectl get pods -n slurm -w

# Expected pods:
# slurm-accounting-0      1/1     Running
# slurm-compute-debug-0   1/1     Running  
# slurm-controller-0      2/2     Running
# slurm-exporter-xxx      1/1     Running
# slurm-login-xxx         1/1     Running
# slurm-mariadb-0         1/1     Running
# slurm-restapi-xxx       1/1     Running

Step 5: Access and Test the SLURM Cluster

Once all pods are running, connect to the SLURM login node:

# Get login node IP address
SLURM_LOGIN_IP="$(kubectl get services -n slurm -l app.kubernetes.io/instance=slurm,app.kubernetes.io/name=login -o jsonpath="{.items[0].status.loadBalancer.ingress[0].ip}")"

# SSH to login node (default port 2222)
ssh -p 2222 root@${SLURM_LOGIN_IP}

If you don’t have LoadBalancer support, use port-forwarding:

# Port forward to login pod
kubectl port-forward -n slurm service/slurm-login 2222:2222

# Connect via localhost
ssh -p 2222 root@localhost

Step 6: Running Container Jobs

Now for the exciting part – running containerized workloads through SLURM!

Basic Container Job

Create a simple container job script:

# Create a container job script
cat > container_test.sh << EOF
#!/bin/bash
#SBATCH --job-name=container-hello
#SBATCH --ntasks=1
#SBATCH --time=00:05:00
#SBATCH --container=docker://alpine:latest

echo "Hello from containerized SLURM job!"
echo "Running on node: \$(hostname)"
echo "Job ID: \$SLURM_JOB_ID"
echo "Container OS: \$(cat /etc/os-release | grep PRETTY_NAME)"
EOF

# Submit the job
sbatch container_test.sh

# Check job status
squeue

Interactive Container Sessions

Run containers interactively using srun:

# Interactive Ubuntu container
srun --container=docker://ubuntu:20.04 /bin/bash

# Quick command in Alpine container
srun --container=docker://alpine:latest /bin/sh -c "echo 'Container execution successful'; uname -a"

# Python data science container
srun --container=docker://python:3.9 python -c "import sys; print(f'Python {sys.version} running in container')"

GPU Container Jobs

If your cluster has GPU nodes, you can run GPU-accelerated containers:

# GPU container job
cat > gpu_container.sh << EOF
#!/bin/bash
#SBATCH --job-name=gpu-test
#SBATCH --gres=gpu:1
#SBATCH --container=docker://nvidia/cuda:11.0.3-devel-ubuntu20.04

nvidia-smi
nvcc --version   # nvcc ships in the -devel CUDA images, not in -runtime
EOF

sbatch gpu_container.sh

MPI Container Jobs

Run parallel MPI applications in containers:

# MPI container job
cat > mpi_container.sh << EOF
#!/bin/bash
#SBATCH --job-name=mpi-test
#SBATCH --ntasks=4
#SBATCH --container=docker://mpirun/openmpi:latest

mpirun -np \$SLURM_NTASKS hostname
EOF

sbatch mpi_container.sh

Step 7: Monitoring and Auto-scaling

Monitor Cluster Health

Check SLURM cluster status from the login node:

# Check node status
sinfo

# Check running jobs
squeue

# Check cluster configuration
scontrol show config | grep -i container

Kubernetes Monitoring

Monitor from the Kubernetes side:

# Check pod resource usage
kubectl top pods -n slurm

# View SLURM operator logs
kubectl logs -n slinky deployment/slurm-operator

# Check custom resources
kubectl get clusters.slinky.slurm.net -n slurm
kubectl get nodesets.slinky.slurm.net -n slurm

Configure Auto-scaling

Enable auto-scaling by updating your values file:

# Add to values-slurm.yaml
compute:
  autoscaling:
    enabled: true
    minReplicas: 1
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70

# Update the deployment
helm upgrade slurm oci://ghcr.io/slinkyproject/charts/slurm \
  --values=values-slurm.yaml --version=0.2.1 \
  --namespace=slurm

Advanced Configuration Tips

Custom Container Runtimes

Configure alternative container runtimes like Podman:

# Alternative oci.conf for Podman
compute:
  config:
    oci.conf: |
      # Podman runtime configuration
      RunTimeQuery="podman --version"
      RunTimeRun="podman run --rm --cgroups=disabled --name=%n.%u %m %c"
      
      # Security settings
      OCIPattern="^[a-zA-Z0-9][a-zA-Z0-9_.-]*$"
      CreateEnvFile="/tmp/slurm-oci-create-env-%j.%u.%t.tmp"

Persistent Storage for Containers

Configure persistent volumes for containerized jobs:

# Add persistent volume support
compute:
  persistence:
    enabled: true
    storageClass: "fast-ssd"
    size: "100Gi"
    mountPath: "/shared"

Troubleshooting Common Issues

Container Runtime Not Found

If you encounter container runtime errors:

# Check runtime availability on compute nodes
kubectl exec -n slurm slurm-compute-debug-0 -- which runc
kubectl exec -n slurm slurm-compute-debug-0 -- runc --version

# Verify oci.conf is properly mounted
kubectl exec -n slurm slurm-compute-debug-0 -- cat /etc/slurm/oci.conf

Job Submission Failures

Debug job submission issues:

# Check SLURM logs
kubectl logs -n slurm slurm-controller-0 -c slurmctld

# Verify container image availability
srun --container=docker://alpine:latest /bin/echo "Container test"

# Check job details
scontrol show job <job_id>

Conclusion

Slinky represents a significant step forward in bridging the gap between traditional HPC and modern cloud-native infrastructure. By deploying SLURM with Slinky, you get:

  • Unified Infrastructure - Run both SLURM and Kubernetes workloads on the same cluster
  • Container Support - Native OCI container execution through familiar SLURM commands
  • Auto-scaling - Dynamic resource allocation based on workload demand
  • Cloud Native - Standard Kubernetes deployment and management patterns
  • Preserved Workflow - Keep existing SLURM scripts and user experience

This powerful combination enables organizations to modernize their HPC infrastructure while maintaining the robust scheduling and resource management capabilities that SLURM is known for. Whether you're running AI/ML training workloads, scientific simulations, or data processing pipelines, Slinky provides the flexibility to containerize your applications without sacrificing the control and efficiency of SLURM.

Next Steps: Consider exploring Slinky's advanced features like custom schedulers, resource quotas, and integration with cloud provider auto-scaling groups to further optimize your HPC container workloads.

Ready to get started? The Slinky project is open-source and available on GitHub. Visit the SlinkyProject GitHub organization for the latest documentation and releases.

Key Components for Setting Up an HPC Cluster

Head Node (Controller)

 Manages job scheduling and resource allocation.
 Runs Slurm Controller Daemon (`slurmctld`).

Compute Nodes

 Execute computational tasks.
 Run Slurm Node Daemon (`slurmd`).
 Configured for CPUs, GPUs, or specialized hardware.

Networking

 High-speed interconnect like Infiniband or Ethernet.
 Ensures fast communication between nodes.

Storage

 Centralized storage like NFS, Lustre, or BeeGFS.
 Provides shared file access for all nodes.

Authentication

 Use Munge for secure communication between Slurm components.

Scheduler

 Slurm for job scheduling and resource management.
 Configured with partitions and node definitions.

Resource Management

 Use cgroups to control CPU, memory, and GPU usage.
 Optional: ProctrackType=cgroup in Slurm.

Parallel File System (Optional)

 High-performance shared storage for parallel workloads.
 Examples: Lustre, GPFS.

Interconnect Libraries

 MPI (Message Passing Interface) for distributed computing.
 Install libraries like OpenMPI or MPICH.

Monitoring and Debugging Tools

 Tools like Prometheus, Grafana, or Ganglia for resource monitoring.
 Enable verbose logging in Slurm for debugging.

How to configure Slurm Controller Node on Ubuntu 22.04

How to setup HPC-Slurm Controller Node

Refer to Key Components for Setting Up an HPC Cluster (above) to see which pieces you need to set up.

This guide provides step-by-step instructions for setting up the Slurm controller daemon (`slurmctld`) on Ubuntu 22.04. It also includes common errors encountered during the setup process and how to resolve them.

Step 1: Install Prerequisites

To begin, install the required dependencies for Slurm and its components:

sudo apt update && sudo apt upgrade -y
sudo apt install -y munge libmunge-dev libmunge2 build-essential man-db mariadb-server mariadb-client libmariadb-dev python3 python3-pip chrony

Step 2: Configure Munge (Authentication for slurm)

Munge is required for authentication within the Slurm cluster.

1. Generate a Munge key on the controller node:
sudo create-munge-key

2. Copy the key to all compute nodes:
scp /etc/munge/munge.key user@node:/etc/munge/

3. Start the Munge service:
sudo systemctl enable --now munge
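
To confirm Munge works before moving on, a quick check (replace `node` with a real compute node; the credential should decode with STATUS: Success):

munge -n | unmunge              # local round-trip
munge -n | ssh node unmunge     # cross-node check using the copied key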

Step 3: Install Slurm

1. Download and compile Slurm:
wget https://download.schedmd.com/slurm/slurm-23.02.4.tar.bz2
tar -xvjf slurm-23.02.4.tar.bz2
cd slurm-23.02.4
./configure --prefix=/usr/local/slurm --sysconfdir=/etc/slurm
make -j$(nproc)
sudo make install

2. Add the Slurm user (so the directories below can be owned by it):
sudo useradd -m slurm

3. Create necessary directories and set permissions:
sudo mkdir -p /etc/slurm /var/spool/slurm /var/log/slurm
sudo chown slurm: /var/spool/slurm /var/log/slurm
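
4. (Optional) Create a systemd unit for `slurmctld` if `make install` did not provide one; a minimal sketch assuming the `--prefix` used above (adjust paths to your install):

sudo tee /etc/systemd/system/slurmctld.service >/dev/null <<'EOF'
[Unit]
Description=Slurm controller daemon
After=network.target munge.service

[Service]
Type=simple
User=slurm
ExecStart=/usr/local/slurm/sbin/slurmctld -D
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload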

Step 4: Configure Slurm (for more complex configurations, contact Nick Tailor)

1. Generate a basic `slurm.conf` using the configurator tool at
https://slurm.schedmd.com/configurator.html. Save the configuration to `/etc/slurm/slurm.conf`.

# Basic Slurm Configuration
ClusterName=my_cluster
ControlMachine=slurmctld            # Replace with your control node's hostname
# BackupController=backup-slurmctld # Uncomment and replace if you have a backup controller

# Authentication
AuthType=auth/munge
CryptoType=crypto/munge

# Logging
SlurmdLogFile=/var/log/slurm/slurmd.log
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmctldDebug=info
SlurmdDebug=info

# Slurm User
SlurmUser=slurm
StateSaveLocation=/var/spool/slurm
SlurmdSpoolDir=/var/spool/slurmd

# Scheduler
SchedulerType=sched/backfill
SchedulerParameters=bf_continue

# Accounting
AccountingStorageType=accounting_storage/none
JobAcctGatherType=jobacct_gather/linux

# Compute Nodes
NodeName=node[1-2] CPUs=4 RealMemory=8192 State=UNKNOWN
PartitionName=debug Nodes=node[1-2] Default=YES MaxTime=INFINITE State=UP
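
To get accurate hardware values for the Compute Nodes section, you can print them from each compute node and paste the result into `slurm.conf` (the path assumes the `--prefix` used above):

/usr/local/slurm/sbin/slurmd -C
# Prints a NodeName=... line with the detected CPUs, sockets, cores, threads, and RealMemory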

2. Distribute `slurm.conf` to all compute nodes:
scp /etc/slurm/slurm.conf user@node:/etc/slurm/

3. Restart Slurm services:
sudo systemctl restart slurmctld
sudo systemctl restart slurmd

Troubleshooting Common Errors

root@slrmcltd:~# tail /var/log/slurm/slurmctld.log
[2024-12-06T11:57:25.428] error: High latency for 1000 calls to gettimeofday(): 20012 microseconds
[2024-12-06T11:57:25.431] fatal: mkdir(/var/spool/slurm): Permission denied
[2024-12-06T11:58:34.862] error: High latency for 1000 calls to gettimeofday(): 20029 microseconds
[2024-12-06T11:58:34.864] fatal: mkdir(/var/spool/slurm): Permission denied
[2024-12-06T11:59:38.843] error: High latency for 1000 calls to gettimeofday(): 18842 microseconds
[2024-12-06T11:59:38.847] fatal: mkdir(/var/spool/slurm): Permission denied

Error: Permission Denied for /var/spool/slurm

This error occurs when the `slurm` user does not have the correct permissions to access the directory.

Fix:
sudo mkdir -p /var/spool/slurm
sudo chown -R slurm: /var/spool/slurm
sudo chmod -R 755 /var/spool/slurm

Error: Temporary Failure in Name Resolution

Slurm could not resolve the hostname `slurmctld`. This can be fixed by updating `/etc/hosts`:

1. Edit `/etc/hosts` and add the following:
127.0.0.1      slurmctld
192.168.20.8   slurmctld

2. Verify the hostname matches `ControlMachine` in `/etc/slurm/slurm.conf`.

3. Restart networking and test hostname resolution:
sudo systemctl restart systemd-networkd
ping slurmctld

Error: High Latency for gettimeofday()

Dec 06 11:57:25 slrmcltd.home systemd[1]: Started Slurm controller daemon.
Dec 06 11:57:25 slrmcltd.home slurmctld[2619]: slurmctld: error: High latency for 1000 calls to gettimeofday(): 20012 microseconds
Dec 06 11:57:25 slrmcltd.home systemd[1]: slurmctld.service: Main process exited, code=exited, status=1/FAILURE
Dec 06 11:57:25 slrmcltd.home systemd[1]: slurmctld.service: Failed with result 'exit-code'.

This warning typically indicates timing issues in the system.

Fixes:

1. Install and configure `chrony` for time synchronization:
sudo apt install chrony
sudo systemctl enable --now chrony
chronyc tracking
timedatectl

2. For virtualized environments, optimize the clocksource:
echo tsc | sudo tee /sys/devices/system/clocksource/clocksource0/current_clocksource

3. Disable high-precision timing in `slurm.conf` (optional):
HighPrecisionTimer=NO
sudo systemctl restart slurmctld

Step 5: Verify and Test the Setup

1. Validate the configuration:
scontrol reconfigure
If it completes without errors, the configuration is working. If not, check the connection between nodes and update /etc/hosts so every host is listed on all machines and nodes.

2. Check node and partition status:
sinfo

root@slrmcltd:/etc/slurm# sinfo
PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
debug*       up   infinite      1  idle* node1

3. Monitor logs for errors:
sudo tail -f /var/log/slurm/slurmctld.log


Written By: Nick Tailor

Deploying Lustre File System with RDMA, Node Maps, and ACLs

Lustre is the de facto parallel file system for high-performance computing (HPC) clusters, providing extreme scalability, high throughput, and low-latency access across thousands of nodes. This guide walks through a complete deployment of Lustre using RDMA over InfiniBand for performance, along with Node Maps for client access control and ACLs for fine-grained permissions.


1. Understanding the Lustre Architecture

Lustre separates metadata and data services into distinct roles:

  • MGS (Management Server) – Manages Lustre configuration and coordinates cluster services.
  • MDT (Metadata Target) – Stores file system metadata (names, permissions, directories).
  • OST (Object Storage Target) – Stores file data blocks.
  • Clients – Mount and access the Lustre file system for I/O.

The typical architecture looks like this:

+-------------+        +-------------+
|   Client 1  |        |   Client 2  |
| /mnt/lustre |        | /mnt/lustre |
+------+------+        +------+------+
       |                        |
       +--------o2ib RDMA-------+
                |
        +-------+-------+
        |     OSS/OST    |
        |   (Data I/O)   |
        +-------+-------+
                |
        +-------+-------+
        |     MGS/MDT    |
        |  (Metadata)    |
        +---------------+

2. Prerequisites and Environment

Component       | Requirements
OS              | RHEL / Rocky / AlmaLinux 8.x or higher
Kernel          | Built with Lustre and OFED RDMA modules
Network         | InfiniBand fabric (Mellanox or compatible)
Lustre Version  | 2.14 or later
Devices         | Separate block devices for MDT, OST(s), and client mount

3. Install Lustre Packages

On MGS, MDT, and OSS Nodes:

dnf install -y lustre kmod-lustre lustre-osd-ldiskfs

On Client Nodes:

dnf install -y lustre-client kmod-lustre-client

4. Configure InfiniBand and RDMA (o2ib)

InfiniBand provides the lowest latency for Lustre communication via RDMA. Configure the o2ib network type for Lustre.

1. Install and verify InfiniBand stack

dnf install -y rdma-core infiniband-diags perftest libibverbs-utils
systemctl enable --now rdma
ibstat

2. Configure IB network

nmcli con add type infiniband ifname ib0 con-name ib0 ip4 10.0.0.1/24
nmcli con up ib0

3. Verify RDMA link

ibv_devinfo
ibv_rc_pingpong -d mlx5_0

4. Configure LNET for o2ib

Create /etc/modprobe.d/lustre.conf with:

options lnet networks="o2ib(ib0)"
modprobe lnet
lnetctl lnet configure
lnetctl net add --net o2ib --if ib0
lnetctl net show

Expected output:

net:
  - net type: o2ib
    interfaces:
      0: ib0
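
Before formatting any targets, it is worth confirming LNET reachability between nodes over o2ib; a quick sketch (the NID is an example from the IB addressing above):

lnetctl ping 10.0.0.1@o2ib     # run from an OSS or client toward the MGS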

5. Format and Mount Lustre Targets

Metadata Server (MGS + MDT)

mkfs.lustre --fsname=lustrefs --mgs --mdt --index=0 /dev/sdb
mount -t lustre /dev/sdb /mnt/mdt

Object Storage Server (OSS)

mkfs.lustre --fsname=lustrefs --ost --index=0 --mgsnode=<MGS>@o2ib /dev/sdc
mount -t lustre /dev/sdc /mnt/ost
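
Additional OSTs follow the same pattern with the next free index (the device name below is an example); the client output later in this guide shows two OSTs:

mkfs.lustre --fsname=lustrefs --ost --index=1 --mgsnode=<MGS>@o2ib /dev/sdd
mount -t lustre /dev/sdd /mnt/ost1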

Client Node

sudo mkdir -p /mnt/lustre
sudo mount -t lustre <MGS>@o2ib:/lustrefs /mnt/lustre

# Example with an explicit MGS NID:
sudo mount -t lustre 172.16.0.10@o2ib:/lustrefs /mnt/lustre

Example without an IB network (TCP LNET):
[root@vbox ~]# mount -t lustre 172.16.0.10@tcp:/lustre /mnt/lustre-client
[root@vbox ~]# 
[root@vbox ~]# # Verify the mount worked
[root@vbox ~]# df -h /mnt/lustre-client
Filesystem                Size  Used Avail Use% Mounted on
172.16.0.10@tcp:/lustre   12G  2.5M   11G   1% /mnt/lustre-client
[root@vbox ~]# lfs df -h
UUID                       bytes        Used   Available Use% Mounted on
lustre-MDT0000_UUID         4.5G        1.9M        4.1G   1% /mnt/lustre-client[MDT:0]
lustre-OST0000_UUID         7.5G        1.2M        7.0G   1% /mnt/lustre-client[OST:0]
lustre-OST0001_UUID         3.9G        1.2M        3.7G   1% /mnt/lustre-client[OST:1]
filesystem_summary:        11.4G        2.4M       10.7G   1% /mnt/lustre-client
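
Once the client mount is up, you can control how files are striped across OSTs; a minimal sketch using the guide's /mnt/lustre mount (the directory name is an example):

mkdir -p /mnt/lustre/projects
lfs setstripe -c -1 /mnt/lustre/projects    # stripe new files in this directory across all OSTs
lfs getstripe /mnt/lustre/projects          # confirm the layout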

6. Configuring Node Maps (Access Control)

Node maps allow administrators to restrict Lustre client access based on network or host identity.

1. View current node maps

lctl nodemap_list

2. Create a new node map for trusted clients

lctl nodemap_add trusted_clients

3. Add allowed network range or host

lctl nodemap_add_range trusted_clients 10.0.0.0/24

4. Enable enforcement

lctl set_param nodemap.trusted_clients.admin=1
lctl set_param nodemap.trusted_clients.trust_client_ids=1

5. Restrict default map

lctl set_param nodemap.default.reject_unauthenticated=1

This ensures only IPs in 10.0.0.0/24 can mount and access the Lustre filesystem.


7. Configuring Access Control Lists (ACLs)

Lustre supports standard POSIX ACLs for fine-grained directory and file permissions.

1. Enable ACL support on mount

mount -t lustre -o acl <MGS>@o2ib:/lustrefs /mnt/lustre

2. Verify ACL support

mount | grep lustre

Should show:

<MGS>@o2ib:/lustrefs on /mnt/lustre type lustre (rw,...,acl)

3. Set ACLs on directories

setfacl -m u:researcher:rwx /mnt/lustre/projects
setfacl -m g:analysts:rx /mnt/lustre/reports

4. View ACLs

getfacl /mnt/lustre/projects

Sample output:

# file: projects
# owner: root
# group: root
user::rwx
user:researcher:rwx
group::r-x
group:analysts:r-x
mask::rwx
other::---
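
To have new files under a directory inherit group permissions automatically, you can also set a default ACL; a small sketch on the same directories:

setfacl -d -m g:analysts:rx /mnt/lustre/reports   # default ACL inherited by new entries
getfacl /mnt/lustre/reports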

8. Verifying Cluster Health

On all nodes:

lctl ping <MGS>@o2ib
lctl dl
lctl get_param -n net.*.state

Check RDMA performance:

lctl get_param -n o2iblnd.*.stats

Check file system mount from client:

df -h /mnt/lustre

Optional: Check node map enforcement

Try mounting from an unauthorized IP — it should fail:

mount -t lustre <MGS>@o2ib:/lustrefs /mnt/test
mount.lustre: mount <MGS>@o2ib:/lustrefs at /mnt/test failed: Permission denied

9. Common Issues and Troubleshooting

Issue                          | Possible Cause                            | Resolution
Mount failed: no route to host | IB subnet mismatch or LNET not configured | Verify lnetctl net show and ping -I ib0 between nodes.
Permission denied              | Node map restriction active               | Check lctl nodemap_list and ensure the client IP range is allowed.
Slow performance               | RDMA disabled or fallback to TCP          | Verify lctl list_nids shows the @o2ib transport.

10. Final Validation Checklist

  • InfiniBand RDMA verified with ibv_rc_pingpong
  • LNET configured for o2ib(ib0)
  • MGS, MDT, and OST mounted successfully
  • Clients connected via @o2ib
  • Node maps restricting unauthorized hosts
  • ACLs correctly enforcing directory-level access

Summary

With RDMA transport, Lustre achieves near line-rate performance while node maps and ACLs enforce robust security and access control. This combination provides a scalable, high-performance, and policy-driven storage environment ideal for AI, HPC, and research workloads.
