github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/bench/microbenchmarks/dl/dataloader_cmp.sh

#!/bin/bash -p

#
# Control script to run a number of dataloader tests as specified in the
# given config file, using a hardware topology (number of DGX nodes,
# GPUs per node) as specified in the config file.
#

function whinge {
    echo "FAILURE $@"
    exit 2
}

function usage {
    PROG=$1

    cat <<EOU
Usage: $PROG -c <testrc> -d <output-base-dir> -r <redis-svc> [-t <tag>] [-n] [-f] [-i <image>] [-s <pod-script>]
EOU
}

declare -i dryrun=0                     # dry-run; 1 = create pods but don't train; 2 = only show intent
outdir=""
tag=""
redis_svc="bm-redis-master"
image="quay.io/nvidia/pytorch:hack11"   # image for containers
flush=false
script='./scripts/run_dl_test.sh'

while getopts ":c:d:fhi:nr:s:t:" opt; do
    case $opt in
        c)  testrc=$OPTARG
            [[ -r $testrc ]] || whinge "proposed test config file $testrc is absent/unreadable"
            ;;
        d)  outdir=$OPTARG
            [[ -d $outdir ]] || whinge "proposed output directory $outdir does not exist"
            ;;
        f)  flush=true
            ;;
        h)  usage $0
            exit 1
            ;;
        i)  image=$OPTARG
            ;;
        n)  dryrun=$((dryrun + 1))
            ;;
        r)  redis_svc=$OPTARG
            ;;
        s)  script=$OPTARG
            [[ -f $script ]] || whinge "$script is awol!"
            ;;
        t)  tag="$OPTARG"
            ;;
        \?) usage $0
            exit 1
            ;;
    esac
done

[[ -n "$testrc" ]] || whinge "specify a test config file using -c"
[[ -n "$outdir" ]] || whinge "specify an output parent directory using -d"

DATE="$(date +'%Y%m%d_%H%M')"
dest=$outdir/$DATE
logs=$dest/logs
mkdir -p $logs || whinge "failed to make output and log subdirectories"
echo "Run output will accumulate in $dest"

#
# Copy the test config into the output directory under a standard name, and add any test tag/comment.
#
cp $testrc $dest/testrc
[[ -n "$tag" ]] && echo "$tag" > $dest/tag

#
# Test infrastructure bits - unlikely to need changing
#
masterhost=dmaster      # distributed master; will be published in DNS for the master pod
masterport=29500        # port to rendezvous on at the master
groupname="dlbench"     # used to label the headless service, and for group ops (delete) on pods we start
groupnum="$$"           # for multiple users of this script (it's a start! clash on service name still)

#
# Token check that we're alone
#
[[ -z $(kubectl get pods --no-headers --selector=group=${groupname}) ]] || whinge "we are not alone"

#
# Grok the redis master IP from the given service name. We require a (passwordless!)
# redis service to coordinate between this script and the pods.
#
redis_ip=$(kubectl get service/${redis_svc} -o=jsonpath="{.spec.clusterIP}" 2>/dev/null)
[[ -n "$redis_ip" ]] || whinge "Could not look up the cluster IP of redis service $redis_svc"

# this assumes we're running the control script on a
# cluster node
function rsignal {
    redis-cli -h ${redis_ip} "$@"
}

#
# Testcase globals
#
declare -i tc_numnodes
declare -i tc_ngpupernode
declare tc_mode
declare tc_backend
declare tc_ctx
declare tc_datapath
declare tc_batchsize
declare tc_cast
declare -i tc_epochs
declare -i tc_iters
declare tc_extra
declare tc_testloaders
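
#
# For orientation: the parsing loop at the bottom of this script fills the
# globals above positionally from each test spec line in the testrc, i.e.
#
#   <modes> <backend> <ctx> <datapath> <batchsize> <cast> <epochs> <iters> <extra>
#
# A hypothetical spec line (values are illustrative only, not from any real testrc):
#
#   both webdataset aisgw imagenet/train 256 fp16 2 100 trainshards
#
# would run both training and inference with tc_backend=webdataset,
# tc_ctx expanded from $config_aisgw, tc_datapath=imagenet/train,
# tc_batchsize=256, tc_cast=fp16, tc_epochs=2, tc_iters=100 and
# tc_extra expanded from $config_trainshards.
#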

#
# Create a configmap from the pod test script; specify the key as 'runbench'
# regardless of the script basename.
#
kubectl delete configmap bench-script >/dev/null 2>&1
kubectl create configmap bench-script --from-file=runbench=${script} || \
    whinge "failed to create bench script configmap"

#
# Create a pod to run a number of ranks of the distributed job, necessarily
# all on the same physical node (being in a single pod).
#
# The "number of nodes" in torch.distributed is realized by that number of
# pods. Depending on the gpu resource requests, it is possible that more than
# one pod could be scheduled on the same physical node. If fillorder is
# "widthfirst" then we use pod anti-affinity to spread pods over DGX
# systems before filling a system; for "depthfirst" we try to influence scheduling
# to pack the pods in more densely.
#
function create_pod {
    local podnum=$1
    local fillorder=$2
    local reqnode=$3

    typeset master=false
    ports=""
    if [[ $podnum -eq 0 ]]; then
        master=true
        ports="\
        - name: distmaster
          containerPort: ${masterport}
          protocol: TCP"
    fi

    # nodeAffinity and node labeling may be nicer, but this'll do
    reqnodename=""
    if [[ $reqnode != "-" ]]; then
        reqnodename="\
  nodeName: ${reqnode}
"
    fi

    if [[ $fillorder == "widthfirst" ]]; then
        aff_which="podAntiAffinity"
    elif [[ $fillorder == "depthfirst" ]]; then
        aff_which="podAffinity"
    else
        whinge "Unknown fill order $fillorder"
    fi

    affinity="\
  affinity:
    ${aff_which}:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: group
                  operator: In
                  values:
                    - ${groupname}
            topologyKey: \"kubernetes.io/hostname\"
"

    #
    # For the master pod create a headless ClusterIP service so that other
    # pods can use the master pod DNS name to rendezvous on.
    #
    if $master; then
        kubectl apply -f - <<EOSVC
---
apiVersion: v1
kind: Service
metadata:
  name: "${masterhost}"
  labels:
    group: "${groupname}"
    groupnum: "${groupnum}"
spec:
  type: ClusterIP
  clusterIP: None
  selector:
    group: "${groupname}"
    groupnum: "${groupnum}"
    master: "true"
EOSVC

        [[ $? -ne 0 ]] && whinge "Failed to create headless service"
    fi
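
    #
    # Note: with this headless service in place, worker pods should be able to
    # reach the master simply as "${masterhost}" (e.g. dmaster:${masterport});
    # cluster DNS resolves the service name to the matching master pod. This
    # assumes all pods run in the same namespace as the service.
    #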

    #
    # Create the pod; use a temp file so we can view any failing yaml we build
    #
    tf=$(mktemp)
    cat > $tf <<EOPOD
---
apiVersion: v1
kind: Pod
metadata:
  generateName: ${groupname}-${podnum}-
  labels:
    group: "${groupname}"
    groupnum: "${groupnum}"
    master: "${master}"
spec:
  containers:
    - name: dl
      image: ${image}
      resources:
        requests:
        limits:
          nvidia.com/gpu: "${tc_ngpupernode}"
      command: [ "/bin/bash" ]
      args: [ "/bench/runbench" ]
      env:
        - name: ARG_MASTER_HOST
          value: "${masterhost}"
        - name: ARG_MASTER_PORT
          value: "${masterport}"
        - name: ARG_NNODES
          value: "${tc_numnodes}"
        - name: ARG_NODE_RANK
          value: "${podnum}"
        - name: ARG_NPROC_PER_NODE
          value: "${tc_ngpupernode}"
      ports:
${ports}
      volumeMounts:
        - name: scripts
          mountPath: /scripts
        - name: bench-script
          mountPath: /bench
        - name: benchrc
          mountPath: /benchrc
        - name: dshm
          mountPath: /dev/shm
        - name: imagenet-nfs
          mountPath: /nfs/imagenet                  # 'val' here is a copy of 'train'
          readOnly: true
        - name: imagenet-inflated-train-nfs
          mountPath: /nfs/imagenet-inflated/train   # 'train' here is inflated; no inflated val, so ...
          readOnly: true
        - name: imagenet-inflated-val-nfs
          mountPath: /nfs/imagenet-inflated/val     # ... this also mounts the train set on val
          readOnly: true
        - name: imagenet-train-ssd                  # std imagenet on SSD of every DGX system
          mountPath: /data/imagenet/train
        - name: imagenet-val-ssd
          mountPath: /data/imagenet/val             # we point this to train data below
        - name: imagenet-inflated-train-ssd         # only present on dgx18
          mountPath: /data/imagenet-inflated/train
        - name: imagenet-inflated-val-ssd           # only present on dgx18, point to train data
          mountPath: /data/imagenet-inflated/val
${reqnodename}
${affinity}
  volumes:
    - name: scripts
      configMap:
        name: scripts
    - name: bench-script
      configMap:
        name: bench-script
    - name: benchrc
      configMap:
        name: benchrc-cm
    - name: dshm
      emptyDir:
        medium: Memory
    - name: imagenet-nfs
      persistentVolumeClaim:
        claimName: imagenet-nfs-pvc
        readOnly: true
    - name: imagenet-inflated-train-nfs
      persistentVolumeClaim:
        claimName: imagenet-inflated-train-nfs-pvc
        readOnly: true
    - name: imagenet-inflated-val-nfs
      persistentVolumeClaim:
        claimName: imagenet-inflated-val-nfs-pvc
        readOnly: true
    - name: imagenet-train-ssd
      hostPath:
        path: /imagenet/train
    - name: imagenet-val-ssd
      hostPath:
        path: /imagenet/train                       # not a typo
    - name: imagenet-inflated-train-ssd             # only populated on dgx18, empty elsewhere
      hostPath:
        path: /imagenet/train-inflated
    - name: imagenet-inflated-val-ssd               # only populated on dgx18, empty elsewhere
      hostPath:
        path: /imagenet/train-inflated
  restartPolicy: Never
EOPOD

    kubectl create -f $tf
    if [[ $? -eq 0 ]]; then
        rm $tf
    else
        whinge "Failed to create pod; see yaml at $tf"
    fi
}
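
#
# How the ARG_* env vars above are consumed is up to the pod script
# (run_dl_test.sh, mounted as /bench/runbench), which is not part of this file.
# A hypothetical sketch of the launch it would typically perform with
# torch.distributed:
#
#   python -m torch.distributed.launch \
#       --nnodes="${ARG_NNODES}" --node_rank="${ARG_NODE_RANK}" \
#       --nproc_per_node="${ARG_NPROC_PER_NODE}" \
#       --master_addr="${ARG_MASTER_HOST}" --master_port="${ARG_MASTER_PORT}" \
#       <training/inference entrypoint> ...
#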

#
# Used to flush only the components involved in a test; now uses
# ye olde blunderbuss.
#
# XXX Hardcoded bits here
#
function drop_caches {
    echo "Dropping caches in k8s cluster"
    ansible -f 24 -i $HOME/hosts.ini k8s-cluster -m shell -a 'echo 3 > /proc/sys/vm/drop_caches' --become >/dev/null

    echo "Dropping cache on single node AIS system"
    ansible -i $HOME/hosts.ini single -m shell -a 'echo 3 > /proc/sys/vm/drop_caches' --become >/dev/null

    echo "Dropping cache on single node NFS server"
    ansible -i $HOME/hosts.ini nfs -m shell -a 'echo 3 > /proc/sys/vm/drop_caches' --become >/dev/null
}

function testit {
    local pfx=$1
    local fillorder=$2

    #
    # For webdataset, form the full datapath
    #
    if [[ $tc_backend =~ webdataset ]]; then
        tc_datapath=${tc_ctx}/v1/objects/$tc_datapath
    fi

    local summary="$tc_numnodes node(s) $tc_mode with $tc_ngpupernode GPU per node; "
    summary+="$tc_backend over $tc_datapath ($tc_extra) bs $tc_batchsize cast $tc_cast "
    summary+="${tc_epochs}x${tc_iters}"

    reqnode="-"
    # special case - the inflated imagenet dataset for SSD is only available on dgx18
    if [[ $tc_datapath == "/data/imagenet-inflated" ]]; then
        [[ $tc_numnodes -eq 1 ]] || whinge "The inflated imagenet set for SSD is only populated on one node!"
        reqnode="dgx18"
    fi

    echo "***** $summary"
    [[ $dryrun -ge 2 ]] && return

    if [[ $dryrun -ne 0 ]]; then
        pod_dryrun=true
    else
        pod_dryrun=false
        $flush && drop_caches
    fi

    #
    # Create a configmap to configure the run in the pod
    #
    tf=$(mktemp) || whinge "failed to create temporary file"
    cat > $tf <<EOCM
#
# sourced by the script in the pod to control training/inference
#

what=$tc_mode
be=$tc_backend
ds=$tc_datapath
bs=$tc_batchsize
cast=$tc_cast
epochs=$tc_epochs
prof=$tc_iters
testloaders=$tc_testloaders
redis_svc=${redis_svc}
dryrun=$pod_dryrun

#
# for webdataset backends
#
export openbufsize=32768
export tarbufsize=524288
export shards="${tc_extra}"
export val="${tc_extra}"
EOCM

    kubectl delete cm/benchrc-cm 2>/dev/null
    kubectl create cm benchrc-cm --from-file=benchrc=$tf || whinge "failed to create configmap"
    rm $tf

    # Some very crude synchronization between the control script and the pods ...
    rsignal DEL benchr
    rsignal DEL benchc

    for ((i=0; i<tc_numnodes; i=i+1)); do
        create_pod $i $fillorder $reqnode
    done

    #
    # Check and report pods confirmed as running the test
    # XXX No allowance for pod failures!
    #
    sleep 5
    while true; do
        running=$(rsignal GET benchr)
        completed=$(rsignal GET benchc)

        echo "${running:-0} pods of $tc_numnodes confirmed running, ${completed:-0} completed ($summary)"
        if [[ -n "$completed" ]]; then
            [[ $completed -eq $tc_numnodes ]] && break
        fi

        sleep 10
    done

    # prefix is passed with testnum to avoid output collisions
    rf="${pfx}-${tc_mode}-${tc_backend}-${tc_ctx}-${tc_batchsize}-${tc_cast}"
    if [[ $dryrun -eq 0 ]]; then
        masterpod=$(kubectl get pods --no-headers --selector=group=${groupname},groupnum=${groupnum},master=true | awk '{print $1}')
        echo "All pods done, grabbing results from pod $masterpod to $dest/${rf}.json"
        kubectl cp ${masterpod}:/results/result.json $dest/${rf}.json
    fi

    echo "Preserving pod logs ..."
    for p in $(kubectl get pods --no-headers --selector=group=${groupname},groupnum=${groupnum} | awk '{print $1}'); do
        kubectl logs $p > $logs/${rf}-$p-log.out
    done

    echo "Deleting pods ..."
    kubectl delete pods --selector=group=${groupname},groupnum=${groupnum}
    sleep 5
}
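
#
# The testrc sourced below is a plain shell fragment. Judging by the parsing
# loop that follows, it is expected to define an enabled_sets array plus one
# variable per test set holding config_* assignments followed by test spec
# lines. A hypothetical sketch (names and values are illustrative only):
#
#   config_aisgw="http://ais-gateway:51080"                  # ctx expansion target (hypothetical)
#   config_trainshards="imagenet-train-{0000..1281}.tar"     # extra/shard-pattern expansion target (hypothetical)
#
#   set1='config_worlds="1x8,2x8"
#   config_fillorder="widthfirst"
#   both webdataset aisgw imagenet/train 256 fp16 2 100 trainshards'
#
#   enabled_sets=(set1)
#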

#
# Source the testrc file
#
. $testrc || whinge "error in sourcing $testrc"

for testset in ${enabled_sets[@]}; do
    declare config_worlds=""
    declare config_fillorder="widthfirst"

    for pass in planning doing; do
        declare -i testnum=1
        tc_testloaders=false

        eval echo "\"\$${testset}\"" | while read line; do
            if [[ $line =~ = ]]; then
                [[ $testnum -eq 1 ]] || whinge "set $testset - cannot change test config once the test specs have started"
                [[ $line =~ config_.*= ]] || whinge "set $testset illegal variable in test $testnum"
                eval $line
                continue
            fi

            # check required vars are set
            [[ -n "$config_worlds" ]] || whinge "set $testset missing config_worlds assignment"

            set -- $line
            declare modes=$1
            tc_backend=$2
            tc_ctx=$3
            tc_datapath=$4
            tc_batchsize=$5
            tc_cast=$6
            tc_epochs=$7
            tc_iters=$8
            tc_extra=$9
            [[ -n "$config_testloaders" ]] && tc_testloaders=$config_testloaders

            if [[ $modes == "both" ]]; then
                modes="training inference"
            elif [[ $modes == "-" ]]; then
                continue
            elif [[ $modes != "training" && $modes != "inference" ]]; then
                whinge "Unexpected mode $modes in $testset test $testnum"
            fi

            if [[ $pass == "planning" ]]; then
                testnum=$((testnum + 1))
                continue
            fi

            #
            # expand context (gateway url for AIS)
            #
            if [[ "${tc_ctx}" != "-" ]]; then
                tc_ctx=$(eval echo \$config_${tc_ctx})
            fi

            #
            # expand extra (shard pattern for AIS)
            #
            if [[ "${tc_extra}" != "-" ]]; then
                tc_extra=$(eval echo \$config_${tc_extra})
            fi

            for topo in $(echo $config_worlds | tr ',' ' '); do
                tc_numnodes=$(echo $topo | cut -d x -f 1)
                tc_ngpupernode=$(echo $topo | cut -d x -f 2)
                for tc_mode in $modes; do
                    testit "${testset}-${testnum}" $config_fillorder
                done
            done

            testnum=$((testnum + 1))

        done
    done
done
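
#
# Example invocation (hypothetical paths and names, shown here for reference):
#
#   ./dataloader_cmp.sh -c ./dlbench.testrc -d /results/dlbench -r bm-redis-master -t "baseline" -f
#
# Add -n once for a dry run that still creates pods, or -nn to only print intent.
#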