github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/bench/microbenchmarks/dl/scripts/run_dl_test.sh

#
# Run in a container with the following in the environment:
#
#   ARG_NNODES
#   ARG_NODE_RANK
#   ARG_MASTER_HOST
#   ARG_MASTER_PORT
#   ARG_NPROC_PER_NODE
#

#
# A clone of the NVIDIA DeepLearningExamples RN50v1.5 benchmark, with a
# modified main_wl.py and supporting dataloaders.py.
#
RN50="/root/DeepLearningExamples/PyTorch/Classification/RN50v1.5"

#
# Workspace to create and populate with results.
#
WS=/results
rm -rf "$WS" # committed image has past results :-(

function whinge {
    echo "FAILURE: $*" >&2
    exit 2
}

mkdir -p "$WS" || whinge "Could not make results workspace $WS"
cd "$RN50" || whinge "Could not chdir to $RN50"

# Send a command to the redis service used for cross-node coordination.
function rsignal {
    redis-cli -h "$redis_svc" "$@"
}

# Wait (up to 30s) for the master node to become reachable.
try=0
while [[ $try -lt 30 ]]; do
    ping -c 1 "$ARG_MASTER_HOST" && break
    try=$((try + 1))
    sleep 1
done
[[ $try -eq 30 ]] && whinge "Could not ping $ARG_MASTER_HOST"

function runit {
    typeset backend=$1
    typeset data=$2
    typeset -i bs=$3
    typeset cast=$4
    typeset -i epochs=$5
    typeset -i prof=$6
    typeset tl=$7
    shift 7

    # 'none' and 'fp32' mean no cast flag; anything else (e.g., amp) is
    # passed through as --<cast>.
    usecast="$cast"
    [[ "$cast" == "none" ]] && usecast=""
    [[ "$cast" == "fp32" ]] && usecast=""

    # Use a plain (non-integer) variable for the optional --prof argument:
    # assigning "" to the integer $prof would reset it to 0, which is
    # non-empty and would wrongly pass "--prof 0" below.
    typeset useprof=""
    [[ $prof -gt 0 ]] && useprof=$prof

    typeset raport="result.json" # (sic) matches the --raport-file flag spelling

    usetl=""
    $tl && usetl="--test-loaders"

    # $dist is left unquoted below so that it word-splits into arguments.
    dist=""
    if [[ $((ARG_NNODES * ARG_NPROC_PER_NODE)) -gt 1 ]]; then
        dist="./multiproc.py \
            --nnodes $ARG_NNODES \
            --node_rank $ARG_NODE_RANK \
            --master_addr $ARG_MASTER_HOST \
            --master_port $ARG_MASTER_PORT \
            --nproc_per_node $ARG_NPROC_PER_NODE"
    fi

    set -x
    python3 $dist \
        ./main_wl.py \
        --workspace $WS \
        --tag "${ARG_NNODES} nodes ${ARG_NPROC_PER_NODE} GPU per node" \
        --raport-file $raport \
        -j5 \
        -p 1 \
        -b $bs \
        ${usecast:+--$usecast} \
        --epochs $epochs ${useprof:+--prof $useprof} ${usetl} \
        --data-backend $backend \
        "$@" \
        $data
    set +x
}

function train {
    typeset be=$1
    typeset ds=$2
    typeset -i bs=$3
    typeset cast=$4
    typeset -i epochs=$5
    typeset -i prof=$6
    typeset tl=$7

    # Effective (optimizer) batch size across all GPUs of all nodes.
    typeset -i obs=$((bs * ARG_NPROC_PER_NODE * ARG_NNODES))

    runit $be $ds $bs $cast $epochs $prof $tl \
        --training-only --lr 2.048 \
        --optimizer-batch-size $obs \
        --warmup 8 --arch resnet50 -c fanin \
        --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05
}

function inference {
    typeset be=$1
    typeset ds=$2
    typeset -i bs=$3
    typeset cast=$4
    typeset -i epochs=$5
    typeset -i prof=$6
    typeset tl=$7

    runit $be $ds $bs $cast $epochs $prof $tl \
        --arch resnet50 --evaluate
}

# hack because no Dockerfile at the moment
# cat main_wl.py | sed 's/torch.set_num_threads(8)/torch.set_num_threads(4)/' > main_wl_fixed.py

# Read config. benchrc supplies ${redis_svc} plus the run parameters used
# below: $what, $be, $ds, $bs, $cast, $epochs, $prof, $testloaders, $dryrun.
[[ -f /benchrc/benchrc ]] || whinge "/benchrc/benchrc absent"
. /benchrc/benchrc || whinge "sourcing benchrc failed"

# Tell the control script this node has started.
rsignal INCR benchr

if [[ $what == "inference" ]]; then
    echo inference $be $ds $bs $cast $epochs $prof $testloaders
    $dryrun || inference $be $ds $bs $cast $epochs $prof $testloaders
else
    echo train $be $ds $bs $cast $epochs $prof $testloaders
    $dryrun || train $be $ds $bs $cast $epochs $prof $testloaders
fi

# Tell the control script this node has completed.
rsignal INCR benchc

sleep 1000 # control script cleans up
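
# -----------------------------------------------------------------------------
# For reference, a minimal sketch of the config this script sources from
# /benchrc/benchrc. It names only the variables the script actually consumes;
# every concrete value below is an illustrative assumption, not something
# shipped with the repo:
#
#   redis_svc=redis-bench    # host for redis-cli in rsignal() (assumed name)
#   dryrun=false             # 'true': echo the run without executing it
#   what=train               # 'train' or 'inference'
#   be=aistore               # --data-backend for main_wl.py (assumed value)
#   ds=/data/imagenet        # dataset path, last positional arg (assumed path)
#   bs=128                   # per-GPU batch size (example value)
#   cast=amp                 # 'none'/'fp32' for no cast flag, else --<cast>
#   epochs=2                 # example value
#   prof=0                   # >0 passes --prof <n>
#   testloaders=false        # 'true' adds --test-loaders
# -----------------------------------------------------------------------------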