github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/bench/microbenchmarks/dl/scripts/run_dl_test.sh

#!/bin/bash
#
# Run in container with the following in env:
#
# ARG_NNODES
# ARG_NODE_RANK
# ARG_MASTER_HOST
# ARG_MASTER_PORT
# ARG_NPROC_PER_NODE
#
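#
# None of the following values come from this repo -- they are purely illustrative of how the
# container might be launched (hypothetical image name, host, and port):
#
#   docker run --rm \
#     -e ARG_NNODES=2 -e ARG_NODE_RANK=0 \
#     -e ARG_MASTER_HOST=dl-master -e ARG_MASTER_PORT=29500 \
#     -e ARG_NPROC_PER_NODE=8 \
#     <training-image> bash run_dl_test.sh
#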

#
# This is a clone with modified main_wl.py, and supporting dataloaders.py.
#
RN50="/root/DeepLearningExamples/PyTorch/Classification/RN50v1.5"

#
# WS to create and populate with results
#
WS=/results
rm -rf $WS	# committed image has past results :-(

function whinge {
	echo "FAILURE $@" >&2
	exit 2
}

mkdir -p $WS || whinge "Could not make results workspace $WS"
cd $RN50 || whinge "Could not chdir to $RN50"

function rsignal {
	redis-cli -h $redis_svc "$@"
}
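# For example, "rsignal INCR benchr" (used at the bottom of this script) runs
# "redis-cli -h $redis_svc INCR benchr"; redis_svc is supplied by the benchrc config sourced below.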

# Wait up to ~30 seconds for the master node to become reachable.
try=0
while [[ $try -lt 30 ]]; do
	ping -c 1 $ARG_MASTER_HOST
	[[ $? -eq 0 ]] && break
	try=$((try + 1))
	sleep 1
done
[[ $try -eq 30 ]] && whinge "Could not ping $ARG_MASTER_HOST"

function runit {
	typeset backend=$1
	typeset data=$2
	typeset -i bs=$3
	typeset cast=$4
	typeset -i epochs=$5
	typeset -i prof=$6
	typeset tl=$7
	shift 7

	# Map the requested cast to a main_wl.py flag; "none" and "fp32" mean no extra flag.
	usecast="$cast"
	[[ "$cast" == "none" ]] && usecast=""
	[[ "$cast" == "fp32" ]] && usecast=""
	# Use a separate, non-integer variable for profiling: assigning "" to the integer-typed
	# $prof would just reset it to 0, and "--prof 0" would still be passed below.
	useprof=""
	[[ $prof -gt 0 ]] && useprof=$prof

	typeset raport="result.json"

	usetl=""
	$tl && usetl="--test-loaders"

	dist=""
	if [[ $((ARG_NNODES * ARG_NPROC_PER_NODE)) -gt 1 ]]; then
		dist="./multiproc.py \
		--nnodes $ARG_NNODES \
		--node_rank $ARG_NODE_RANK \
		--master_addr $ARG_MASTER_HOST \
		--master_port $ARG_MASTER_PORT \
		--nproc_per_node $ARG_NPROC_PER_NODE"
	fi

	set -x
	python3 $dist \
		./main_wl.py \
		--workspace $WS \
		--tag "${ARG_NNODES} nodes ${ARG_NPROC_PER_NODE} GPU per node" \
		--raport-file $raport \
		-j5 \
		-p 1 \
		-b $bs \
		${usecast:+--$usecast} \
		--epochs $epochs ${useprof:+--prof $useprof} ${usetl} \
		--data-backend $backend \
		"$@" \
		$data
	set +x
}
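#
# For orientation only -- the backend name, dataset path, and numbers are hypothetical, not taken
# from a real run.  A call such as
#
#   runit dali-gpu /data/imagenet 256 amp 3 100 true --training-only
#
# would (with more than one GPU in total) expand to roughly
#
#   python3 ./multiproc.py --nnodes ... ./main_wl.py --workspace /results ... -b 256 --amp \
#       --epochs 3 --prof 100 --test-loaders --data-backend dali-gpu --training-only /data/imagenet
#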

function train {
	typeset be=$1
	typeset ds=$2
	typeset -i bs=$3
	typeset cast=$4
	typeset -i epochs=$5
	typeset -i prof=$6
	typeset tl=$7

	typeset -i obs=$((bs * ARG_NPROC_PER_NODE * ARG_NNODES))

	runit $be $ds $bs $cast $epochs $prof $tl \
		--training-only --lr 2.048 \
		--optimizer-batch-size $obs \
		--warmup 8 --arch resnet50 -c fanin \
		--label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05
}
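#
# Worked example (hypothetical values): with a per-GPU batch size of 256 on 2 nodes with 8 GPUs
# each, obs = 256 * 8 * 2 = 4096, and that total is what gets passed as --optimizer-batch-size.
#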

function inference {
	typeset be=$1
	typeset ds=$2
	typeset -i bs=$3
	typeset cast=$4
	typeset -i epochs=$5
	typeset -i prof=$6
	typeset tl=$7

	runit $be $ds $bs $cast $epochs $prof $tl \
		--arch resnet50 --evaluate
}

# hack because no Dockerfile at the moment
# cat main_wl.py | sed 's/torch.set_num_threads(8)/torch.set_num_threads(4)/' > main_wl_fixed.py

# read config
[[ -f /benchrc/benchrc ]] || whinge "/benchrc/benchrc absent"
. /benchrc/benchrc || whinge "sourcing benchrc failed"
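#
# benchrc is provided from outside (presumably by the controlling benchmark harness); the variable
# names below are the ones this script consumes, but the values shown are hypothetical:
#
#   redis_svc=redis-master   # host of the coordinating Redis service
#   what=train               # anything other than "inference" selects training
#   be=dali-gpu              # data backend, passed through as --data-backend
#   ds=/data/imagenet        # dataset path
#   bs=256                   # per-GPU batch size
#   cast=amp                 # "none"/"fp32" for no cast flag, otherwise passed as --$cast
#   epochs=3
#   prof=0                   # 0 disables --prof
#   testloaders=false        # true adds --test-loaders
#   dryrun=false             # true only echoes the chosen run
#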

# benchrc supplies ${redis_svc}
rsignal INCR benchr	# count this run as started; the control script presumably watches these counters

if [[ $what == "inference" ]]; then
	echo inference $be $ds $bs $cast $epochs $prof $testloaders
	$dryrun || inference $be $ds $bs $cast $epochs $prof $testloaders
else
	echo train $be $ds $bs $cast $epochs $prof $testloaders
	$dryrun || train $be $ds $bs $cast $epochs $prof $testloaders
fi

rsignal INCR benchc	# count this run as complete

sleep 1000	# control script cleans up