github.com/kubeflow/training-operator@v1.7.0/scripts/gha/setup-training-operator.sh (about)

     1  #!/bin/bash
     2  
     3  # Copyright 2021 The Kubernetes Authors.
     4  #
     5  # Licensed under the Apache License, Version 2.0 (the "License");
     6  # you may not use this file except in compliance with the License.
     7  # You may obtain a copy of the License at
     8  #
     9  #     http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  set -o errexit
    18  set -o nounset
    19  set -o pipefail
    20  
    21  echo "Kind load newly locally built image"
    22  # use cluster name which is used in github actions kind create
    23  kind load docker-image ${TRAINING_CI_IMAGE} --name ${KIND_CLUSTER}
    24  
    25  echo "Update training operator manifest with newly built image"
    26  cd manifests/overlays/standalone
    27  kustomize edit set image kubeflow/training-operator=${TRAINING_CI_IMAGE}
    28  
    29  echo "Installing training operator manifests"
    30  kustomize build . | kubectl apply -f -
    31  
    32  if [ "${GANG_SCHEDULER_NAME}" = "scheduler-plugins" ]; then
    33    SCHEDULER_PLUGINS_VERSION=$(go list -m -f "{{.Version}}" sigs.k8s.io/scheduler-plugins)
    34    git clone https://github.com/kubernetes-sigs/scheduler-plugins.git -b "${SCHEDULER_PLUGINS_VERSION}"
    35  
    36    echo "Installing Scheduler Plugins ${SCHEDULER_PLUGINS_VERSION}..."
    37    helm install scheduler-plugins scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ --create-namespace \
    38      --namespace scheduler-plugins \
    39      --set controller.image="registry.k8s.io/scheduler-plugins/controller:${SCHEDULER_PLUGINS_VERSION}" \
    40      --set scheduler.image="registry.k8s.io/scheduler-plugins/kube-scheduler:${SCHEDULER_PLUGINS_VERSION}"
    41  
    42    echo "Configure gang-scheduling using scheduler-plugins to training-operator"
    43    kubectl patch -n kubeflow deployments training-operator --type='json' \
    44      -p='[{"op": "add", "path": "/spec/template/spec/containers/0/command/1", "value": "--gang-scheduler-name=scheduler-plugins"}]'
    45  elif  [ "${GANG_SCHEDULER_NAME}" = "volcano" ]; then
    46    VOLCANO_SCHEDULER_VERSION=$(go list -m -f "{{.Version}}" volcano.sh/apis)
    47  
    48    # patch scheduler first so that it is ready when scheduler-deployment installing finished
    49    echo "Configure gang-scheduling using volcano to training-operator"
    50    kubectl patch -n kubeflow deployments training-operator --type='json' \
    51      -p='[{"op": "add", "path": "/spec/template/spec/containers/0/command/1", "value": "--gang-scheduler-name=volcano"}]'
    52  
    53    echo "Installing volcano scheduler ${VOLCANO_SCHEDULER_VERSION}..."
    54    kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/${VOLCANO_SCHEDULER_VERSION}/installer/volcano-development.yaml
    55  fi
    56  
    57  TIMEOUT=30
    58  until kubectl get pods -n kubeflow | grep training-operator | grep 1/1 || [[ $TIMEOUT -eq 1 ]]; do
    59    sleep 10
    60    TIMEOUT=$(( TIMEOUT - 1 ))
    61  done
    62  if [ "${GANG_SCHEDULER_NAME}" = "scheduler-plugins" ]; then
    63    kubectl wait pods --for=condition=ready -n scheduler-plugins --timeout "${TIMEOUT}s" --all || \
    64      (kubectl get pods -n scheduler-plugins && kubectl describe pods -n scheduler-plugins; exit 1)
    65  fi
    66  
    67  # wait for volcano up
    68  if [ "${GANG_SCHEDULER_NAME}" = "volcano" ]; then
    69    kubectl rollout status deployment -n volcano-system volcano-admission --timeout "${TIMEOUT}s" && \
    70    kubectl rollout status deployment -n volcano-system volcano-scheduler --timeout "${TIMEOUT}s" && \
    71    kubectl rollout status deployment -n volcano-system volcano-controllers --timeout "${TIMEOUT}s" || \
    72      (kubectl get pods -n volcano-system && kubectl describe pods -n volcano-system; exit 1)
    73  fi
    74  
    75  kubectl version
    76  kubectl cluster-info
    77  kubectl get nodes
    78  kubectl get pods -n kubeflow
    79  kubectl describe pods -n kubeflow