#!/bin/bash

# Copyright 2021 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Loads a locally built training-operator image into a kind cluster, applies
# the standalone overlay with that image, optionally installs a gang scheduler
# and waits for the deployed components to become ready.
#
# Environment:
#   TRAINING_CI_IMAGE    (required) image reference of the locally built
#                        training operator.
#   KIND_CLUSTER         (required) name of the kind cluster to load the image
#                        into.
#   GANG_SCHEDULER_NAME  (optional) "scheduler-plugins" or "volcano". Any other
#                        value, or leaving it unset, installs no gang scheduler.
#
# The script changes into manifests/overlays/standalone (relative to the
# current directory) and stays there, so the scheduler-plugins checkout is
# created inside that directory.
# NOTE(review): presumably meant to be started from the repository root --
# confirm against the calling workflow.

set -o errexit
set -o nounset
set -o pipefail

# Fail early, with a readable message, when a required input is missing.
: "${TRAINING_CI_IMAGE:?TRAINING_CI_IMAGE must be set}"
: "${KIND_CLUSTER:?KIND_CLUSTER must be set}"
# Optional: an empty value selects none of the gang scheduler branches below.
GANG_SCHEDULER_NAME="${GANG_SCHEDULER_NAME:-}"

# Readiness budgets. They are deliberately independent of each other: the time
# the operator needs to come up must not shrink the time granted to the gang
# scheduler afterwards.
readonly OPERATOR_READY_TIMEOUT="300s"
readonly GANG_SCHEDULER_READY_TIMEOUT="300s"

#######################################
# Print the pod list and pod descriptions of a namespace for debugging.
# Never fails, so that the caller's own exit status is what gets reported.
# Arguments: $1 - namespace to inspect
# Outputs:   kubectl output on stdout/stderr
#######################################
dump_pods() {
  local namespace=$1
  kubectl get pods -n "${namespace}" || true
  kubectl describe pods -n "${namespace}" || true
}

echo "Kind load newly locally built image"
# use cluster name which is used in github actions kind create
kind load docker-image "${TRAINING_CI_IMAGE}" --name "${KIND_CLUSTER}"

echo "Update training operator manifest with newly built image"
cd manifests/overlays/standalone
kustomize edit set image "kubeflow/training-operator=${TRAINING_CI_IMAGE}"

echo "Installing training operator manifests"
kustomize build . | kubectl apply -f -

case "${GANG_SCHEDULER_NAME}" in
  scheduler-plugins)
    # Use the scheduler-plugins version reported by the Go module list so the
    # checkout and the images share one version string.
    SCHEDULER_PLUGINS_VERSION=$(go list -m -f "{{.Version}}" sigs.k8s.io/scheduler-plugins)
    git clone https://github.com/kubernetes-sigs/scheduler-plugins.git -b "${SCHEDULER_PLUGINS_VERSION}"

    echo "Installing Scheduler Plugins ${SCHEDULER_PLUGINS_VERSION}..."
    helm install scheduler-plugins scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ --create-namespace \
      --namespace scheduler-plugins \
      --set controller.image="registry.k8s.io/scheduler-plugins/controller:${SCHEDULER_PLUGINS_VERSION}" \
      --set scheduler.image="registry.k8s.io/scheduler-plugins/kube-scheduler:${SCHEDULER_PLUGINS_VERSION}"

    echo "Configure gang-scheduling using scheduler-plugins to training-operator"
    kubectl patch -n kubeflow deployments training-operator --type='json' \
      -p='[{"op": "add", "path": "/spec/template/spec/containers/0/command/1", "value": "--gang-scheduler-name=scheduler-plugins"}]'
    ;;
  volcano)
    VOLCANO_SCHEDULER_VERSION=$(go list -m -f "{{.Version}}" volcano.sh/apis)

    # patch scheduler first so that it is ready when scheduler-deployment installing finished
    echo "Configure gang-scheduling using volcano to training-operator"
    kubectl patch -n kubeflow deployments training-operator --type='json' \
      -p='[{"op": "add", "path": "/spec/template/spec/containers/0/command/1", "value": "--gang-scheduler-name=volcano"}]'

    echo "Installing volcano scheduler ${VOLCANO_SCHEDULER_VERSION}..."
    kubectl apply -f "https://raw.githubusercontent.com/volcano-sh/volcano/${VOLCANO_SCHEDULER_VERSION}/installer/volcano-development.yaml"
    ;;
esac

# Wait for the operator deployment itself rather than grepping the pod list for
# "1/1": after the patch above, a pod of the previous revision could still be
# listed as ready while the patched one is not up yet.
if ! kubectl rollout status deployment -n kubeflow training-operator --timeout "${OPERATOR_READY_TIMEOUT}"; then
  echo "training-operator did not become ready within ${OPERATOR_READY_TIMEOUT}" >&2
  dump_pods kubeflow
  exit 1
fi

case "${GANG_SCHEDULER_NAME}" in
  scheduler-plugins)
    if ! kubectl wait pods --for=condition=ready -n scheduler-plugins --timeout "${GANG_SCHEDULER_READY_TIMEOUT}" --all; then
      dump_pods scheduler-plugins
      exit 1
    fi
    ;;
  volcano)
    # wait for volcano up
    for volcano_deployment in volcano-admission volcano-scheduler volcano-controllers; do
      if ! kubectl rollout status deployment -n volcano-system "${volcano_deployment}" --timeout "${GANG_SCHEDULER_READY_TIMEOUT}"; then
        dump_pods volcano-system
        exit 1
      fi
    done
    ;;
esac

# Final state of the cluster, printed for the CI log.
kubectl version
kubectl cluster-info
kubectl get nodes
kubectl get pods -n kubeflow
kubectl describe pods -n kubeflow