github.com/kubeflow/training-operator@v1.7.0/cmd/training-operator.v1/main.go

/*
Copyright 2021.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
	"errors"
	"flag"
	"os"
	"strings"

	"go.uber.org/zap/zapcore"
	"k8s.io/apimachinery/pkg/runtime"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
	_ "k8s.io/client-go/plugin/pkg/client/auth"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/healthz"
	"sigs.k8s.io/controller-runtime/pkg/log/zap"
	schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
	"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
	volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned"

	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	"github.com/kubeflow/training-operator/pkg/config"
	controllerv1 "github.com/kubeflow/training-operator/pkg/controller.v1"
	"github.com/kubeflow/training-operator/pkg/controller.v1/common"
	//+kubebuilder:scaffold:imports
)

const (
	// EnvKubeflowNamespace is an environment variable for the namespace when deployed on Kubernetes.
	EnvKubeflowNamespace = "KUBEFLOW_NAMESPACE"
)

var (
	scheme   = runtime.NewScheme()
	setupLog = ctrl.Log.WithName("setup")
)

func init() {
	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
	utilruntime.Must(kubeflowv1.AddToScheme(scheme))
	utilruntime.Must(v1beta1.AddToScheme(scheme))
	utilruntime.Must(schedulerpluginsv1alpha1.AddToScheme(scheme))
	//+kubebuilder:scaffold:scheme
}

func main() {
	var metricsAddr string
	var enableLeaderElection bool
	var leaderElectionID string
	var probeAddr string
	var enabledSchemes controllerv1.EnabledSchemes
	var gangSchedulerName string
	var namespace string
	var monitoringPort int
	var controllerThreads int
	flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
	flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
	flag.BoolVar(&enableLeaderElection, "leader-elect", false,
		"Enable leader election for controller manager. "+
			"Enabling this will ensure there is only one active controller manager.")
	flag.StringVar(&leaderElectionID, "leader-election-id", "1ca428e5.training-operator.kubeflow.org", "The ID for leader election.")
	flag.Var(&enabledSchemes, "enable-scheme", "Enable scheme(s) as --enable-scheme=tfjob --enable-scheme=pytorchjob, case insensitive."+
		" Now supporting TFJob, PyTorchJob, MXNetJob, XGBoostJob, PaddleJob. By default, all supported schemes will be enabled.")
	flag.StringVar(&gangSchedulerName, "gang-scheduler-name", "", "Now Supporting volcano and scheduler-plugins."+
		" Note: If you set another scheduler name, the training-operator assumes it's the scheduler-plugins.")
	flag.StringVar(&namespace, "namespace", os.Getenv(EnvKubeflowNamespace), "The namespace to monitor kubeflow jobs. If unset, it monitors all namespaces cluster-wide. "+
		"If set, it only monitors kubeflow jobs in the given namespace.")
	flag.IntVar(&monitoringPort, "monitoring-port", 9443, "Endpoint port for displaying monitoring metrics. "+
		"It can be set to \"0\" to disable the metrics serving.")
	flag.IntVar(&controllerThreads, "controller-threads", 1, "Number of worker threads used by the controller.")

	// PyTorch related flags
	flag.StringVar(&config.Config.PyTorchInitContainerImage, "pytorch-init-container-image",
		config.PyTorchInitContainerImageDefault, "The image for pytorch init container")
	flag.StringVar(&config.Config.PyTorchInitContainerTemplateFile, "pytorch-init-container-template-file",
		config.PyTorchInitContainerTemplateFileDefault, "The template file for pytorch init container")
	flag.IntVar(&config.Config.PyTorchInitContainerMaxTries, "pytorch-init-container-max-tries",
		config.PyTorchInitContainerMaxTriesDefault, "The number of tries for the pytorch init container")

	// MPI related flags
	flag.StringVar(&config.Config.MPIKubectlDeliveryImage, "mpi-kubectl-delivery-image",
		config.MPIKubectlDeliveryImageDefault, "The image for mpi launcher init container")

	opts := zap.Options{
		Development:     true,
		StacktraceLevel: zapcore.DPanicLevel,
	}
	opts.BindFlags(flag.CommandLine)
	flag.Parse()

	ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
		Scheme:                 scheme,
		MetricsBindAddress:     metricsAddr,
		Port:                   monitoringPort,
		HealthProbeBindAddress: probeAddr,
		LeaderElection:         enableLeaderElection,
		LeaderElectionID:       leaderElectionID,
		Namespace:              namespace,
	})
	if err != nil {
		setupLog.Error(err, "unable to start manager")
		os.Exit(1)
	}

	// Set up controllers using goroutines to start the manager quickly.
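	// Registering asynchronously keeps mgr.Start below from waiting on controller
	// setup; setupControllers exits the process if any controller fails to register.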
	go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads)

	//+kubebuilder:scaffold:builder

	if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
		setupLog.Error(err, "unable to set up health check")
		os.Exit(1)
	}
	if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
		setupLog.Error(err, "unable to set up ready check")
		os.Exit(1)
	}

	setupLog.Info("starting manager")
	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
		setupLog.Error(err, "problem running manager")
		os.Exit(1)
	}
}

func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int) {
	setupLog.Info("registering controllers...")

	// Prepare GangSchedulingSetupFunc
	gangSchedulingSetupFunc := common.GenNonGangSchedulerSetupFunc()
	if strings.EqualFold(gangSchedulerName, string(common.GangSchedulerVolcano)) {
		cfg := mgr.GetConfig()
		volcanoClientSet := volcanoclient.NewForConfigOrDie(cfg)
		gangSchedulingSetupFunc = common.GenVolcanoSetupFunc(volcanoClientSet)
	} else if gangSchedulerName != "" {
		gangSchedulingSetupFunc = common.GenSchedulerPluginsSetupFunc(mgr.GetClient(), gangSchedulerName)
	}

	// TODO: We need a general manager. all rest reconciler addsToManager
	// Based on the user configuration, we start different controllers
	if enabledSchemes.Empty() {
		enabledSchemes.FillAll()
	}
	errMsg := "failed to set up controllers"
	for _, s := range enabledSchemes {
		setupFunc, supported := controllerv1.SupportedSchemeReconciler[s]
		if !supported {
			setupLog.Error(errors.New(errMsg), "scheme is not supported", "scheme", s)
			os.Exit(1)
		}
		if err := setupFunc(mgr, gangSchedulingSetupFunc, controllerThreads); err != nil {
			setupLog.Error(errors.New(errMsg), "unable to create controller", "scheme", s)
			os.Exit(1)
		}
	}
}
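The --enable-scheme flag above is registered through flag.Var, so controllerv1.EnabledSchemes has to satisfy the standard library's flag.Value interface (String and Set), which is what allows the flag to be passed repeatedly. The snippet below is a minimal, self-contained sketch of that pattern; the enabledSchemes type, its validation, and the lower-casing are illustrative assumptions, not the actual implementation in pkg/controller.v1.

package main

import (
	"flag"
	"fmt"
	"strings"
)

// enabledSchemes collects repeated --enable-scheme values, mirroring how a
// flag.Value-based list flag behaves. Illustrative only; the real
// controllerv1.EnabledSchemes may differ.
type enabledSchemes []string

func (e *enabledSchemes) String() string { return strings.Join(*e, ",") }

func (e *enabledSchemes) Set(v string) error {
	v = strings.ToLower(strings.TrimSpace(v))
	if v == "" {
		return fmt.Errorf("scheme must not be empty")
	}
	*e = append(*e, v)
	return nil
}

func main() {
	var schemes enabledSchemes
	flag.Var(&schemes, "enable-scheme", "Enable scheme(s); may be repeated.")
	flag.Parse()
	fmt.Println("enabled schemes:", schemes)
}

Running this sketch with go run main.go --enable-scheme=tfjob --enable-scheme=pytorchjob prints "enabled schemes: [tfjob pytorchjob]".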