github.com/kubeflow/training-operator@v1.7.0/cmd/training-operator.v1/main.go (about)

     1  /*
     2  Copyright 2021.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package main
    18  
    19  import (
    20  	"errors"
    21  	"flag"
    22  	"os"
    23  	"strings"
    24  
    25  	"go.uber.org/zap/zapcore"
    26  	"k8s.io/apimachinery/pkg/runtime"
    27  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    28  	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
    29  	_ "k8s.io/client-go/plugin/pkg/client/auth"
    30  	ctrl "sigs.k8s.io/controller-runtime"
    31  	"sigs.k8s.io/controller-runtime/pkg/healthz"
    32  	"sigs.k8s.io/controller-runtime/pkg/log/zap"
    33  	schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
    34  	"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
    35  	volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned"
    36  
    37  	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    38  	"github.com/kubeflow/training-operator/pkg/config"
    39  	controllerv1 "github.com/kubeflow/training-operator/pkg/controller.v1"
    40  	"github.com/kubeflow/training-operator/pkg/controller.v1/common"
    41  	//+kubebuilder:scaffold:imports
    42  )
    43  
    44  const (
    45  	// EnvKubeflowNamespace is a environment variable for namespace when deployed on kubernetes
    46  	EnvKubeflowNamespace = "KUBEFLOW_NAMESPACE"
    47  )
    48  
    49  var (
    50  	scheme   = runtime.NewScheme()
    51  	setupLog = ctrl.Log.WithName("setup")
    52  )
    53  
    54  func init() {
    55  	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
    56  	utilruntime.Must(kubeflowv1.AddToScheme(scheme))
    57  	utilruntime.Must(v1beta1.AddToScheme(scheme))
    58  	utilruntime.Must(schedulerpluginsv1alpha1.AddToScheme(scheme))
    59  	//+kubebuilder:scaffold:scheme
    60  }
    61  
    62  func main() {
    63  	var metricsAddr string
    64  	var enableLeaderElection bool
    65  	var leaderElectionID string
    66  	var probeAddr string
    67  	var enabledSchemes controllerv1.EnabledSchemes
    68  	var gangSchedulerName string
    69  	var namespace string
    70  	var monitoringPort int
    71  	var controllerThreads int
    72  	flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
    73  	flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
    74  	flag.BoolVar(&enableLeaderElection, "leader-elect", false,
    75  		"Enable leader election for controller manager. "+
    76  			"Enabling this will ensure there is only one active controller manager.")
    77  	flag.StringVar(&leaderElectionID, "leader-election-id", "1ca428e5.training-operator.kubeflow.org", "The ID for leader election.")
    78  	flag.Var(&enabledSchemes, "enable-scheme", "Enable scheme(s) as --enable-scheme=tfjob --enable-scheme=pytorchjob, case insensitive."+
    79  		" Now supporting TFJob, PyTorchJob, MXNetJob, XGBoostJob, PaddleJob. By default, all supported schemes will be enabled.")
    80  	flag.StringVar(&gangSchedulerName, "gang-scheduler-name", "", "Now Supporting volcano and scheduler-plugins."+
    81  		" Note: If you set another scheduler name, the training-operator assumes it's the scheduler-plugins.")
    82  	flag.StringVar(&namespace, "namespace", os.Getenv(EnvKubeflowNamespace), "The namespace to monitor kubeflow jobs. If unset, it monitors all namespaces cluster-wide."+
    83  		"If set, it only monitors kubeflow jobs in the given namespace.")
    84  	flag.IntVar(&monitoringPort, "monitoring-port", 9443, "Endpoint port for displaying monitoring metrics. "+
    85  		"It can be set to \"0\" to disable the metrics serving.")
    86  	flag.IntVar(&controllerThreads, "controller-threads", 1, "Number of worker threads used by the controller.")
    87  
    88  	// PyTorch related flags
    89  	flag.StringVar(&config.Config.PyTorchInitContainerImage, "pytorch-init-container-image",
    90  		config.PyTorchInitContainerImageDefault, "The image for pytorch init container")
    91  	flag.StringVar(&config.Config.PyTorchInitContainerTemplateFile, "pytorch-init-container-template-file",
    92  		config.PyTorchInitContainerTemplateFileDefault, "The template file for pytorch init container")
    93  	flag.IntVar(&config.Config.PyTorchInitContainerMaxTries, "pytorch-init-container-max-tries",
    94  		config.PyTorchInitContainerMaxTriesDefault, "The number of tries for the pytorch init container")
    95  
    96  	// MPI related flags
    97  	flag.StringVar(&config.Config.MPIKubectlDeliveryImage, "mpi-kubectl-delivery-image",
    98  		config.MPIKubectlDeliveryImageDefault, "The image for mpi launcher init container")
    99  
   100  	opts := zap.Options{
   101  		Development:     true,
   102  		StacktraceLevel: zapcore.DPanicLevel,
   103  	}
   104  	opts.BindFlags(flag.CommandLine)
   105  	flag.Parse()
   106  
   107  	ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
   108  
   109  	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
   110  		Scheme:                 scheme,
   111  		MetricsBindAddress:     metricsAddr,
   112  		Port:                   monitoringPort,
   113  		HealthProbeBindAddress: probeAddr,
   114  		LeaderElection:         enableLeaderElection,
   115  		LeaderElectionID:       leaderElectionID,
   116  		Namespace:              namespace,
   117  	})
   118  	if err != nil {
   119  		setupLog.Error(err, "unable to start manager")
   120  		os.Exit(1)
   121  	}
   122  
   123  	// Set up controllers using goroutines to start the manager quickly.
   124  	go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads)
   125  
   126  	//+kubebuilder:scaffold:builder
   127  
   128  	if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
   129  		setupLog.Error(err, "unable to set up health check")
   130  		os.Exit(1)
   131  	}
   132  	if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
   133  		setupLog.Error(err, "unable to set up ready check")
   134  		os.Exit(1)
   135  	}
   136  
   137  	setupLog.Info("starting manager")
   138  	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
   139  		setupLog.Error(err, "problem running manager")
   140  		os.Exit(1)
   141  	}
   142  }
   143  
   144  func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int) {
   145  	setupLog.Info("registering controllers...")
   146  
   147  	// Prepare GangSchedulingSetupFunc
   148  	gangSchedulingSetupFunc := common.GenNonGangSchedulerSetupFunc()
   149  	if strings.EqualFold(gangSchedulerName, string(common.GangSchedulerVolcano)) {
   150  		cfg := mgr.GetConfig()
   151  		volcanoClientSet := volcanoclient.NewForConfigOrDie(cfg)
   152  		gangSchedulingSetupFunc = common.GenVolcanoSetupFunc(volcanoClientSet)
   153  	} else if gangSchedulerName != "" {
   154  		gangSchedulingSetupFunc = common.GenSchedulerPluginsSetupFunc(mgr.GetClient(), gangSchedulerName)
   155  	}
   156  
   157  	// TODO: We need a general manager. all rest reconciler addsToManager
   158  	// Based on the user configuration, we start different controllers
   159  	if enabledSchemes.Empty() {
   160  		enabledSchemes.FillAll()
   161  	}
   162  	errMsg := "failed to set up controllers"
   163  	for _, s := range enabledSchemes {
   164  		setupFunc, supported := controllerv1.SupportedSchemeReconciler[s]
   165  		if !supported {
   166  			setupLog.Error(errors.New(errMsg), "scheme is not supported", "scheme", s)
   167  			os.Exit(1)
   168  		}
   169  		if err := setupFunc(mgr, gangSchedulingSetupFunc, controllerThreads); err != nil {
   170  			setupLog.Error(errors.New(errMsg), "unable to create controller", "scheme", s)
   171  			os.Exit(1)
   172  		}
   173  	}
   174  }