sigs.k8s.io/kueue@v0.6.2/apis/config/v1beta1/configuration_types.go (about)

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package v1beta1
    18  
    19  import (
    20  	"time"
    21  
    22  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    23  	configv1alpha1 "k8s.io/component-base/config/v1alpha1"
    24  )
    25  
    26  // +k8s:defaulter-gen=true
    27  // +kubebuilder:object:root=true
    28  
    29  // Configuration is the Schema for the kueueconfigurations API.
        // It embeds ControllerManager inline and adds optional, pointer-typed
        // sections that are left out of the serialized form when nil.
    30  type Configuration struct {
    31  	metav1.TypeMeta `json:",inline"`
    32  
    33  	// Namespace is the namespace in which kueue is deployed. It is used as part of DNSName of the webhook Service.
    34  	// If not set, the value is read from the file /var/run/secrets/kubernetes.io/serviceaccount/namespace.
    35  	// If the file doesn't exist, the default value is kueue-system.
    36  	Namespace *string `json:"namespace,omitempty"`
    37  
    38  	// ControllerManager holds the configuration of the controller manager.
        	// It is embedded, so its fields appear at the top level of the serialized form.
    39  	ControllerManager `json:",inline"`
    40  
    41  	// ManageJobsWithoutQueueName controls whether or not Kueue reconciles
    42  	// batch/v1.Jobs that don't set the annotation kueue.x-k8s.io/queue-name.
    43  	// If set to true, then those jobs will be suspended and never started unless
    44  	// they are assigned a queue and eventually admitted. This also applies to
    45  	// jobs created before starting the kueue controller.
    46  	// Defaults to false; therefore, those jobs are not managed and if they are created
    47  	// unsuspended, they will start immediately.
        	// The JSON tag has no omitempty, so the field is always written when serializing.
    48  	ManageJobsWithoutQueueName bool `json:"manageJobsWithoutQueueName"`
    49  
    50  	// InternalCertManagement configures Kueue's internal management of the
        	// webhook certificates (enable flag, webhook Service and Secret names).
    51  	InternalCertManagement *InternalCertManagement `json:"internalCertManagement,omitempty"`
    52  
    53  	// WaitForPodsReady is configuration to provide simple all-or-nothing
    54  	// scheduling semantics for jobs to ensure they get resources assigned.
    55  	// This is achieved by blocking the start of new jobs until the previously
    56  	// started job has all pods running (ready).
    57  	WaitForPodsReady *WaitForPodsReady `json:"waitForPodsReady,omitempty"`
    58  
    59  	// ClientConnection provides additional configuration options for Kubernetes
    60  	// API server client.
    61  	ClientConnection *ClientConnection `json:"clientConnection,omitempty"`
    62  
    63  	// Integrations provide configuration options for AI/ML/Batch frameworks
    64  	// integrations (including K8S job).
    65  	Integrations *Integrations `json:"integrations,omitempty"`
    66  
    67  	// QueueVisibility is configuration to expose the information about the top
    68  	// pending workloads.
    69  	QueueVisibility *QueueVisibility `json:"queueVisibility,omitempty"`
    70  
    71  	// MultiKueue controls the behaviour of the MultiKueue AdmissionCheck Controller.
    72  	MultiKueue *MultiKueue `json:"multiKueue,omitempty"`
    73  }
    74  
    // ControllerManager groups the controller manager settings: webhook server,
    // leader election, metrics, health probes, pprof and per-controller options.
    75  type ControllerManager struct {
    76  	// Webhook contains the controllers webhook configuration.
    77  	// +optional
    78  	Webhook ControllerWebhook `json:"webhook,omitempty"`
    79  
    80  	// LeaderElection is the LeaderElection config to be used when configuring
    81  	// the manager.Manager leader election.
    82  	// +optional
    83  	LeaderElection *configv1alpha1.LeaderElectionConfiguration `json:"leaderElection,omitempty"`
    84  
    85  	// Metrics contains the controller metrics configuration.
    86  	// +optional
    87  	Metrics ControllerMetrics `json:"metrics,omitempty"`
    88  
    89  	// Health contains the controller health configuration.
    90  	// +optional
    91  	Health ControllerHealth `json:"health,omitempty"`
    92  
    93  	// PprofBindAddress is the TCP address that the controller should bind to
    94  	// for serving pprof.
    95  	// It can be set to "" or "0" to disable the pprof serving.
    96  	// Since pprof may contain sensitive information, make sure to protect it
    97  	// before exposing it to public.
    98  	// +optional
    99  	PprofBindAddress string `json:"pprofBindAddress,omitempty"`
   100  
   101  	// Controller contains global configuration options for controllers
   102  	// registered within this manager.
   103  	// +optional
   104  	Controller *ControllerConfigurationSpec `json:"controller,omitempty"`
   105  }
   106  
   107  // ControllerWebhook defines the webhook server for the controller.
   108  type ControllerWebhook struct {
   109  	// Port is the port that the webhook server serves at.
   110  	// It is used to set webhook.Server.Port.
   111  	// +optional
   112  	Port *int `json:"port,omitempty"`
   113  
   114  	// Host is the hostname that the webhook server binds to.
   115  	// It is used to set webhook.Server.Host.
   116  	// +optional
   117  	Host string `json:"host,omitempty"`
   118  
   119  	// CertDir is the directory that contains the server key and certificate.
   120  	// If not set, the webhook server looks up the server key and certificate in
   121  	// {TempDir}/k8s-webhook-server/serving-certs. The server key and certificate
   122  	// must be named tls.key and tls.crt, respectively.
   123  	// +optional
   124  	CertDir string `json:"certDir,omitempty"`
   125  }
   126  
   127  // ControllerMetrics defines the metrics configs.
   128  type ControllerMetrics struct {
   129  	// BindAddress is the TCP address that the controller should bind to
   130  	// for serving prometheus metrics.
   131  	// It can be set to "0" to disable the metrics serving.
   132  	// +optional
   133  	BindAddress string `json:"bindAddress,omitempty"`
   134  
   135  	// EnableClusterQueueResources, if true, causes the cluster queue resource
   136  	// usage and quota metrics to be reported.
   137  	// +optional
   138  	EnableClusterQueueResources bool `json:"enableClusterQueueResources,omitempty"`
   139  }
   140  
   141  // ControllerHealth defines the health configs.
   142  type ControllerHealth struct {
   143  	// HealthProbeBindAddress is the TCP address that the controller should bind to
   144  	// for serving health probes.
   145  	// It can be set to "0" or "" to disable serving the health probe.
   146  	// +optional
   147  	HealthProbeBindAddress string `json:"healthProbeBindAddress,omitempty"`
   148  
   149  	// ReadinessEndpointName is the name of the readiness endpoint; defaults to "readyz".
   150  	// +optional
   151  	ReadinessEndpointName string `json:"readinessEndpointName,omitempty"`
   152  
   153  	// LivenessEndpointName is the name of the liveness endpoint; defaults to "healthz".
   154  	// +optional
   155  	LivenessEndpointName string `json:"livenessEndpointName,omitempty"`
   156  }
   157  
   158  // ControllerConfigurationSpec defines the global configuration for
   159  // controllers registered with the manager.
   160  type ControllerConfigurationSpec struct {
   161  	// GroupKindConcurrency is a map from a Kind to the number of concurrent reconciliations
   162  	// allowed for that controller.
   163  	//
   164  	// When a controller is registered within this manager using the builder utilities,
   165  	// users have to specify the type the controller reconciles in the For(...) call.
   166  	// If the object's kind passed matches one of the keys in this map, the concurrency
   167  	// for that controller is set to the number specified.
   168  	//
   169  	// The key is expected to be consistent in form with GroupKind.String(),
   170  	// e.g. ReplicaSet in apps group (regardless of version) would be `ReplicaSet.apps`.
   171  	//
   172  	// +optional
   173  	GroupKindConcurrency map[string]int `json:"groupKindConcurrency,omitempty"`
   174  
   175  	// CacheSyncTimeout refers to the time limit set to wait for syncing caches.
   176  	// Defaults to 2 minutes if not set.
        	// NOTE(review): this is a plain time.Duration, which encoding/json encodes as
        	// an integer number of nanoseconds, unlike the metav1.Duration fields in this
        	// file - confirm that configuration files specify it that way.
   177  	// +optional
   178  	CacheSyncTimeout *time.Duration `json:"cacheSyncTimeout,omitempty"`
   179  }
   180  
   // WaitForPodsReady configures admission that waits for admitted workloads to
   // reach the PodsReady condition, including the timeout and requeuing strategy.
   181  type WaitForPodsReady struct {
   182  	// Enable when true, indicates that each admitted workload
   183  	// blocks the admission of all other workloads from all queues until it is in the
   184  	// `PodsReady` condition. If false, all workloads start as soon as they are
   185  	// admitted and do not block admission of other workloads. The PodsReady
   186  	// condition is only added if this setting is enabled. It defaults to false.
   187  	Enable bool `json:"enable,omitempty"`
   188  
   189  	// Timeout defines the time for an admitted workload to reach the
   190  	// PodsReady=true condition. When the timeout is reached, the workload admission
   191  	// is cancelled and requeued in the same cluster queue. Defaults to 5min.
   192  	// +optional
   193  	Timeout *metav1.Duration `json:"timeout,omitempty"`
   194  
   195  	// BlockAdmission when true, cluster queue will block admissions for all subsequent jobs
   196  	// until the jobs reach the PodsReady=true condition. It defaults to false if Enable is false
   197  	// and defaults to true otherwise.
   198  	BlockAdmission *bool `json:"blockAdmission,omitempty"`
   199  
   200  	// RequeuingStrategy defines the strategy for requeuing a Workload.
   201  	// +optional
   202  	RequeuingStrategy *RequeuingStrategy `json:"requeuingStrategy,omitempty"`
   203  }
   204  
   // MultiKueue holds the settings of the MultiKueue AdmissionCheck controller:
   // the garbage collection interval and the origin label value.
   205  type MultiKueue struct {
   206  	// GCInterval defines the time interval between two consecutive garbage collection runs.
   207  	// Defaults to 1min. If 0, the garbage collection is disabled.
        	// Being optional, a nil value is omitted when serializing (omitempty),
        	// consistent with the other optional pointer fields in this file.
   208  	// +optional
   209  	GCInterval *metav1.Duration `json:"gcInterval,omitempty"`
   210  
   211  	// Origin defines a label value used to track the creator of workloads in the worker
   212  	// clusters.
   213  	// This is used by multikueue in components like its garbage collector to identify
   214  	// remote objects that were created by this multikueue manager cluster and delete
   215  	// them if their local counterpart no longer exists.
   216  	// +optional
   217  	Origin *string `json:"origin,omitempty"`
   218  }
   219  
   // RequeuingStrategy controls how a Workload evicted due to Pod readiness is
   // re-queued: which timestamp is used and how many retries are allowed.
   220  type RequeuingStrategy struct {
   221  	// Timestamp defines the timestamp used for re-queuing a Workload
   222  	// that was evicted due to Pod readiness. The possible values are:
   223  	//
   224  	// - `Eviction` (default) indicates from Workload `Evicted` condition with `PodsReadyTimeout` reason.
   225  	// - `Creation` indicates from Workload .metadata.creationTimestamp.
   226  	//
   227  	// +optional
   228  	Timestamp *RequeuingTimestamp `json:"timestamp,omitempty"`
   229  
   230  	// BackoffLimitCount defines the maximum number of re-queuing retries.
   231  	// Once the number is reached, the workload is deactivated (`.spec.activate`=`false`).
        	// NOTE(review): the Workload field is presumably `.spec.active` - confirm.
   232  	// When it is null, the workloads are re-queued repeatedly and endlessly.
   233  	//
   234  	// Every backoff duration is about "1.41284738^(n-1)+Rand" where the "n" represents the "workloadStatus.requeueState.count",
   235  	// and the "Rand" represents the random jitter. During this time, the workload is taken as an inadmissible and
   236  	// other workloads will have a chance to be admitted.
   237  	// For example, when the "waitForPodsReady.timeout" is the default, the workload deactivation time is as follows:
   238  	//   {backoffLimitCount, workloadDeactivationSeconds}
   239  	//     ~= {1, 601}, {2, 902}, ...,{5, 1811}, ...,{10, 3374}, ...,{20, 8730}, ...,{30, 86400(=24 hours)}, ...
   240  	//
   241  	// Defaults to null.
   242  	// +optional
   243  	BackoffLimitCount *int32 `json:"backoffLimitCount,omitempty"`
   244  }
   245  
   // RequeuingTimestamp names the Workload timestamp used when re-queuing; the
   // values declared in this file are "Creation" and "Eviction".
   246  type RequeuingTimestamp string
   247  
   248  const (
   249  	// CreationTimestamp selects the Workload's .metadata.creationTimestamp
        	// as the re-queuing timestamp.
   250  	CreationTimestamp RequeuingTimestamp = "Creation"
   251  
   252  	// EvictionTimestamp selects the timestamp taken from the Workload's
        	// .status.conditions as the re-queuing timestamp.
   253  	EvictionTimestamp RequeuingTimestamp = "Eviction"
   254  )
   255  
   // InternalCertManagement configures Kueue's internal management of the webhook
   // certificates, including the Service and Secret names it uses.
   256  type InternalCertManagement struct {
   257  	// Enable controls whether to enable internal cert management or not.
   258  	// Defaults to true. If you want to use a third-party management, e.g. cert-manager,
   259  	// set it to false. See the user guide for more information.
   260  	Enable *bool `json:"enable,omitempty"`
   261  
   262  	// WebhookServiceName is the name of the Service used as part of the DNSName.
   263  	// Defaults to kueue-webhook-service.
   264  	WebhookServiceName *string `json:"webhookServiceName,omitempty"`
   265  
   266  	// WebhookSecretName is the name of the Secret used to store CA and server certs.
   267  	// Defaults to kueue-webhook-server-cert.
   268  	WebhookSecretName *string `json:"webhookSecretName,omitempty"`
   269  }
   270  
   // ClientConnection holds the rate limiting options (QPS and Burst) for the
   // connection to the Kubernetes API server.
   271  type ClientConnection struct {
   272  	// QPS controls the number of queries per second allowed for K8S api server
   273  	// connection.
   274  	QPS *float32 `json:"qps,omitempty"`
   275  
   276  	// Burst allows extra queries to accumulate when a client is exceeding its rate.
   277  	Burst *int32 `json:"burst,omitempty"`
   278  }
   279  
   // Integrations lists the job frameworks to enable and holds the options that
   // apply to pod objects.
   280  type Integrations struct {
   281  	// Frameworks is the list of framework names to be enabled.
   282  	// Possible options:
   283  	//  - "batch/job"
   284  	//  - "kubeflow.org/mpijob"
   285  	//  - "ray.io/rayjob"
   286  	//  - "ray.io/raycluster"
   287  	//  - "jobset.x-k8s.io/jobset"
   288  	//  - "kubeflow.org/mxjob"
   289  	//  - "kubeflow.org/paddlejob"
   290  	//  - "kubeflow.org/pytorchjob"
   291  	//  - "kubeflow.org/tfjob"
   292  	//  - "kubeflow.org/xgboostjob"
   293  	//  - "pod"
   294  	Frameworks []string `json:"frameworks,omitempty"`
   295  	// PodOptions defines kueue controller behaviour for pod objects.
   296  	PodOptions *PodIntegrationOptions `json:"podOptions,omitempty"`
   297  }
   298  
   // PodIntegrationOptions holds the label selectors that scope which namespaces
   // and pods are considered for pod reconciliation.
   299  type PodIntegrationOptions struct {
   300  	// NamespaceSelector can be used to omit some namespaces from pod reconciliation.
   301  	NamespaceSelector *metav1.LabelSelector `json:"namespaceSelector,omitempty"`
   302  	// PodSelector can be used to choose what pods to reconcile.
   303  	PodSelector *metav1.LabelSelector `json:"podSelector,omitempty"`
   304  }
   305  
   // QueueVisibility configures how information about the top pending workloads
   // is exposed and how often it is updated.
   306  type QueueVisibility struct {
   307  	// ClusterQueues is configuration to expose the information
   308  	// about the top pending workloads in the cluster queue.
   309  	ClusterQueues *ClusterQueueVisibility `json:"clusterQueues,omitempty"`
   310  
   311  	// UpdateIntervalSeconds specifies the time interval for updates to the structure
   312  	// of the top pending workloads in the queues.
   313  	// The minimum value is 1.
   314  	// Defaults to 5.
   315  	UpdateIntervalSeconds int32 `json:"updateIntervalSeconds,omitempty"`
   316  }
   317  
   // ClusterQueueVisibility configures how many pending workloads are exposed in
   // the cluster queue status.
   318  type ClusterQueueVisibility struct {
   319  	// MaxCount indicates the maximal number of pending workloads exposed in the
   320  	// cluster queue status.  When the value is set to 0, then ClusterQueue
   321  	// visibility updates are disabled.
   322  	// The maximal value is 4000.
   323  	// Defaults to 10.
        	// NOTE(review): MaxCount is a non-pointer int32, so an explicit 0 and an
        	// omitted value decode identically; how the default of 10 is applied is
        	// not visible in this file - confirm in the defaulting code.
   324  	MaxCount int32 `json:"maxCount,omitempty"`
   325  }