github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/tide/tide.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package tide 18 19 import ( 20 "context" 21 "fmt" 22 "reflect" 23 "sort" 24 "time" 25 26 corev1 "k8s.io/api/core/v1" 27 "k8s.io/apimachinery/pkg/api/errors" 28 "k8s.io/apimachinery/pkg/api/resource" 29 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 30 "k8s.io/apimachinery/pkg/labels" 31 "k8s.io/apimachinery/pkg/types" 32 "k8s.io/apimachinery/pkg/util/intstr" 33 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 34 "k8s.io/apimachinery/pkg/util/wait" 35 "k8s.io/autoscaler/cluster-autoscaler/simulator" 36 corelisters "k8s.io/client-go/listers/core/v1" 37 "k8s.io/client-go/tools/cache" 38 "k8s.io/client-go/util/workqueue" 39 "k8s.io/klog/v2" 40 podv1 "k8s.io/kubernetes/pkg/api/v1/pod" 41 "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" 42 nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node" 43 44 apis "github.com/kubewharf/katalyst-api/pkg/apis/tide/v1alpha1" 45 listers "github.com/kubewharf/katalyst-api/pkg/client/listers/tide/v1alpha1" 46 katalystbase "github.com/kubewharf/katalyst-core/cmd/base" 47 "github.com/kubewharf/katalyst-core/pkg/client" 48 "github.com/kubewharf/katalyst-core/pkg/config/controller" 49 "github.com/kubewharf/katalyst-core/pkg/config/generic" 50 "github.com/kubewharf/katalyst-core/pkg/metrics" 51 ) 52 53 const ( 54 tideControllerName = "tide" 55 tideCycleWorkerCount = 1 56 ) 57 58 const ( 59 tidePeriod = 10 * time.Second 60 ) 61 62 type OnlinePodChecker func(pod *corev1.Pod) bool 63 64 type NodeInfo struct { 65 NodeUsage 66 } 67 68 // NodeUsage stores a node's info, pods on it, thresholds and its resource usage 69 type NodeUsage struct { 70 node *corev1.Node 71 usage map[corev1.ResourceName]*resource.Quantity 72 allPods []*corev1.Pod 73 } 74 75 type Tide struct { 76 ctx context.Context 77 78 client *client.GenericClientSet 79 80 checker simulator.PredicateChecker 81 82 nodeListerSynced cache.InformerSynced 83 nodeLister corelisters.NodeLister 84 podListerSynced cache.InformerSynced 85 podLister corelisters.PodLister 86 tideListerSynced cache.InformerSynced 87 tideLister listers.TideNodePoolLister 88 89 // queue for node 90 syncQueue workqueue.RateLimitingInterface 91 92 // metricsEmitter for emit metrics 93 metricsEmitter metrics.MetricEmitter 94 } 95 96 func NewTide(ctx context.Context, 97 controlCtx *katalystbase.GenericContext, 98 _ *generic.GenericConfiguration, 99 _ *controller.GenericControllerConfiguration, 100 ) (*Tide, error) { 101 tide := &Tide{ 102 ctx: ctx, 103 client: controlCtx.Client, 104 syncQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), 105 tideControllerName), 106 } 107 checker, err := simulator.NewSchedulerBasedPredicateChecker(controlCtx.Client.KubeClient, ctx.Done()) 108 if err != nil { 109 return nil, err 110 } 111 tide.checker = checker 112 113 controlCtx.KubeInformerFactory.Core().V1().Nodes().Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 114 AddFunc: tide.addNodeEventHandle, 115 UpdateFunc: tide.updateNodeEventHandle, 116 }) 117 tide.nodeListerSynced = controlCtx.KubeInformerFactory.Core().V1().Nodes().Informer().HasSynced 118 tide.nodeLister = controlCtx.KubeInformerFactory.Core().V1().Nodes().Lister() 119 120 tide.podListerSynced = controlCtx.KubeInformerFactory.Core().V1().Pods().Informer().HasSynced 121 tide.podLister = controlCtx.KubeInformerFactory.Core().V1().Pods().Lister() 122 123 controlCtx.InternalInformerFactory.Tide() 124 controlCtx.InternalInformerFactory.Tide().V1alpha1().TideNodePools().Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 125 AddFunc: tide.addTideNodePoolEventHandle, 126 UpdateFunc: tide.updateTideNodePoolEventHandle, 127 DeleteFunc: tide.deleteTideNodePoolEventHandle, 128 }) 129 tide.tideLister = controlCtx.InternalInformerFactory.Tide().V1alpha1().TideNodePools().Lister() 130 tide.tideListerSynced = controlCtx.InternalInformerFactory.Tide().V1alpha1().TideNodePools().Informer().HasSynced 131 132 tide.metricsEmitter = controlCtx.EmitterPool.GetDefaultMetricsEmitter() 133 134 return tide, nil 135 } 136 137 func (t *Tide) Run() { 138 defer utilruntime.HandleCrash() 139 defer t.syncQueue.ShutDown() 140 141 defer klog.Infof("Shutting down %s controller", tideControllerName) 142 143 if !cache.WaitForCacheSync(t.ctx.Done(), t.nodeListerSynced, t.tideListerSynced, t.podListerSynced) { 144 utilruntime.HandleError(fmt.Errorf("unable to sync caches for %s controller", tideControllerName)) 145 return 146 } 147 klog.Infof("Caches are synced for %s controller", tideControllerName) 148 klog.Infof("start %d workers for %s controller", tideCycleWorkerCount, tideControllerName) 149 150 go wait.Until(t.periodSync, tidePeriod, t.ctx.Done()) 151 for i := 0; i < tideCycleWorkerCount; i++ { 152 go wait.Until(t.worker, time.Second, t.ctx.Done()) 153 } 154 155 <-t.ctx.Done() 156 } 157 158 func (t *Tide) addNodeEventHandle(obj interface{}) { 159 tideNodePoolList, err := t.tideLister.List(labels.Everything()) 160 if err != nil { 161 klog.Errorf("list tide hybrid node pool failed: %v", err) 162 return 163 } 164 n, ok := obj.(*corev1.Node) 165 if !ok { 166 klog.Errorf("cannot convert obj to *apis.TideNodePool: %v", obj) 167 return 168 } 169 for _, tideNodePool := range tideNodePoolList { 170 if labels.SelectorFromSet(tideNodePool.Spec.NodeConfigs.NodeSelector). 171 Matches(labels.Set(n.GetLabels())) { 172 klog.Infof("start to sync node pool, name: %s", tideNodePool.Name) 173 t.enqueueWorkItem(tideNodePool) 174 } 175 } 176 } 177 178 func (t *Tide) updateNodeEventHandle(old, cur interface{}) { 179 tideNodePoolList, err := t.tideLister.List(labels.Everything()) 180 if err != nil { 181 klog.Errorf("list tide hybrid node pool failed: %v", err) 182 return 183 } 184 n, ok := old.(*corev1.Node) 185 if !ok { 186 klog.Errorf("cannot convert obj to *apis.TideNodePool: %v", old) 187 return 188 } 189 for _, tideNodePool := range tideNodePoolList { 190 if labels.SelectorFromSet(tideNodePool.Spec.NodeConfigs.NodeSelector). 191 Matches(labels.Set(n.GetLabels())) { 192 klog.Infof("start to sync node pool, name: %s", tideNodePool.Name) 193 t.enqueueWorkItem(tideNodePool) 194 } 195 } 196 } 197 198 func (t *Tide) addTideNodePoolEventHandle(obj interface{}) { 199 c, ok := obj.(*apis.TideNodePool) 200 if !ok { 201 klog.Errorf("cannot convert obj to *apis.TideNodePool: %v", obj) 202 return 203 } 204 klog.V(4).Infof("notice addition of tide node pool %s", c.Name) 205 206 t.enqueueWorkItem(obj) 207 } 208 209 func (t *Tide) updateTideNodePoolEventHandle(_, new interface{}) { 210 c, ok := new.(*apis.TideNodePool) 211 if !ok { 212 klog.Errorf("cannot convert oldObj to *apis.TideNodePool: %v", c) 213 return 214 } 215 klog.V(4).Infof("notice addition of tide node pool %s", c.Name) 216 217 t.enqueueWorkItem(new) 218 } 219 220 func (t *Tide) deleteTideNodePoolEventHandle(obj interface{}) { 221 c, ok := obj.(*apis.TideNodePool) 222 if !ok { 223 klog.Errorf("cannot convert oldObj to *apis.TideNodePool: %v", c) 224 return 225 } 226 klog.V(4).Infof("notice addition of tide node pool %s", c.Name) 227 228 t.enqueueWorkItem(obj) 229 } 230 231 func (t *Tide) worker() { 232 for t.processNextWorkItem(context.Background()) { 233 } 234 } 235 236 // processNextWorkItem dequeues items, processes them, and marks them done. 237 // It enforces that the sync is never invoked concurrently with the same key. 238 func (t *Tide) processNextWorkItem(ctx context.Context) bool { 239 key, quit := t.syncQueue.Get() 240 if quit { 241 return false 242 } 243 defer t.syncQueue.Done(key) 244 245 err := t.sync(ctx, key.(string)) 246 if err == nil { 247 t.syncQueue.Forget(key) 248 return true 249 } 250 251 utilruntime.HandleError(fmt.Errorf("sync %q failed with %v", key, err)) 252 t.syncQueue.AddRateLimited(key) 253 254 return true 255 } 256 257 // enqueueWorkItem enqueues the given node in the work queue. 258 func (t *Tide) enqueueWorkItem(obj interface{}) { 259 key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj) 260 if err != nil { 261 utilruntime.HandleError(fmt.Errorf("Cound't get key for object %+v: %v", obj, err)) 262 return 263 } 264 t.syncQueue.Add(key) 265 } 266 267 // sync syncs the given node. 268 func (t *Tide) sync(ctx context.Context, key string) error { 269 // TODO 270 _, name, err := cache.SplitMetaNamespaceKey(key) 271 if err != nil { 272 return err 273 } 274 tideNodePool, err := t.tideLister.Get(name) 275 if errors.IsNotFound(err) { 276 klog.Infof("node has been deleted %v", key) 277 return nil 278 } 279 if err != nil { 280 return err 281 } 282 283 err = t.Reconcile(ctx, tideNodePool.DeepCopy()) 284 if err != nil { 285 return err 286 } 287 288 return nil 289 } 290 291 func (t *Tide) reconcileDelete(ctx context.Context, tideNodePool *apis.TideNodePool) error { 292 nodes, err := t.nodeLister.List(labels.SelectorFromSet(map[string]string{LabelNodePoolKey: tideNodePool.Name})) 293 if err != nil { 294 klog.Errorf("fail to list nodes: %v", err) 295 return err 296 } 297 for _, node := range nodes { 298 if err := t.cleanNode(ctx, node.DeepCopy(), tideNodePool); err != nil { 299 return err 300 } 301 } 302 // Remove finalizer first 303 controllerutil.RemoveFinalizer(tideNodePool, NodePoolFinalizer) 304 _, err = t.client.InternalClient.TideV1alpha1().TideNodePools().Update(ctx, tideNodePool, metav1.UpdateOptions{}) 305 return err 306 } 307 308 func (t *Tide) cleanNode(ctx context.Context, node *corev1.Node, pool *apis.TideNodePool) error { 309 nodePoolWrapper := NewNodePoolWrapper(pool) 310 var foundIndexes []int 311 for i := range node.Spec.Taints { 312 if node.Spec.Taints[i].Key == nodePoolWrapper.GetEvictOnlinePodTaint().Key || 313 node.Spec.Taints[i].Key == nodePoolWrapper.GetEvictOfflinePodTaint().Key { 314 foundIndexes = append(foundIndexes, i) 315 } 316 } 317 if len(foundIndexes) >= 0 { 318 for i := len(foundIndexes) - 1; i >= 0; i-- { 319 s := foundIndexes[i] 320 node.Spec.Taints = append(node.Spec.Taints[:s], node.Spec.Taints[s+1:]...) 321 } 322 } 323 324 delete(node.Labels, nodePoolWrapper.GetOnlineLabel().Key) 325 delete(node.Labels, nodePoolWrapper.GetOfflineLabel().Key) 326 delete(node.Labels, nodePoolWrapper.GetTideLabel().Key) 327 delete(node.Labels, LabelReserveNode) 328 delete(node.Labels, LabelNodeTypeKey) 329 delete(node.Labels, LabelNodePoolKey) 330 _, err := t.client.KubeClient.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}) 331 return err 332 } 333 334 func (t *Tide) Reconcile(ctx context.Context, tideNodePool *apis.TideNodePool) error { 335 logger := klog.FromContext(ctx).WithValues("tideNodePool", tideNodePool.GetName()) 336 logger.V(2).Info("start Reconcile") 337 defer logger.V(2).Info("end Reconcile") 338 // Add finalizer first 339 if !controllerutil.ContainsFinalizer(tideNodePool, NodePoolFinalizer) && tideNodePool.DeletionTimestamp.IsZero() { 340 controllerutil.AddFinalizer(tideNodePool, NodePoolFinalizer) 341 tideNodePool, err := t.client.InternalClient.TideV1alpha1().TideNodePools().Update(ctx, tideNodePool, metav1.UpdateOptions{}) 342 if err != nil { 343 klog.ErrorS(err, "fail to add finalizer", "rule", tideNodePool.Name) 344 return err 345 } 346 } 347 // process deletion 348 if !tideNodePool.DeletionTimestamp.IsZero() { 349 return t.reconcileDelete(ctx, tideNodePool) 350 } 351 nodes, err := t.nodeLister.List(labels.SelectorFromSet(tideNodePool.Spec.NodeConfigs.NodeSelector)) 352 if err != nil { 353 klog.Errorf("fail to list nodes: %v", err) 354 return err 355 } 356 onlineNodesExpectCount, err := intstr.GetScaledValueFromIntOrPercent(tideNodePool.Spec.NodeConfigs.Reserve.Online, len(nodes), true) 357 if err != nil { 358 klog.Errorf("fail to get online nodes number: %v", err) 359 return err 360 } 361 offlineNodesExpectCount, err := intstr.GetScaledValueFromIntOrPercent(tideNodePool.Spec.NodeConfigs.Reserve.Offline, len(nodes), false) 362 if err != nil { 363 klog.Errorf("fail to get offline nodes number: %v", err) 364 return err 365 } 366 nodePoolWrapper := NewNodePoolWrapper(tideNodePool) 367 reserveOnlineNodes, reserveOfflineNodes, tideNodes, unknownNodes := classifyNodes(nodes, NewNodePoolWrapper(tideNodePool)) 368 onlineNodeCount, offlineNodeCount := len(reserveOnlineNodes), len(reserveOfflineNodes) 369 for i := 0; i < len(reserveOnlineNodes) && onlineNodeCount > onlineNodesExpectCount; i++ { 370 nodePoolWrapper.SetNodeToTide(reserveOnlineNodes[i]) 371 if _, err := t.client.KubeClient.CoreV1().Nodes().Update(ctx, reserveOnlineNodes[i], metav1.UpdateOptions{}); err != nil { 372 klog.Errorf("fail to convert online reserve nodes to tide: %v", err) 373 return err 374 } 375 onlineNodeCount-- 376 } 377 378 for i := 0; i < len(reserveOfflineNodes) && offlineNodeCount > offlineNodesExpectCount; i++ { 379 nodePoolWrapper.SetNodeToTide(reserveOfflineNodes[i]) 380 if _, err := t.client.KubeClient.CoreV1().Nodes().Update(ctx, reserveOfflineNodes[i], metav1.UpdateOptions{}); err != nil { 381 klog.Errorf("fail to convert offline reserve nodes to tide: %v", err) 382 return err 383 } 384 offlineNodeCount-- 385 } 386 387 for i := range unknownNodes { 388 if onlineNodeCount < onlineNodesExpectCount { 389 nodePoolWrapper.SetNodeToOnlineReserve(unknownNodes[i]) 390 if _, err := t.client.KubeClient.CoreV1().Nodes().Update(ctx, unknownNodes[i], metav1.UpdateOptions{}); err != nil { 391 klog.Errorf("fail to convert new nodes to reserve: %v", err) 392 return err 393 } 394 reserveOnlineNodes = append(reserveOnlineNodes, unknownNodes[i]) 395 onlineNodeCount++ 396 } else if offlineNodeCount < offlineNodesExpectCount { 397 nodePoolWrapper.SetNodeToOfflineReserve(unknownNodes[i]) 398 if _, err := t.client.KubeClient.CoreV1().Nodes().Update(ctx, unknownNodes[i], metav1.UpdateOptions{}); err != nil { 399 klog.Errorf("fail to convert new nodes to reserve: %v", err) 400 return err 401 } 402 reserveOfflineNodes = append(reserveOfflineNodes, unknownNodes[i]) 403 offlineNodeCount++ 404 } else { 405 nodePoolWrapper.SetNodeToTideOnline(unknownNodes[i]) 406 if _, err := t.client.KubeClient.CoreV1().Nodes().Update(ctx, unknownNodes[i], metav1.UpdateOptions{}); err != nil { 407 klog.Errorf("fail to convert offline reserve nodes to tide: %v", err) 408 return err 409 } 410 tideNodes = append(tideNodes, unknownNodes[i]) 411 } 412 } 413 414 if err := t.UpdateStatusByNodes(ctx, tideNodePool, reserveOnlineNodes, reserveOfflineNodes, tideNodes); err != nil { 415 return err 416 } 417 onlineLabelSet := labels.SelectorFromSet(map[string]string{LabelPodTypeKey: LabelOnlinePodValue}) 418 onlinePodChecker := func(pod *corev1.Pod) bool { 419 return onlineLabelSet.Matches(labels.Set(pod.GetLabels())) 420 } 421 422 if err := t.RunOnce(ctx, 423 onlinePodChecker, 424 nodePoolWrapper); err != nil { 425 klog.Errorf("try to balance node failed: %v", err) 426 return err 427 } 428 return nil 429 } 430 431 func (t *Tide) UpdateStatusByNodes(ctx context.Context, tideNodePool *apis.TideNodePool, reserveOnlineNodes, reserveOfflineNodes, tideNodes []*corev1.Node) error { 432 newTideNodePool := tideNodePool.DeepCopy() 433 434 var onlineNodeNames, offlineNodeNames, tideNodeNames []string 435 for i := range reserveOnlineNodes { 436 onlineNodeNames = append(onlineNodeNames, reserveOnlineNodes[i].Name) 437 } 438 for i := range reserveOfflineNodes { 439 offlineNodeNames = append(offlineNodeNames, reserveOfflineNodes[i].Name) 440 } 441 442 for i := range tideNodes { 443 tideNodeNames = append(tideNodeNames, tideNodes[i].Name) 444 } 445 446 sortNodeName(onlineNodeNames) 447 sortNodeName(offlineNodeNames) 448 sortNodeName(tideNodeNames) 449 newTideNodePool.Status.ReserveNodes.OnlineNodes = onlineNodeNames 450 newTideNodePool.Status.ReserveNodes.OfflineNodes = offlineNodeNames 451 newTideNodePool.Status.TideNodes.Nodes = tideNodeNames 452 if reflect.DeepEqual(newTideNodePool.Status, tideNodePool.Status) { 453 return nil 454 } 455 _, err := t.client.InternalClient.TideV1alpha1().TideNodePools().UpdateStatus(ctx, newTideNodePool, metav1.UpdateOptions{}) 456 return err 457 } 458 459 func sortNodeName(data []string) { 460 sort.SliceStable(data, func(i, j int) bool { 461 return data[i] > data[j] 462 }) 463 } 464 465 func classifyNodes(nodes []*corev1.Node, tideNodePool NodePoolWrapper) ( 466 reserveOnlineNodes []*corev1.Node, 467 reserveOfflineNodes []*corev1.Node, 468 tideNodes []*corev1.Node, 469 unknownNodes []*corev1.Node, 470 ) { 471 for i, node := range nodes { 472 nodeLabels := labels.Set(node.GetLabels()) 473 switch { 474 case tideNodePool.GetOnlineReserveNodeSelector().Matches(nodeLabels): 475 reserveOnlineNodes = append(reserveOnlineNodes, nodes[i].DeepCopy()) 476 case tideNodePool.GetOfflineReserveNodeSelector().Matches(nodeLabels): 477 reserveOfflineNodes = append(reserveOfflineNodes, nodes[i].DeepCopy()) 478 case tideNodePool.GetTideNodeSelector().Matches(nodeLabels): 479 tideNodes = append(tideNodes, nodes[i].DeepCopy()) 480 case !tideNodePool.GetNodePoolSelector().Matches(nodeLabels): 481 unknownNodes = append(unknownNodes, nodes[i].DeepCopy()) 482 default: 483 // do nothing 484 } 485 } 486 return 487 } 488 489 func (t *Tide) GetNodePoolInfo(nodes []*corev1.Node, onlinePodChecker OnlinePodChecker) (simulator.ClusterSnapshot, []*corev1.Pod, error) { 490 clusterSnapshot := simulator.NewBasicClusterSnapshot() 491 pods, err := t.podLister.List(labels.Everything()) 492 if err != nil { 493 return nil, nil, err 494 } 495 var pendingPods []*corev1.Pod 496 knownNodes := map[string]bool{} 497 for i := range nodes { 498 if err := clusterSnapshot.AddNode(nodes[i].DeepCopy()); err != nil { 499 return nil, nil, err 500 } 501 knownNodes[nodes[i].Name] = true 502 } 503 504 for i, pod := range pods { 505 if knownNodes[pod.Spec.NodeName] { 506 if err := clusterSnapshot.AddPod(pods[i].DeepCopy(), pod.Spec.NodeName); err != nil { 507 return nil, nil, err 508 } 509 } else if checkPendingOnlinePod(pods[i], onlinePodChecker) { 510 pendingPods = append(pendingPods, pods[i]) 511 } 512 } 513 return clusterSnapshot, pendingPods, nil 514 } 515 516 func checkPendingOnlinePod(pod *corev1.Pod, onlinePodChecker OnlinePodChecker) bool { 517 if !(pod.Spec.NodeName == "" && pod.Status.Phase != corev1.PodSucceeded && pod.Status.Phase != corev1.PodFailed) { 518 return false 519 } 520 _, condition := podv1.GetPodCondition(&pod.Status, corev1.PodScheduled) 521 if condition == nil { 522 return false 523 } 524 return condition.Status == corev1.ConditionFalse && condition.Reason == corev1.PodReasonUnschedulable && onlinePodChecker(pod) 525 } 526 527 func (t *Tide) RunOnce(ctx context.Context, onlinePodChecker OnlinePodChecker, tideNodePool NodePoolWrapper) error { 528 logger := klog.FromContext(ctx).WithValues("tideNodePool", tideNodePool.GetName()) 529 nodeList, err := t.nodeLister.List(labels.Everything()) 530 if err != nil { 531 return err 532 } 533 534 clusterSnapshot, pendingPods, err := t.GetNodePoolInfo(nodeList, onlinePodChecker) 535 if err != nil { 536 return err 537 } 538 // assuming that the online business is pending 539 // prioritizing the resolution of the online business pending issue is recommended 540 if len(pendingPods) != 0 { 541 offlineNodesInfos, err := getNodeUsageWithSelector(clusterSnapshot, []corev1.ResourceName{"cpu", "memory"}, tideNodePool.GetOfflineTideNodeSelector()) 542 if err != nil { 543 return err 544 } 545 if len(offlineNodesInfos) <= 0 { 546 logger.Info("no offline node in tidal") 547 return nil 548 } 549 for _, pod := range pendingPods { 550 _, err := t.checker.FitsAnyNode(clusterSnapshot, pod) 551 if err == nil { 552 logger.Info("pod can fit node", "pod", types.NamespacedName{ 553 Namespace: pod.Namespace, 554 Name: pod.Name, 555 }) 556 continue 557 } 558 for j := range offlineNodesInfos { 559 offlineNodesInfo := offlineNodesInfos[j] 560 nodeInfo, err := clusterSnapshot.NodeInfos().Get(offlineNodesInfo.node.Name) 561 if err != nil { 562 return err 563 } 564 clusterSnapshot.RemoveNode(offlineNodesInfo.node.Name) 565 node := t.changeNodeToOnline(nodeInfo.Node(), tideNodePool) 566 clusterSnapshot.AddNode(node) 567 _, err = t.checker.FitsAnyNode(clusterSnapshot, pod) 568 if err != nil { 569 logger.Info("pod not fit offline node after release offline node, skip", "pod", types.NamespacedName{ 570 Namespace: pod.Namespace, 571 Name: pod.Name, 572 }) 573 // rollback node to offline 574 clusterSnapshot.RemoveNode(offlineNodesInfo.node.Name) 575 node := t.changeNodeToOffline(nodeInfo.Node(), tideNodePool) 576 clusterSnapshot.AddNode(node) 577 continue 578 } 579 if _, err := t.client.KubeClient.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}); err != nil { 580 return fmt.Errorf("update node offline to online failed: %v", err) 581 } else { 582 logger.Info("release offline node to online node", "pod", types.NamespacedName{ 583 Namespace: pod.Namespace, 584 Name: pod.Name, 585 }, "node", node.Name) 586 return nil 587 } 588 } 589 } 590 logger.Info("not need release offline node") 591 } 592 // 1. select the online node with the lowest usage 593 // 2. pre-schedule all online pods (request from largest to smallest) and check whether they can all be scheduled normally 594 // 3. start triggering scheduling by tainting 595 onlineNodesInfos, err := getNodeUsageWithSelector(clusterSnapshot, []corev1.ResourceName{"cpu", "memory"}, tideNodePool.GetOnlineTideNodeSelector()) 596 if err != nil { 597 return err 598 } 599 // skip if no online node 600 if len(onlineNodesInfos) <= 1 { 601 logger.Info("no online node in tidal") 602 return nil 603 } 604 onlineNodesInfo := onlineNodesInfos[0] 605 podsInNode := onlineNodesInfo.allPods 606 nodeInfo, err := clusterSnapshot.NodeInfos().Get(onlineNodesInfo.node.Name) 607 if err != nil { 608 return err 609 } 610 clusterSnapshot.RemoveNode(onlineNodesInfo.node.Name) 611 for _, pod := range podsInNode { 612 if onlinePodChecker(pod) { 613 pod.Spec.NodeName = "" 614 nodeName, err := t.checker.FitsAnyNode(clusterSnapshot, pod) 615 if err != nil { 616 logger.Info("can not release online node to offline", "node", onlineNodesInfo.node.Name) 617 return nil 618 } 619 pod.Spec.NodeName = nodeName 620 clusterSnapshot.AddPod(pod, nodeName) 621 } 622 } 623 node := t.changeNodeToOffline(nodeInfo.Node(), tideNodePool) 624 if _, err := t.client.KubeClient.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}); err != nil { 625 return err 626 } 627 logger.Info("release online node success", "node", node.Name) 628 return nil 629 } 630 631 func (t *Tide) changeNodeToOnline(node *corev1.Node, pool NodePoolWrapper) *corev1.Node { 632 found := false 633 for i := range node.Spec.Taints { 634 if node.Spec.Taints[i].Key == pool.GetEvictOnlinePodTaint().Key || node.Spec.Taints[i].Key == pool.GetEvictOfflinePodTaint().Key { 635 node.Spec.Taints[i].Key = pool.GetEvictOfflinePodTaint().Key 636 node.Spec.Taints[i].Value = pool.GetEvictOfflinePodTaint().Value 637 node.Spec.Taints[i].Effect = corev1.TaintEffect(pool.GetEvictOfflinePodTaint().Effect) 638 found = true 639 } 640 } 641 if !found { 642 node.Spec.Taints = append(node.Spec.Taints, corev1.Taint{ 643 Key: pool.GetEvictOfflinePodTaint().Key, 644 Value: pool.GetEvictOnlinePodTaint().Value, 645 Effect: corev1.TaintEffect(pool.GetEvictOnlinePodTaint().Effect), 646 }) 647 } 648 delete(node.Labels, pool.GetOfflineLabel().Key) 649 node.Labels[pool.GetOnlineLabel().Key] = pool.GetOnlineLabel().Value 650 return node 651 } 652 653 func (t *Tide) changeNodeToOffline(node *corev1.Node, pool NodePoolWrapper) *corev1.Node { 654 found := false 655 for i := range node.Spec.Taints { 656 if node.Spec.Taints[i].Key == pool.GetEvictOnlinePodTaint().Key || node.Spec.Taints[i].Key == pool.GetEvictOfflinePodTaint().Key { 657 node.Spec.Taints[i].Key = pool.GetEvictOnlinePodTaint().Key 658 node.Spec.Taints[i].Value = pool.GetEvictOnlinePodTaint().Value 659 node.Spec.Taints[i].Effect = corev1.TaintEffect(pool.GetEvictOnlinePodTaint().Effect) 660 found = true 661 } 662 } 663 if !found { 664 node.Spec.Taints = append(node.Spec.Taints, corev1.Taint{ 665 Key: pool.GetEvictOfflinePodTaint().Key, 666 Value: pool.GetEvictOnlinePodTaint().Value, 667 Effect: corev1.TaintEffect(pool.GetEvictOnlinePodTaint().Effect), 668 }) 669 } 670 delete(node.Labels, pool.GetOnlineLabel().Key) 671 node.Labels[pool.GetOfflineLabel().Key] = pool.GetOfflineLabel().Value 672 return node 673 } 674 675 func getNodeUsageWithSelector( 676 nodes simulator.ClusterSnapshot, 677 resourceNames []corev1.ResourceName, 678 selector labels.Selector, 679 ) ([]NodeUsage, error) { 680 var nodeUsageList []NodeUsage 681 nodeInfos, err := nodes.NodeInfos().List() 682 if err != nil { 683 return nil, err 684 } 685 for _, node := range nodeInfos { 686 var pods []*corev1.Pod 687 for i := range node.Pods { 688 pods = append(pods, node.Pods[i].Pod) 689 } 690 if !selector.Matches(labels.Set(node.Node().GetLabels())) { 691 continue 692 } 693 nodeUsageList = append(nodeUsageList, NodeUsage{ 694 node: node.Node(), 695 usage: nodeutil.NodeUtilization(pods, resourceNames), 696 allPods: pods, 697 }) 698 } 699 // nodes are sorted by usage rate 700 sort.Slice(nodeUsageList, func(i, j int) bool { 701 ti := nodeUsageList[i].usage[corev1.ResourceMemory].Value() + nodeUsageList[i].usage[corev1.ResourceCPU].MilliValue() + nodeUsageList[i].usage[corev1.ResourcePods].Value() 702 tj := nodeUsageList[j].usage[corev1.ResourceMemory].Value() + nodeUsageList[j].usage[corev1.ResourceCPU].MilliValue() + nodeUsageList[j].usage[corev1.ResourcePods].Value() 703 // extended resources 704 for name := range nodeUsageList[i].usage { 705 if !nodeutil.IsBasicResource(name) { 706 ti = ti + nodeUsageList[i].usage[name].Value() 707 tj = tj + nodeUsageList[j].usage[name].Value() 708 } 709 } 710 return ti < tj 711 }) 712 return nodeUsageList, nil 713 } 714 715 func (t *Tide) periodSync() { 716 targetSelector := labels.Everything() 717 tides, err := t.tideLister.List(targetSelector) 718 if err != nil { 719 klog.Errorf("failed to list all tide node pool") 720 return 721 } 722 723 for _, tide := range tides { 724 t.enqueueWorkItem(tide) 725 } 726 }