github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/lifecycle/agent-healthz/helper/taint_cnr.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package helper
    18  
    19  import (
    20  	"context"
    21  	"time"
    22  
    23  	corev1 "k8s.io/api/core/v1"
    24  	"k8s.io/apimachinery/pkg/api/equality"
    25  	"k8s.io/apimachinery/pkg/api/errors"
    26  	"k8s.io/apimachinery/pkg/util/wait"
    27  	corelisters "k8s.io/client-go/listers/core/v1"
    28  	"k8s.io/klog/v2"
    29  	"k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler"
    30  
    31  	apis "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1"
    32  	listers "github.com/kubewharf/katalyst-api/pkg/client/listers/node/v1alpha1"
    33  	"github.com/kubewharf/katalyst-core/pkg/client/control"
    34  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    35  	"github.com/kubewharf/katalyst-core/pkg/util"
    36  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    37  )
    38  
    39  const (
    40  	metricsNameUntaintedCNRCount = "untainted_cnr_count"
    41  	metricsNameTaintedCNRCount   = "tainted_cnr_count"
    42  )
    43  
    44  const TaintNameNoScheduler = "TaintNameNoScheduler"
    45  
    46  var TaintNoScheduler = &apis.Taint{
    47  	Key:    corev1.TaintNodeUnschedulable,
    48  	Effect: apis.TaintEffectNoScheduleForReclaimedTasks,
    49  }
    50  
    51  var allTaints = []*apis.Taint{
    52  	TaintNoScheduler,
    53  }
    54  
// CNRTaintItem records the detailed item to perform cnr-taints.
// It is the payload carried in the UID field of the entries of the
// to-be-tainted queue.
type CNRTaintItem struct {
	// Taints holds the taints to apply to the CNR; only the values are
	// consumed when tainting.
	// NOTE(review): presumably keyed by a taint name such as
	// TaintNameNoScheduler; confirm against the producer of these items.
	Taints map[string]*apis.Taint
}
    59  
// CNRTaintHelper adds taints to and removes taints from CustomNodeResource
// objects. Additions are driven by a rate-limited queue that is drained
// periodically once Run has been called; removals are triggered explicitly
// through TryUNTaintCNR.
type CNRTaintHelper struct {
	// ctx bounds the lifetime of the loop started by Run and is passed to
	// every patch request.
	ctx context.Context
	// emitter receives the tainted/untainted counters.
	emitter metrics.MetricEmitter
	// cnrControl is used to patch CNR objects.
	cnrControl control.CNRControl

	// checker is consulted right before tainting to re-confirm the decision.
	checker *HealthzHelper
	// queue holds the names of the CNRs waiting to be tainted.
	queue *scheduler.RateLimitedTimedQueue

	// nodeLister is stored by the constructor; it is not read by any method
	// visible in this file.
	nodeLister corelisters.NodeLister
	// cnrLister resolves CNR objects by name.
	cnrLister listers.CustomNodeResourceLister
}
    71  
    72  // NewTaintHelper todo add logic here
    73  func NewTaintHelper(ctx context.Context, emitter metrics.MetricEmitter, cnrControl control.CNRControl,
    74  	nodeLister corelisters.NodeLister, cnrLister listers.CustomNodeResourceLister,
    75  	queue *scheduler.RateLimitedTimedQueue, checker *HealthzHelper,
    76  ) *CNRTaintHelper {
    77  	return &CNRTaintHelper{
    78  		ctx:        ctx,
    79  		emitter:    emitter,
    80  		cnrControl: cnrControl,
    81  
    82  		checker: checker,
    83  		queue:   queue,
    84  
    85  		nodeLister: nodeLister,
    86  		cnrLister:  cnrLister,
    87  	}
    88  }
    89  
    90  func (t *CNRTaintHelper) Run() {
    91  	go wait.Until(t.doTaint, scheduler.NodeEvictionPeriod, t.ctx.Done())
    92  }
    93  
    94  // doTaint is used to pop nodes from to-be-tainted queue,
    95  // and then trigger the taint actions
    96  func (t *CNRTaintHelper) doTaint() {
    97  	t.queue.Try(func(value scheduler.TimedValue) (bool, time.Duration) {
    98  		node := value.Value
    99  
   100  		cnr, err := t.cnrLister.Get(value.Value)
   101  		if errors.IsNotFound(err) {
   102  			klog.Warningf("cnr %v no longer present in cnrLister", value.Value)
   103  			return true, 0
   104  		} else if err != nil {
   105  			klog.Errorf("cannot find cnr for node %v err %v", node, err)
   106  			// retry in 50 millisecond
   107  			return false, 50 * time.Millisecond
   108  		}
   109  
   110  		// second confirm that we should taint cnr
   111  		item := value.UID.(*CNRTaintItem)
   112  		needTaint := t.checker.CheckAllAgentReady(node)
   113  		if needTaint && len(item.Taints) != 0 {
   114  			if err := t.taintCNR(cnr, item); err != nil {
   115  				klog.Warningf("failed to taint for cnr %v: %v", value.Value, err)
   116  				return false, 0
   117  			}
   118  		}
   119  
   120  		return true, 0
   121  	})
   122  }
   123  
   124  func (t *CNRTaintHelper) taintCNR(cnr *apis.CustomNodeResource, item *CNRTaintItem) error {
   125  	var err error
   126  	var newCNR *apis.CustomNodeResource
   127  	for _, taint := range item.Taints {
   128  		newCNR, _, err = util.AddOrUpdateCNRTaint(cnr, taint)
   129  		if err != nil {
   130  			return err
   131  		}
   132  	}
   133  
   134  	if equality.Semantic.DeepEqual(cnr, newCNR) {
   135  		general.Infof("taint already exits, not need to update")
   136  		return nil
   137  	}
   138  
   139  	_, err = t.cnrControl.PatchCNRSpecAndMetadata(t.ctx, cnr.Name, cnr, newCNR)
   140  	if err != nil {
   141  		_ = t.emitter.StoreInt64(metricsNameTaintedCNRCount, 1, metrics.MetricTypeNameCount,
   142  			[]metrics.MetricTag{
   143  				{Key: "status", Val: "failed"},
   144  				{Key: "name", Val: cnr.Name},
   145  			}...)
   146  		return err
   147  	}
   148  	_ = t.emitter.StoreInt64(metricsNameTaintedCNRCount, 1, metrics.MetricTypeNameCount,
   149  		[]metrics.MetricTag{
   150  			{Key: "status", Val: "success"},
   151  			{Key: "name", Val: cnr.Name},
   152  		}...)
   153  
   154  	return nil
   155  }
   156  
   157  // TryUNTaintCNR is used to delete taint info from CNR
   158  func (t *CNRTaintHelper) TryUNTaintCNR(name string) error {
   159  	cnr, err := t.cnrLister.Get(name)
   160  	if errors.IsNotFound(err) {
   161  		klog.Warningf("cnr %v no longer present in cnrLister", name)
   162  		return nil
   163  	} else if err != nil {
   164  		return err
   165  	}
   166  
   167  	var newCNR *apis.CustomNodeResource
   168  	for _, taint := range allTaints {
   169  		newCNR, _, err = util.RemoveCNRTaint(cnr, taint)
   170  		if err != nil {
   171  			return err
   172  		}
   173  	}
   174  
   175  	if equality.Semantic.DeepEqual(cnr, newCNR) {
   176  		klog.V(5).InfoS("taint already disappears, not need to update", "cnr", cnr.Name)
   177  		return nil
   178  	}
   179  
   180  	_, err = t.cnrControl.PatchCNRSpecAndMetadata(t.ctx, cnr.Name, cnr, newCNR)
   181  	if err != nil {
   182  		_ = t.emitter.StoreInt64(metricsNameUntaintedCNRCount, 1, metrics.MetricTypeNameCount,
   183  			[]metrics.MetricTag{
   184  				{Key: "status", Val: "failed"},
   185  				{Key: "name", Val: cnr.Name},
   186  			}...)
   187  		return err
   188  	}
   189  	_ = t.emitter.StoreInt64(metricsNameUntaintedCNRCount, 1, metrics.MetricTypeNameCount,
   190  		[]metrics.MetricTag{
   191  			{Key: "status", Val: "success"},
   192  			{Key: "name", Val: cnr.Name},
   193  		}...)
   194  
   195  	return nil
   196  }