github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/lifecycle/agent-healthz/helper/healthz_check.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package helper
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  	"time"
    24  
    25  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    26  	"k8s.io/apimachinery/pkg/labels"
    27  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    28  	"k8s.io/apimachinery/pkg/util/sets"
    29  	"k8s.io/apimachinery/pkg/util/wait"
    30  	corelisters "k8s.io/client-go/listers/core/v1"
    31  	"k8s.io/client-go/tools/cache"
    32  	"k8s.io/klog/v2"
    33  
    34  	listers "github.com/kubewharf/katalyst-api/pkg/client/listers/node/v1alpha1"
    35  	"github.com/kubewharf/katalyst-core/pkg/config/controller"
    36  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    37  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    38  )
    39  
// agentStatus is the controller-internal status of an agent on a node, as
// stored in healthData and compared by the health checks in this file.
type agentStatus string
    42  
// Possible agentStatus values recorded by syncHeartBeatMap:
//   - agentReady: a pod matching the agent selector was found and is ready;
//   - agentNotReady: a matching pod was found but it is not ready;
//   - agentNotFound: no pod on the node matches the agent selector.
const (
	agentReady    agentStatus = "Ready"
	agentNotReady agentStatus = "NotReady"
	agentNotFound agentStatus = "NotFound"
)
    48  
// Metric names and tag keys emitted by syncHeartBeatMap.
//
// The agent_not_ready / agent_not_found metrics are emitted with both node
// and agent tags; the *_total metrics carry only the agent tag and hold the
// counters accumulated over one sync round.
const (
	metricsNameAgentNotReady      = "agent_not_ready"
	metricsNameAgentNotFound      = "agent_not_found"
	metricsNameAgentReadyTotal    = "agent_ready_total"
	metricsNameAgentNotReadyTotal = "agent_not_ready_total"
	metricsNameAgentNotFoundTotal = "agent_not_found_total"

	metricsTagKeyAgentName = "agentName"
	metricsTagKeyNodeName  = "nodeName"
)
    59  
// healthData is the health record kept for one agent on one node.
//
// setHeartBeatInfo only refreshes probeTimestamp when the record is created
// or when status changes, so probeTimestamp marks the moment the agent
// entered its current status rather than the time of the latest probe.
type healthData struct {
	probeTimestamp metav1.Time
	status         agentStatus
}
    64  
// heartBeatMap is used to store health related info for each agent.
// lock guards every access to nodeHealths made through the methods of this
// type.
type heartBeatMap struct {
	lock        sync.RWMutex
	nodeHealths map[string]map[string]*healthData // map from node->agent->data
}
    70  
    71  func newHeartBeatMap() *heartBeatMap {
    72  	return &heartBeatMap{
    73  		nodeHealths: make(map[string]map[string]*healthData),
    74  	}
    75  }
    76  
    77  func (c *heartBeatMap) setHeartBeatInfo(node, agent string, status agentStatus, timestamp metav1.Time) {
    78  	c.lock.Lock()
    79  	defer c.lock.Unlock()
    80  
    81  	if c.nodeHealths[node] == nil {
    82  		c.nodeHealths[node] = make(map[string]*healthData)
    83  	}
    84  
    85  	if c.nodeHealths[node][agent] == nil {
    86  		nodeHealth := &healthData{
    87  			status:         status,
    88  			probeTimestamp: timestamp,
    89  		}
    90  		c.nodeHealths[node][agent] = nodeHealth
    91  		return
    92  	}
    93  
    94  	if status != c.nodeHealths[node][agent].status {
    95  		c.nodeHealths[node][agent].probeTimestamp = timestamp
    96  		c.nodeHealths[node][agent].status = status
    97  	}
    98  }
    99  
   100  func (c *heartBeatMap) getHeartBeatInfo(node, agent string) (healthData, bool) {
   101  	c.lock.RLock()
   102  	defer c.lock.RUnlock()
   103  
   104  	if nodeHealth, nodeOk := c.nodeHealths[node]; nodeOk && nodeHealth != nil {
   105  		if agentHealth, agentOk := nodeHealth[agent]; agentOk && agentHealth != nil {
   106  			return *agentHealth, true
   107  		}
   108  	}
   109  	return healthData{}, false
   110  }
   111  
   112  func (c *heartBeatMap) rangeNode(f func(node string) bool) {
   113  	c.lock.Lock()
   114  	defer c.lock.Unlock()
   115  
   116  	for node := range c.nodeHealths {
   117  		shouldContinue := f(node)
   118  		if !shouldContinue {
   119  			break
   120  		}
   121  	}
   122  }
   123  
// HealthzHelper keeps track of the health of the configured agents on each
// selected node. Run starts a periodic sync that fills healthzMap, and the
// Check* methods answer readiness queries from that map.
type HealthzHelper struct {
	// ctx bounds the lifetime of the background sync started by Run;
	// emitter receives the metrics produced by each sync round.
	ctx     context.Context
	emitter metrics.MetricEmitter

	// checkWindow is the interval between two sync rounds.
	// unhealthyPeriod is the default time an agent may stay in a non-ready
	// status before CheckAgentReady reports it as not ready;
	// agentUnhealthyPeriod overrides that period per agent name.
	// nodeSelector picks the nodes to inspect and agentSelectors maps each
	// agent name to the label selector matching its pods.
	checkWindow          time.Duration
	unhealthyPeriod      time.Duration
	agentUnhealthyPeriod map[string]time.Duration
	nodeSelector         labels.Selector
	agentSelectors       map[string]labels.Selector

	// Cache-backed readers supplied by the caller of NewHealthzHelper.
	podIndexer cache.Indexer
	nodeLister corelisters.NodeLister
	cnrLister  listers.CustomNodeResourceLister

	// healthzMap holds the latest status recorded per node and agent.
	healthzMap *heartBeatMap
}
   140  
   141  // NewHealthzHelper todo add logic here
   142  func NewHealthzHelper(ctx context.Context, conf *controller.LifeCycleConfig, emitter metrics.MetricEmitter,
   143  	nodeSelector labels.Selector, agentSelectors map[string]labels.Selector, podIndexer cache.Indexer,
   144  	nodeLister corelisters.NodeLister, cnrLister listers.CustomNodeResourceLister,
   145  ) *HealthzHelper {
   146  	return &HealthzHelper{
   147  		ctx:     ctx,
   148  		emitter: emitter,
   149  
   150  		checkWindow:          conf.CheckWindow,
   151  		unhealthyPeriod:      conf.UnhealthyPeriods,
   152  		agentUnhealthyPeriod: conf.AgentUnhealthyPeriods,
   153  		nodeSelector:         nodeSelector,
   154  		agentSelectors:       agentSelectors,
   155  
   156  		podIndexer: podIndexer,
   157  		nodeLister: nodeLister,
   158  		cnrLister:  cnrLister,
   159  
   160  		healthzMap: newHeartBeatMap(),
   161  	}
   162  }
   163  
   164  func (h *HealthzHelper) Run() {
   165  	go wait.Until(h.syncHeartBeatMap, h.checkWindow, h.ctx.Done())
   166  }
   167  
   168  // CheckAllAgentReady checks whether all agents are ready
   169  func (h *HealthzHelper) CheckAllAgentReady(node string) bool {
   170  	for agent := range h.agentSelectors {
   171  		if !h.CheckAgentReady(node, agent) {
   172  			return false
   173  		}
   174  	}
   175  	return true
   176  }
   177  
   178  // CheckAgentReady checks whether the given agent is ready
   179  func (h *HealthzHelper) CheckAgentReady(node string, agent string) bool {
   180  	period := h.unhealthyPeriod
   181  	if p, ok := h.agentUnhealthyPeriod[agent]; ok {
   182  		period = p
   183  	}
   184  
   185  	health, found := h.healthzMap.getHeartBeatInfo(node, agent)
   186  	if found && health.status != agentReady && metav1.Now().After(health.probeTimestamp.Add(period)) {
   187  		return false
   188  	}
   189  	return true
   190  }
   191  
   192  // syncHeartBeatMap is used to periodically sync health state ans s
   193  func (h *HealthzHelper) syncHeartBeatMap() {
   194  	nodes, err := h.nodeLister.List(h.nodeSelector)
   195  	if err != nil {
   196  		klog.Errorf("List nodes error: %v", err)
   197  		return
   198  	}
   199  
   200  	totalReadyNode := make(map[string]int64)
   201  	totalNotReadyNode := make(map[string]int64)
   202  	totalNotFoundNode := make(map[string]int64)
   203  	currentNodes := sets.String{}
   204  	for _, node := range nodes {
   205  		baseTags := []metrics.MetricTag{{Key: metricsTagKeyNodeName, Val: node.Name}}
   206  		pods, err := native.GetPodsAssignedToNode(node.Name, h.podIndexer)
   207  		if err != nil {
   208  			utilruntime.HandleError(fmt.Errorf("unable to list pods from node %q: %v", node.Name, err))
   209  			continue
   210  		}
   211  
   212  		currentNodes.Insert(node.Name)
   213  		for agent, selector := range h.agentSelectors {
   214  			agentFound := false
   215  			metricsTags := append(baseTags, metrics.MetricTag{Key: metricsTagKeyAgentName, Val: agent})
   216  			// todo: it may be too time-consuming to walk through all pods
   217  			for _, pod := range pods {
   218  				if !selector.Matches(labels.Set(pod.Labels)) {
   219  					continue
   220  				}
   221  				agentFound = true
   222  
   223  				klog.Infof("agent %v for node %v found: %v", agent, node.Name, pod.Name)
   224  				if native.PodIsReady(pod) {
   225  					h.healthzMap.setHeartBeatInfo(node.Name, agent, agentReady, metav1.Now())
   226  					totalReadyNode[agent]++
   227  					klog.Infof("agent %v for node %v found: %v, is ready", agent, node.Name, pod.Name)
   228  					break
   229  				} else {
   230  					h.healthzMap.setHeartBeatInfo(node.Name, agent, agentNotReady, metav1.Now())
   231  					totalNotReadyNode[agent]++
   232  					klog.Errorf("agent %v for node %v is not ready", agent, node.Name)
   233  					_ = h.emitter.StoreInt64(metricsNameAgentNotReady, 1, metrics.MetricTypeNameRaw, metricsTags...)
   234  				}
   235  			}
   236  
   237  			if !agentFound {
   238  				h.healthzMap.setHeartBeatInfo(node.Name, agent, agentNotFound, metav1.Now())
   239  				totalNotFoundNode[agent]++
   240  				klog.Errorf("agent %v for node %v is not found", agent, node.Name)
   241  				_ = h.emitter.StoreInt64(metricsNameAgentNotFound, 1, metrics.MetricTypeNameRaw, metricsTags...)
   242  			}
   243  		}
   244  	}
   245  
   246  	for agent := range h.agentSelectors {
   247  		tag := metrics.MetricTag{Key: metricsTagKeyAgentName, Val: agent}
   248  		_ = h.emitter.StoreInt64(metricsNameAgentReadyTotal, totalReadyNode[agent],
   249  			metrics.MetricTypeNameRaw, tag)
   250  		_ = h.emitter.StoreInt64(metricsNameAgentNotReadyTotal, totalNotReadyNode[agent],
   251  			metrics.MetricTypeNameRaw, tag)
   252  		_ = h.emitter.StoreInt64(metricsNameAgentNotFoundTotal, totalNotFoundNode[agent],
   253  			metrics.MetricTypeNameRaw, tag)
   254  	}
   255  
   256  	h.healthzMap.rangeNode(func(node string) bool {
   257  		if !currentNodes.Has(node) {
   258  			delete(h.healthzMap.nodeHealths, node)
   259  		}
   260  		return false
   261  	})
   262  }