github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/lifecycle/agent-healthz/helper/healthz_check.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package helper 18 19 import ( 20 "context" 21 "fmt" 22 "sync" 23 "time" 24 25 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 "k8s.io/apimachinery/pkg/labels" 27 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 28 "k8s.io/apimachinery/pkg/util/sets" 29 "k8s.io/apimachinery/pkg/util/wait" 30 corelisters "k8s.io/client-go/listers/core/v1" 31 "k8s.io/client-go/tools/cache" 32 "k8s.io/klog/v2" 33 34 listers "github.com/kubewharf/katalyst-api/pkg/client/listers/node/v1alpha1" 35 "github.com/kubewharf/katalyst-core/pkg/config/controller" 36 "github.com/kubewharf/katalyst-core/pkg/metrics" 37 "github.com/kubewharf/katalyst-core/pkg/util/native" 38 ) 39 40 // agentStatus is inner-defined status definition for agent 41 type agentStatus string 42 43 const ( 44 agentReady agentStatus = "Ready" 45 agentNotReady agentStatus = "NotReady" 46 agentNotFound agentStatus = "NotFound" 47 ) 48 49 const ( 50 metricsNameAgentNotReady = "agent_not_ready" 51 metricsNameAgentNotFound = "agent_not_found" 52 metricsNameAgentReadyTotal = "agent_ready_total" 53 metricsNameAgentNotReadyTotal = "agent_not_ready_total" 54 metricsNameAgentNotFoundTotal = "agent_not_found_total" 55 56 metricsTagKeyAgentName = "agentName" 57 metricsTagKeyNodeName = "nodeName" 58 ) 59 60 type healthData struct { 61 probeTimestamp metav1.Time 62 status agentStatus 63 } 64 65 // heartBeatMap is used to store health related info for each agent 66 type heartBeatMap struct { 67 lock sync.RWMutex 68 nodeHealths map[string]map[string]*healthData // map from node->pod->data 69 } 70 71 func newHeartBeatMap() *heartBeatMap { 72 return &heartBeatMap{ 73 nodeHealths: make(map[string]map[string]*healthData), 74 } 75 } 76 77 func (c *heartBeatMap) setHeartBeatInfo(node, agent string, status agentStatus, timestamp metav1.Time) { 78 c.lock.Lock() 79 defer c.lock.Unlock() 80 81 if c.nodeHealths[node] == nil { 82 c.nodeHealths[node] = make(map[string]*healthData) 83 } 84 85 if c.nodeHealths[node][agent] == nil { 86 nodeHealth := &healthData{ 87 status: status, 88 probeTimestamp: timestamp, 89 } 90 c.nodeHealths[node][agent] = nodeHealth 91 return 92 } 93 94 if status != c.nodeHealths[node][agent].status { 95 c.nodeHealths[node][agent].probeTimestamp = timestamp 96 c.nodeHealths[node][agent].status = status 97 } 98 } 99 100 func (c *heartBeatMap) getHeartBeatInfo(node, agent string) (healthData, bool) { 101 c.lock.RLock() 102 defer c.lock.RUnlock() 103 104 if nodeHealth, nodeOk := c.nodeHealths[node]; nodeOk && nodeHealth != nil { 105 if agentHealth, agentOk := nodeHealth[agent]; agentOk && agentHealth != nil { 106 return *agentHealth, true 107 } 108 } 109 return healthData{}, false 110 } 111 112 func (c *heartBeatMap) rangeNode(f func(node string) bool) { 113 c.lock.Lock() 114 defer c.lock.Unlock() 115 116 for node := range c.nodeHealths { 117 shouldContinue := f(node) 118 if !shouldContinue { 119 break 120 } 121 } 122 } 123 124 type HealthzHelper struct { 125 ctx context.Context 126 emitter metrics.MetricEmitter 127 128 checkWindow time.Duration 129 unhealthyPeriod time.Duration 130 agentUnhealthyPeriod map[string]time.Duration 131 nodeSelector labels.Selector 132 agentSelectors map[string]labels.Selector 133 134 podIndexer cache.Indexer 135 nodeLister corelisters.NodeLister 136 cnrLister listers.CustomNodeResourceLister 137 138 healthzMap *heartBeatMap 139 } 140 141 // NewHealthzHelper todo add logic here 142 func NewHealthzHelper(ctx context.Context, conf *controller.LifeCycleConfig, emitter metrics.MetricEmitter, 143 nodeSelector labels.Selector, agentSelectors map[string]labels.Selector, podIndexer cache.Indexer, 144 nodeLister corelisters.NodeLister, cnrLister listers.CustomNodeResourceLister, 145 ) *HealthzHelper { 146 return &HealthzHelper{ 147 ctx: ctx, 148 emitter: emitter, 149 150 checkWindow: conf.CheckWindow, 151 unhealthyPeriod: conf.UnhealthyPeriods, 152 agentUnhealthyPeriod: conf.AgentUnhealthyPeriods, 153 nodeSelector: nodeSelector, 154 agentSelectors: agentSelectors, 155 156 podIndexer: podIndexer, 157 nodeLister: nodeLister, 158 cnrLister: cnrLister, 159 160 healthzMap: newHeartBeatMap(), 161 } 162 } 163 164 func (h *HealthzHelper) Run() { 165 go wait.Until(h.syncHeartBeatMap, h.checkWindow, h.ctx.Done()) 166 } 167 168 // CheckAllAgentReady checks whether all agents are ready 169 func (h *HealthzHelper) CheckAllAgentReady(node string) bool { 170 for agent := range h.agentSelectors { 171 if !h.CheckAgentReady(node, agent) { 172 return false 173 } 174 } 175 return true 176 } 177 178 // CheckAgentReady checks whether the given agent is ready 179 func (h *HealthzHelper) CheckAgentReady(node string, agent string) bool { 180 period := h.unhealthyPeriod 181 if p, ok := h.agentUnhealthyPeriod[agent]; ok { 182 period = p 183 } 184 185 health, found := h.healthzMap.getHeartBeatInfo(node, agent) 186 if found && health.status != agentReady && metav1.Now().After(health.probeTimestamp.Add(period)) { 187 return false 188 } 189 return true 190 } 191 192 // syncHeartBeatMap is used to periodically sync health state ans s 193 func (h *HealthzHelper) syncHeartBeatMap() { 194 nodes, err := h.nodeLister.List(h.nodeSelector) 195 if err != nil { 196 klog.Errorf("List nodes error: %v", err) 197 return 198 } 199 200 totalReadyNode := make(map[string]int64) 201 totalNotReadyNode := make(map[string]int64) 202 totalNotFoundNode := make(map[string]int64) 203 currentNodes := sets.String{} 204 for _, node := range nodes { 205 baseTags := []metrics.MetricTag{{Key: metricsTagKeyNodeName, Val: node.Name}} 206 pods, err := native.GetPodsAssignedToNode(node.Name, h.podIndexer) 207 if err != nil { 208 utilruntime.HandleError(fmt.Errorf("unable to list pods from node %q: %v", node.Name, err)) 209 continue 210 } 211 212 currentNodes.Insert(node.Name) 213 for agent, selector := range h.agentSelectors { 214 agentFound := false 215 metricsTags := append(baseTags, metrics.MetricTag{Key: metricsTagKeyAgentName, Val: agent}) 216 // todo: it may be too time-consuming to walk through all pods 217 for _, pod := range pods { 218 if !selector.Matches(labels.Set(pod.Labels)) { 219 continue 220 } 221 agentFound = true 222 223 klog.Infof("agent %v for node %v found: %v", agent, node.Name, pod.Name) 224 if native.PodIsReady(pod) { 225 h.healthzMap.setHeartBeatInfo(node.Name, agent, agentReady, metav1.Now()) 226 totalReadyNode[agent]++ 227 klog.Infof("agent %v for node %v found: %v, is ready", agent, node.Name, pod.Name) 228 break 229 } else { 230 h.healthzMap.setHeartBeatInfo(node.Name, agent, agentNotReady, metav1.Now()) 231 totalNotReadyNode[agent]++ 232 klog.Errorf("agent %v for node %v is not ready", agent, node.Name) 233 _ = h.emitter.StoreInt64(metricsNameAgentNotReady, 1, metrics.MetricTypeNameRaw, metricsTags...) 234 } 235 } 236 237 if !agentFound { 238 h.healthzMap.setHeartBeatInfo(node.Name, agent, agentNotFound, metav1.Now()) 239 totalNotFoundNode[agent]++ 240 klog.Errorf("agent %v for node %v is not found", agent, node.Name) 241 _ = h.emitter.StoreInt64(metricsNameAgentNotFound, 1, metrics.MetricTypeNameRaw, metricsTags...) 242 } 243 } 244 } 245 246 for agent := range h.agentSelectors { 247 tag := metrics.MetricTag{Key: metricsTagKeyAgentName, Val: agent} 248 _ = h.emitter.StoreInt64(metricsNameAgentReadyTotal, totalReadyNode[agent], 249 metrics.MetricTypeNameRaw, tag) 250 _ = h.emitter.StoreInt64(metricsNameAgentNotReadyTotal, totalNotReadyNode[agent], 251 metrics.MetricTypeNameRaw, tag) 252 _ = h.emitter.StoreInt64(metricsNameAgentNotFoundTotal, totalNotFoundNode[agent], 253 metrics.MetricTypeNameRaw, tag) 254 } 255 256 h.healthzMap.rangeNode(func(node string) bool { 257 if !currentNodes.Has(node) { 258 delete(h.healthzMap.nodeHealths, node) 259 } 260 return false 261 }) 262 }