github.com/netdata/go.d.plugin@v0.58.1/modules/k8s_state/collect.go (about) 1 // SPDX-License-Identifier: GPL-3.0-or-later 2 3 package k8s_state 4 5 import ( 6 "errors" 7 "fmt" 8 "strings" 9 "time" 10 11 "github.com/netdata/go.d.plugin/agent/module" 12 13 corev1 "k8s.io/api/core/v1" 14 ) 15 16 const precision = 1000 17 18 func (ks *KubeState) collect() (map[string]int64, error) { 19 if ks.discoverer == nil { 20 return nil, errors.New("nil discoverer") 21 } 22 23 ks.once.Do(func() { 24 ks.startTime = time.Now() 25 in := make(chan resource) 26 27 ks.wg.Add(1) 28 go func() { defer ks.wg.Done(); ks.runUpdateState(in) }() 29 30 ks.wg.Add(1) 31 go func() { defer ks.wg.Done(); ks.discoverer.run(ks.ctx, in) }() 32 33 ks.kubeClusterID = ks.getKubeClusterID() 34 ks.kubeClusterName = ks.getKubeClusterName() 35 if chart := ks.Charts().Get(discoveryStatusChart.ID); chart != nil { 36 chart.Labels = []module.Label{ 37 {Key: labelKeyClusterID, Value: ks.kubeClusterID, Source: module.LabelSourceK8s}, 38 {Key: labelKeyClusterName, Value: ks.kubeClusterName, Source: module.LabelSourceK8s}, 39 } 40 } 41 }) 42 43 mx := map[string]int64{ 44 "discovery_node_discoverer_state": 1, 45 "discovery_pod_discoverer_state": 1, 46 } 47 48 if !ks.discoverer.ready() || time.Since(ks.startTime) < ks.initDelay { 49 return mx, nil 50 } 51 52 ks.state.Lock() 53 defer ks.state.Unlock() 54 55 ks.collectKubeState(mx) 56 57 return mx, nil 58 } 59 60 func (ks *KubeState) collectKubeState(mx map[string]int64) { 61 for _, ns := range ks.state.nodes { 62 ns.resetStats() 63 } 64 ks.collectPodsState(mx) 65 ks.collectNodesState(mx) 66 } 67 68 func (ks *KubeState) collectPodsState(mx map[string]int64) { 69 now := time.Now() 70 for _, ps := range ks.state.pods { 71 if ps.deleted { 72 delete(ks.state.pods, podSource(ps.namespace, ps.name)) 73 ks.removePodCharts(ps) 74 continue 75 } 76 if ps.new { 77 ps.new = false 78 ks.addPodCharts(ps) 79 ps.unscheduled = ps.nodeName == "" 80 } else if ps.unscheduled && ps.nodeName != "" { 81 ps.unscheduled = false 82 ks.updatePodChartsNodeLabel(ps) 83 } 84 85 ns := ks.state.nodes[nodeSource(ps.nodeName)] 86 if ns != nil { 87 ns.stats.pods++ 88 ns.stats.reqCPU += ps.reqCPU 89 ns.stats.limitCPU += ps.limitCPU 90 ns.stats.reqMem += ps.reqMem 91 ns.stats.limitMem += ps.limitMem 92 ns.stats.podsCondPodReady += condStatusToInt(ps.condPodReady) 93 ns.stats.podsCondPodScheduled += condStatusToInt(ps.condPodScheduled) 94 ns.stats.podsCondPodInitialized += condStatusToInt(ps.condPodInitialized) 95 ns.stats.podsCondContainersReady += condStatusToInt(ps.condContainersReady) 96 ns.stats.podsReadinessReady += boolToInt(ps.condPodReady == corev1.ConditionTrue) 97 ns.stats.podsReadinessUnready += boolToInt(ps.condPodReady != corev1.ConditionTrue) 98 ns.stats.podsPhasePending += boolToInt(ps.phase == corev1.PodPending) 99 ns.stats.podsPhaseRunning += boolToInt(ps.phase == corev1.PodRunning) 100 ns.stats.podsPhaseSucceeded += boolToInt(ps.phase == corev1.PodSucceeded) 101 ns.stats.podsPhaseFailed += boolToInt(ps.phase == corev1.PodFailed) 102 for _, cs := range ps.initContainers { 103 ns.stats.initContainers++ 104 ns.stats.initContStateRunning += boolToInt(cs.stateRunning) 105 ns.stats.initContStateWaiting += boolToInt(cs.stateWaiting) 106 ns.stats.initContStateTerminated += boolToInt(cs.stateTerminated) 107 } 108 for _, cs := range ps.containers { 109 ns.stats.containers++ 110 ns.stats.contStateRunning += boolToInt(cs.stateRunning) 111 ns.stats.contStateWaiting += boolToInt(cs.stateWaiting) 112 ns.stats.contStateTerminated += boolToInt(cs.stateTerminated) 113 } 114 } 115 116 px := fmt.Sprintf("pod_%s_", ps.id()) 117 118 mx[px+"cond_podready"] = condStatusToInt(ps.condPodReady) 119 mx[px+"cond_podscheduled"] = condStatusToInt(ps.condPodScheduled) 120 mx[px+"cond_podinitialized"] = condStatusToInt(ps.condPodInitialized) 121 mx[px+"cond_containersready"] = condStatusToInt(ps.condContainersReady) 122 mx[px+"phase_running"] = boolToInt(ps.phase == corev1.PodRunning) 123 mx[px+"phase_failed"] = boolToInt(ps.phase == corev1.PodFailed) 124 mx[px+"phase_succeeded"] = boolToInt(ps.phase == corev1.PodSucceeded) 125 mx[px+"phase_pending"] = boolToInt(ps.phase == corev1.PodPending) 126 mx[px+"age"] = int64(now.Sub(ps.creationTime).Seconds()) 127 mx[px+"cpu_requests_used"] = ps.reqCPU 128 mx[px+"cpu_limits_used"] = ps.limitCPU 129 mx[px+"mem_requests_used"] = ps.reqMem 130 mx[px+"mem_limits_used"] = ps.limitMem 131 132 mx[px+"init_containers"] = int64(len(ps.initContainers)) 133 mx[px+"containers"] = int64(len(ps.containers)) 134 135 mx[px+"init_containers_state_running"] = 0 136 mx[px+"init_containers_state_waiting"] = 0 137 mx[px+"init_containers_state_terminated"] = 0 138 for _, cs := range ps.initContainers { 139 mx[px+"init_containers_state_running"] += boolToInt(cs.stateRunning) 140 mx[px+"init_containers_state_waiting"] += boolToInt(cs.stateWaiting) 141 mx[px+"init_containers_state_terminated"] += boolToInt(cs.stateTerminated) 142 } 143 mx[px+"containers_state_running"] = 0 144 mx[px+"containers_state_waiting"] = 0 145 mx[px+"containers_state_terminated"] = 0 146 for _, cs := range ps.containers { 147 if cs.new { 148 cs.new = false 149 ks.addContainerCharts(ps, cs) 150 } 151 mx[px+"containers_state_running"] += boolToInt(cs.stateRunning) 152 mx[px+"containers_state_waiting"] += boolToInt(cs.stateWaiting) 153 mx[px+"containers_state_terminated"] += boolToInt(cs.stateTerminated) 154 155 ppx := fmt.Sprintf("%scontainer_%s_", px, cs.name) 156 mx[ppx+"state_running"] = boolToInt(cs.stateRunning) 157 mx[ppx+"state_waiting"] = boolToInt(cs.stateWaiting) 158 mx[ppx+"state_terminated"] = boolToInt(cs.stateTerminated) 159 mx[ppx+"readiness"] = boolToInt(cs.ready) 160 mx[ppx+"restarts"] = cs.restarts 161 for _, r := range cs.stateWaitingReasons { 162 if r.new { 163 r.new = false 164 ks.addContainerWaitingStateReasonToChart(ps, cs, r.reason) 165 } 166 mx[ppx+"state_waiting_reason_"+r.reason] = boolToInt(r.active) 167 } 168 for _, r := range cs.stateTerminatedReasons { 169 if r.new { 170 r.new = false 171 ks.addContainerTerminatedStateReasonToChart(ps, cs, r.reason) 172 } 173 mx[ppx+"state_terminated_reason_"+r.reason] = boolToInt(r.active) 174 } 175 } 176 } 177 } 178 179 func (ks *KubeState) collectNodesState(mx map[string]int64) { 180 now := time.Now() 181 for _, ns := range ks.state.nodes { 182 if ns.deleted { 183 delete(ks.state.nodes, nodeSource(ns.name)) 184 ks.removeNodeCharts(ns) 185 continue 186 } 187 if ns.new { 188 ns.new = false 189 ks.addNodeCharts(ns) 190 } 191 192 px := fmt.Sprintf("node_%s_", ns.id()) 193 194 for typ, cond := range ns.conditions { 195 if cond.new { 196 cond.new = false 197 ks.addNodeConditionToCharts(ns, typ) 198 } 199 mx[px+"cond_"+strings.ToLower(typ)] = condStatusToInt(cond.status) 200 } 201 202 mx[px+"age"] = int64(now.Sub(ns.creationTime).Seconds()) 203 mx[px+"alloc_pods_util"] = calcPercentage(ns.stats.pods, ns.allocatablePods) 204 mx[px+"pods_readiness_ready"] = ns.stats.podsReadinessReady 205 mx[px+"pods_readiness_unready"] = ns.stats.podsReadinessUnready 206 mx[px+"pods_readiness"] = calcPercentage(ns.stats.podsReadinessReady, ns.stats.pods) 207 mx[px+"pods_phase_running"] = ns.stats.podsPhaseRunning 208 mx[px+"pods_phase_failed"] = ns.stats.podsPhaseFailed 209 mx[px+"pods_phase_succeeded"] = ns.stats.podsPhaseSucceeded 210 mx[px+"pods_phase_pending"] = ns.stats.podsPhasePending 211 mx[px+"pods_cond_podready"] = ns.stats.podsCondPodReady 212 mx[px+"pods_cond_podscheduled"] = ns.stats.podsCondPodScheduled 213 mx[px+"pods_cond_podinitialized"] = ns.stats.podsCondPodInitialized 214 mx[px+"pods_cond_containersready"] = ns.stats.podsCondContainersReady 215 mx[px+"pods_cond_containersready"] = ns.stats.podsCondContainersReady 216 mx[px+"schedulability_schedulable"] = boolToInt(!ns.unSchedulable) 217 mx[px+"schedulability_unschedulable"] = boolToInt(ns.unSchedulable) 218 mx[px+"alloc_pods_available"] = ns.allocatablePods - ns.stats.pods 219 mx[px+"alloc_pods_allocated"] = ns.stats.pods 220 mx[px+"alloc_cpu_requests_util"] = calcPercentage(ns.stats.reqCPU, ns.allocatableCPU) 221 mx[px+"alloc_cpu_limits_util"] = calcPercentage(ns.stats.limitCPU, ns.allocatableCPU) 222 mx[px+"alloc_mem_requests_util"] = calcPercentage(ns.stats.reqMem, ns.allocatableMem) 223 mx[px+"alloc_mem_limits_util"] = calcPercentage(ns.stats.limitMem, ns.allocatableMem) 224 mx[px+"alloc_cpu_requests_used"] = ns.stats.reqCPU 225 mx[px+"alloc_cpu_limits_used"] = ns.stats.limitCPU 226 mx[px+"alloc_mem_requests_used"] = ns.stats.reqMem 227 mx[px+"alloc_mem_limits_used"] = ns.stats.limitMem 228 mx[px+"init_containers"] = ns.stats.initContainers 229 mx[px+"containers"] = ns.stats.containers 230 mx[px+"containers_state_running"] = ns.stats.contStateRunning 231 mx[px+"containers_state_waiting"] = ns.stats.contStateWaiting 232 mx[px+"containers_state_terminated"] = ns.stats.contStateTerminated 233 mx[px+"init_containers_state_running"] = ns.stats.initContStateRunning 234 mx[px+"init_containers_state_waiting"] = ns.stats.initContStateWaiting 235 mx[px+"init_containers_state_terminated"] = ns.stats.initContStateTerminated 236 } 237 } 238 239 func boolToInt(v bool) int64 { 240 if v { 241 return 1 242 } 243 return 0 244 } 245 246 func condStatusToInt(cs corev1.ConditionStatus) int64 { 247 switch cs { 248 case corev1.ConditionFalse: 249 return 0 250 case corev1.ConditionTrue: 251 return 1 252 case corev1.ConditionUnknown: 253 return 0 254 default: 255 return 0 256 } 257 } 258 259 func calcPercentage(value, total int64) int64 { 260 if total == 0 { 261 return 0 262 } 263 return int64(float64(value) / float64(total) * 100 * precision) 264 }