k8s.io/kubernetes@v1.29.3/pkg/proxy/healthcheck/proxier_health.go (about) 1 /* 2 Copyright 2016 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package healthcheck 18 19 import ( 20 "fmt" 21 "net/http" 22 "sync" 23 "time" 24 25 v1 "k8s.io/api/core/v1" 26 "k8s.io/klog/v2" 27 "k8s.io/kubernetes/pkg/proxy/metrics" 28 "k8s.io/utils/clock" 29 ) 30 31 const ( 32 // ToBeDeletedTaint is a taint used by the Cluster Autoscaler before marking a node for deletion. Defined in 33 // https://github.com/kubernetes/autoscaler/blob/e80ab518340f88f364fe3ef063f8303755125971/cluster-autoscaler/utils/deletetaint/delete.go#L36 34 ToBeDeletedTaint = "ToBeDeletedByClusterAutoscaler" 35 ) 36 37 // ProxierHealthServer allows callers to: 38 // 1. run a http server with /healthz and /livez endpoint handlers. 39 // 2. update healthz timestamps before and after synchronizing dataplane. 40 // 3. sync node status, for reporting unhealthy /healthz response 41 // if the node is marked for deletion by autoscaler. 42 // 4. get proxy health by verifying that the delay between QueuedUpdate() 43 // calls and Updated() calls exceeded healthTimeout or not.
44 type ProxierHealthServer struct { 45 listener listener 46 httpFactory httpServerFactory 47 clock clock.Clock 48 49 addr string 50 healthTimeout time.Duration 51 52 lock sync.RWMutex 53 lastUpdatedMap map[v1.IPFamily]time.Time 54 oldestPendingQueuedMap map[v1.IPFamily]time.Time 55 nodeEligible bool 56 } 57 58 // NewProxierHealthServer returns a proxier health http server. 59 func NewProxierHealthServer(addr string, healthTimeout time.Duration) *ProxierHealthServer { 60 return newProxierHealthServer(stdNetListener{}, stdHTTPServerFactory{}, clock.RealClock{}, addr, healthTimeout) 61 } 62 63 func newProxierHealthServer(listener listener, httpServerFactory httpServerFactory, c clock.Clock, addr string, healthTimeout time.Duration) *ProxierHealthServer { 64 return &ProxierHealthServer{ 65 listener: listener, 66 httpFactory: httpServerFactory, 67 clock: c, 68 addr: addr, 69 healthTimeout: healthTimeout, 70 71 lastUpdatedMap: make(map[v1.IPFamily]time.Time), 72 oldestPendingQueuedMap: make(map[v1.IPFamily]time.Time), 73 // The node is eligible (and thus the proxy healthy) while it's starting up 74 // and until we've processed the first node event that indicates the 75 // contrary. 76 nodeEligible: true, 77 } 78 } 79 80 // Updated should be called when the proxier of the given IP family has successfully updated 81 // the service rules to reflect the current state and should be considered healthy now. 82 func (hs *ProxierHealthServer) Updated(ipFamily v1.IPFamily) { 83 hs.lock.Lock() 84 defer hs.lock.Unlock() 85 delete(hs.oldestPendingQueuedMap, ipFamily) 86 hs.lastUpdatedMap[ipFamily] = hs.clock.Now() 87 } 88 89 // QueuedUpdate should be called when the proxier receives a Service or Endpoints event 90 // from API Server containing information that requires updating service rules. It 91 // indicates that the proxier for the given IP family has received changes but has not 92 // yet pushed them to its backend. 
If the proxier does not call Updated within the 93 // healthTimeout time then it will be considered unhealthy. 94 func (hs *ProxierHealthServer) QueuedUpdate(ipFamily v1.IPFamily) { 95 hs.lock.Lock() 96 defer hs.lock.Unlock() 97 // Set oldestPendingQueuedMap[ipFamily] only if it's currently unset 98 if _, set := hs.oldestPendingQueuedMap[ipFamily]; !set { 99 hs.oldestPendingQueuedMap[ipFamily] = hs.clock.Now() 100 } 101 } 102 103 // IsHealthy returns only the proxier's health state, following the same 104 // definition the HTTP server defines, but ignoring the state of the Node. 105 func (hs *ProxierHealthServer) IsHealthy() bool { 106 isHealthy, _ := hs.isHealthy() 107 return isHealthy 108 } 109 110 func (hs *ProxierHealthServer) isHealthy() (bool, time.Time) { 111 hs.lock.RLock() 112 defer hs.lock.RUnlock() 113 114 var lastUpdated time.Time 115 currentTime := hs.clock.Now() 116 117 for ipFamily, proxierLastUpdated := range hs.lastUpdatedMap { 118 119 if proxierLastUpdated.After(lastUpdated) { 120 lastUpdated = proxierLastUpdated 121 } 122 123 if _, set := hs.oldestPendingQueuedMap[ipFamily]; !set { 124 // the proxier is healthy while it's starting up 125 // or the proxier is fully synced. 126 continue 127 } 128 129 if currentTime.Sub(hs.oldestPendingQueuedMap[ipFamily]) < hs.healthTimeout { 130 // there's an unprocessed update queued for this proxier, but it's not late yet. 131 continue 132 } 133 return false, proxierLastUpdated 134 } 135 return true, lastUpdated 136 } 137 138 // SyncNode syncs the node and determines if it is eligible or not. Eligible is 139 // defined as being: not tainted by ToBeDeletedTaint and not deleted. 
140 func (hs *ProxierHealthServer) SyncNode(node *v1.Node) { 141 hs.lock.Lock() 142 defer hs.lock.Unlock() 143 144 if !node.DeletionTimestamp.IsZero() { 145 hs.nodeEligible = false 146 return 147 } 148 for _, taint := range node.Spec.Taints { 149 if taint.Key == ToBeDeletedTaint { 150 hs.nodeEligible = false 151 return 152 } 153 } 154 hs.nodeEligible = true 155 } 156 157 // NodeEligible returns nodeEligible field of ProxierHealthServer. 158 func (hs *ProxierHealthServer) NodeEligible() bool { 159 hs.lock.RLock() 160 defer hs.lock.RUnlock() 161 return hs.nodeEligible 162 } 163 164 // Run starts the healthz HTTP server and blocks until it exits. 165 func (hs *ProxierHealthServer) Run() error { 166 serveMux := http.NewServeMux() 167 serveMux.Handle("/healthz", healthzHandler{hs: hs}) 168 serveMux.Handle("/livez", livezHandler{hs: hs}) 169 server := hs.httpFactory.New(hs.addr, serveMux) 170 171 listener, err := hs.listener.Listen(hs.addr) 172 if err != nil { 173 return fmt.Errorf("failed to start proxier healthz on %s: %v", hs.addr, err) 174 } 175 176 klog.V(3).InfoS("Starting healthz HTTP server", "address", hs.addr) 177 178 if err := server.Serve(listener); err != nil { 179 return fmt.Errorf("proxier healthz closed with error: %v", err) 180 } 181 return nil 182 } 183 184 type healthzHandler struct { 185 hs *ProxierHealthServer 186 } 187 188 func (h healthzHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) { 189 nodeEligible := h.hs.NodeEligible() 190 healthy, lastUpdated := h.hs.isHealthy() 191 currentTime := h.hs.clock.Now() 192 193 healthy = healthy && nodeEligible 194 resp.Header().Set("Content-Type", "application/json") 195 resp.Header().Set("X-Content-Type-Options", "nosniff") 196 if !healthy { 197 metrics.ProxyHealthzTotal.WithLabelValues("503").Inc() 198 resp.WriteHeader(http.StatusServiceUnavailable) 199 } else { 200 metrics.ProxyHealthzTotal.WithLabelValues("200").Inc() 201 resp.WriteHeader(http.StatusOK) 202 // In older releases, the returned 
"lastUpdated" time indicated the last 203 // time the proxier sync loop ran, even if nothing had changed. To 204 // preserve compatibility, we use the same semantics: the returned 205 // lastUpdated value is "recent" if the server is healthy. The kube-proxy 206 // metrics provide more detailed information. 207 lastUpdated = currentTime 208 } 209 fmt.Fprintf(resp, `{"lastUpdated": %q,"currentTime": %q, "nodeEligible": %v}`, lastUpdated, currentTime, nodeEligible) 210 } 211 212 type livezHandler struct { 213 hs *ProxierHealthServer 214 } 215 216 func (h livezHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) { 217 healthy, lastUpdated := h.hs.isHealthy() 218 currentTime := h.hs.clock.Now() 219 resp.Header().Set("Content-Type", "application/json") 220 resp.Header().Set("X-Content-Type-Options", "nosniff") 221 if !healthy { 222 metrics.ProxyLivezTotal.WithLabelValues("503").Inc() 223 resp.WriteHeader(http.StatusServiceUnavailable) 224 } else { 225 metrics.ProxyLivezTotal.WithLabelValues("200").Inc() 226 resp.WriteHeader(http.StatusOK) 227 // In older releases, the returned "lastUpdated" time indicated the last 228 // time the proxier sync loop ran, even if nothing had changed. To 229 // preserve compatibility, we use the same semantics: the returned 230 // lastUpdated value is "recent" if the server is healthy. The kube-proxy 231 // metrics provide more detailed information. 232 lastUpdated = currentTime 233 } 234 fmt.Fprintf(resp, `{"lastUpdated": %q,"currentTime": %q}`, lastUpdated, currentTime) 235 }