k8s.io/kubernetes@v1.29.3/pkg/proxy/endpoints.go (about) 1 /* 2 Copyright 2017 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package proxy 18 19 import ( 20 "net" 21 "strconv" 22 "sync" 23 "time" 24 25 "k8s.io/client-go/tools/events" 26 "k8s.io/klog/v2" 27 28 v1 "k8s.io/api/core/v1" 29 discovery "k8s.io/api/discovery/v1" 30 "k8s.io/apimachinery/pkg/types" 31 "k8s.io/apimachinery/pkg/util/sets" 32 "k8s.io/kubernetes/pkg/proxy/metrics" 33 ) 34 35 var supportedEndpointSliceAddressTypes = sets.New[string]( 36 string(discovery.AddressTypeIPv4), 37 string(discovery.AddressTypeIPv6), 38 ) 39 40 // BaseEndpointInfo contains base information that defines an endpoint. 41 // This could be used directly by proxier while processing endpoints, 42 // or can be used for constructing a more specific EndpointInfo struct 43 // defined by the proxier if needed. 44 type BaseEndpointInfo struct { 45 // Cache this values to improve performance 46 ip string 47 port int 48 // endpoint is the same as net.JoinHostPort(ip,port) 49 endpoint string 50 51 // isLocal indicates whether the endpoint is running on same host as kube-proxy. 52 isLocal bool 53 54 // ready indicates whether this endpoint is ready and NOT terminating, unless 55 // PublishNotReadyAddresses is set on the service, in which case it will just 56 // always be true. 57 ready bool 58 // serving indicates whether this endpoint is ready regardless of its terminating state. 59 // For pods this is true if it has a ready status regardless of its deletion timestamp. 60 serving bool 61 // terminating indicates whether this endpoint is terminating. 62 // For pods this is true if it has a non-nil deletion timestamp. 63 terminating bool 64 65 // zoneHints represent the zone hints for the endpoint. This is based on 66 // endpoint.hints.forZones[*].name in the EndpointSlice API. 67 zoneHints sets.Set[string] 68 } 69 70 var _ Endpoint = &BaseEndpointInfo{} 71 72 // String is part of proxy.Endpoint interface. 73 func (info *BaseEndpointInfo) String() string { 74 return info.endpoint 75 } 76 77 // IP returns just the IP part of the endpoint, it's a part of proxy.Endpoint interface. 78 func (info *BaseEndpointInfo) IP() string { 79 return info.ip 80 } 81 82 // Port returns just the Port part of the endpoint. 83 func (info *BaseEndpointInfo) Port() int { 84 return info.port 85 } 86 87 // IsLocal is part of proxy.Endpoint interface. 88 func (info *BaseEndpointInfo) IsLocal() bool { 89 return info.isLocal 90 } 91 92 // IsReady returns true if an endpoint is ready and not terminating. 93 func (info *BaseEndpointInfo) IsReady() bool { 94 return info.ready 95 } 96 97 // IsServing returns true if an endpoint is ready, regardless of if the 98 // endpoint is terminating. 99 func (info *BaseEndpointInfo) IsServing() bool { 100 return info.serving 101 } 102 103 // IsTerminating retruns true if an endpoint is terminating. For pods, 104 // that is any pod with a deletion timestamp. 105 func (info *BaseEndpointInfo) IsTerminating() bool { 106 return info.terminating 107 } 108 109 // ZoneHints returns the zone hint for the endpoint. 110 func (info *BaseEndpointInfo) ZoneHints() sets.Set[string] { 111 return info.zoneHints 112 } 113 114 func newBaseEndpointInfo(ip string, port int, isLocal, ready, serving, terminating bool, zoneHints sets.Set[string]) *BaseEndpointInfo { 115 return &BaseEndpointInfo{ 116 ip: ip, 117 port: port, 118 endpoint: net.JoinHostPort(ip, strconv.Itoa(port)), 119 isLocal: isLocal, 120 ready: ready, 121 serving: serving, 122 terminating: terminating, 123 zoneHints: zoneHints, 124 } 125 } 126 127 type makeEndpointFunc func(info *BaseEndpointInfo, svcPortName *ServicePortName) Endpoint 128 129 // This handler is invoked by the apply function on every change. This function should not modify the 130 // EndpointsMap's but just use the changes for any Proxier specific cleanup. 131 type processEndpointsMapChangeFunc func(oldEndpointsMap, newEndpointsMap EndpointsMap) 132 133 // EndpointsChangeTracker carries state about uncommitted changes to an arbitrary number of 134 // Endpoints, keyed by their namespace and name. 135 type EndpointsChangeTracker struct { 136 // lock protects lastChangeTriggerTimes 137 lock sync.Mutex 138 139 processEndpointsMapChange processEndpointsMapChangeFunc 140 // endpointSliceCache holds a simplified version of endpoint slices. 141 endpointSliceCache *EndpointSliceCache 142 // Map from the Endpoints namespaced-name to the times of the triggers that caused the endpoints 143 // object to change. Used to calculate the network-programming-latency. 144 lastChangeTriggerTimes map[types.NamespacedName][]time.Time 145 // record the time when the endpointsChangeTracker was created so we can ignore the endpoints 146 // that were generated before, because we can't estimate the network-programming-latency on those. 147 // This is specially problematic on restarts, because we process all the endpoints that may have been 148 // created hours or days before. 149 trackerStartTime time.Time 150 } 151 152 // NewEndpointsChangeTracker initializes an EndpointsChangeTracker 153 func NewEndpointsChangeTracker(hostname string, makeEndpointInfo makeEndpointFunc, ipFamily v1.IPFamily, recorder events.EventRecorder, processEndpointsMapChange processEndpointsMapChangeFunc) *EndpointsChangeTracker { 154 return &EndpointsChangeTracker{ 155 lastChangeTriggerTimes: make(map[types.NamespacedName][]time.Time), 156 trackerStartTime: time.Now(), 157 processEndpointsMapChange: processEndpointsMapChange, 158 endpointSliceCache: NewEndpointSliceCache(hostname, ipFamily, recorder, makeEndpointInfo), 159 } 160 } 161 162 // EndpointSliceUpdate updates given service's endpoints change map based on the <previous, current> endpoints pair. 163 // It returns true if items changed, otherwise return false. Will add/update/delete items of EndpointsChangeTracker. 164 // If removeSlice is true, slice will be removed, otherwise it will be added or updated. 165 func (ect *EndpointsChangeTracker) EndpointSliceUpdate(endpointSlice *discovery.EndpointSlice, removeSlice bool) bool { 166 if !supportedEndpointSliceAddressTypes.Has(string(endpointSlice.AddressType)) { 167 klog.V(4).InfoS("EndpointSlice address type not supported by kube-proxy", "addressType", endpointSlice.AddressType) 168 return false 169 } 170 171 // This should never happen 172 if endpointSlice == nil { 173 klog.ErrorS(nil, "Nil endpointSlice passed to EndpointSliceUpdate") 174 return false 175 } 176 177 namespacedName, _, err := endpointSliceCacheKeys(endpointSlice) 178 if err != nil { 179 klog.InfoS("Error getting endpoint slice cache keys", "err", err) 180 return false 181 } 182 183 metrics.EndpointChangesTotal.Inc() 184 185 ect.lock.Lock() 186 defer ect.lock.Unlock() 187 188 changeNeeded := ect.endpointSliceCache.updatePending(endpointSlice, removeSlice) 189 190 if changeNeeded { 191 metrics.EndpointChangesPending.Inc() 192 // In case of Endpoints deletion, the LastChangeTriggerTime annotation is 193 // by-definition coming from the time of last update, which is not what 194 // we want to measure. So we simply ignore it in this cases. 195 // TODO(wojtek-t, robscott): Address the problem for EndpointSlice deletion 196 // when other EndpointSlice for that service still exist. 197 if removeSlice { 198 delete(ect.lastChangeTriggerTimes, namespacedName) 199 } else if t := getLastChangeTriggerTime(endpointSlice.Annotations); !t.IsZero() && t.After(ect.trackerStartTime) { 200 ect.lastChangeTriggerTimes[namespacedName] = 201 append(ect.lastChangeTriggerTimes[namespacedName], t) 202 } 203 } 204 205 return changeNeeded 206 } 207 208 // checkoutChanges returns a map of pending endpointsChanges and marks them as 209 // applied. 210 func (ect *EndpointsChangeTracker) checkoutChanges() map[types.NamespacedName]*endpointsChange { 211 metrics.EndpointChangesPending.Set(0) 212 213 return ect.endpointSliceCache.checkoutChanges() 214 } 215 216 // checkoutTriggerTimes applies the locally cached trigger times to a map of 217 // trigger times that have been passed in and empties the local cache. 218 func (ect *EndpointsChangeTracker) checkoutTriggerTimes(lastChangeTriggerTimes *map[types.NamespacedName][]time.Time) { 219 ect.lock.Lock() 220 defer ect.lock.Unlock() 221 222 for k, v := range ect.lastChangeTriggerTimes { 223 prev, ok := (*lastChangeTriggerTimes)[k] 224 if !ok { 225 (*lastChangeTriggerTimes)[k] = v 226 } else { 227 (*lastChangeTriggerTimes)[k] = append(prev, v...) 228 } 229 } 230 ect.lastChangeTriggerTimes = make(map[types.NamespacedName][]time.Time) 231 } 232 233 // getLastChangeTriggerTime returns the time.Time value of the 234 // EndpointsLastChangeTriggerTime annotation stored in the given endpoints 235 // object or the "zero" time if the annotation wasn't set or was set 236 // incorrectly. 237 func getLastChangeTriggerTime(annotations map[string]string) time.Time { 238 // TODO(#81360): ignore case when Endpoint is deleted. 239 if _, ok := annotations[v1.EndpointsLastChangeTriggerTime]; !ok { 240 // It's possible that the Endpoints object won't have the 241 // EndpointsLastChangeTriggerTime annotation set. In that case return 242 // the 'zero value', which is ignored in the upstream code. 243 return time.Time{} 244 } 245 val, err := time.Parse(time.RFC3339Nano, annotations[v1.EndpointsLastChangeTriggerTime]) 246 if err != nil { 247 klog.ErrorS(err, "Error while parsing EndpointsLastChangeTriggerTimeAnnotation", 248 "value", annotations[v1.EndpointsLastChangeTriggerTime]) 249 // In case of error val = time.Zero, which is ignored in the upstream code. 250 } 251 return val 252 } 253 254 // endpointsChange contains all changes to endpoints that happened since proxy 255 // rules were synced. For a single object, changes are accumulated, i.e. 256 // previous is state from before applying the changes, current is state after 257 // applying the changes. 258 type endpointsChange struct { 259 previous EndpointsMap 260 current EndpointsMap 261 } 262 263 // UpdateEndpointsMapResult is the updated results after applying endpoints changes. 264 type UpdateEndpointsMapResult struct { 265 // UpdatedServices lists the names of all services with added/updated/deleted 266 // endpoints since the last Update. 267 UpdatedServices sets.Set[types.NamespacedName] 268 269 // DeletedUDPEndpoints identifies UDP endpoints that have just been deleted. 270 // Existing conntrack NAT entries pointing to these endpoints must be deleted to 271 // ensure that no further traffic for the Service gets delivered to them. 272 DeletedUDPEndpoints []ServiceEndpoint 273 274 // NewlyActiveUDPServices identifies UDP Services that have just gone from 0 to 275 // non-0 endpoints. Existing conntrack entries caching the fact that these 276 // services are black holes must be deleted to ensure that traffic can immediately 277 // begin flowing to the new endpoints. 278 NewlyActiveUDPServices []ServicePortName 279 280 // List of the trigger times for all endpoints objects that changed. It's used to export the 281 // network programming latency. 282 // NOTE(oxddr): this can be simplified to []time.Time if memory consumption becomes an issue. 283 LastChangeTriggerTimes map[types.NamespacedName][]time.Time 284 } 285 286 // EndpointsMap maps a service name to a list of all its Endpoints. 287 type EndpointsMap map[ServicePortName][]Endpoint 288 289 // Update updates em based on the changes in ect, returns information about the diff since 290 // the last Update, triggers processEndpointsMapChange on every change, and clears the 291 // changes map. 292 func (em EndpointsMap) Update(ect *EndpointsChangeTracker) UpdateEndpointsMapResult { 293 result := UpdateEndpointsMapResult{ 294 UpdatedServices: sets.New[types.NamespacedName](), 295 DeletedUDPEndpoints: make([]ServiceEndpoint, 0), 296 NewlyActiveUDPServices: make([]ServicePortName, 0), 297 LastChangeTriggerTimes: make(map[types.NamespacedName][]time.Time), 298 } 299 if ect == nil { 300 return result 301 } 302 303 changes := ect.checkoutChanges() 304 for nn, change := range changes { 305 if ect.processEndpointsMapChange != nil { 306 ect.processEndpointsMapChange(change.previous, change.current) 307 } 308 result.UpdatedServices.Insert(nn) 309 310 em.unmerge(change.previous) 311 em.merge(change.current) 312 detectStaleConntrackEntries(change.previous, change.current, &result.DeletedUDPEndpoints, &result.NewlyActiveUDPServices) 313 } 314 ect.checkoutTriggerTimes(&result.LastChangeTriggerTimes) 315 316 return result 317 } 318 319 // Merge ensures that the current EndpointsMap contains all <service, endpoints> pairs from the EndpointsMap passed in. 320 func (em EndpointsMap) merge(other EndpointsMap) { 321 for svcPortName := range other { 322 em[svcPortName] = other[svcPortName] 323 } 324 } 325 326 // Unmerge removes the <service, endpoints> pairs from the current EndpointsMap which are contained in the EndpointsMap passed in. 327 func (em EndpointsMap) unmerge(other EndpointsMap) { 328 for svcPortName := range other { 329 delete(em, svcPortName) 330 } 331 } 332 333 // getLocalEndpointIPs returns endpoints IPs if given endpoint is local - local means the endpoint is running in same host as kube-proxy. 334 func (em EndpointsMap) getLocalReadyEndpointIPs() map[types.NamespacedName]sets.Set[string] { 335 localIPs := make(map[types.NamespacedName]sets.Set[string]) 336 for svcPortName, epList := range em { 337 for _, ep := range epList { 338 // Only add ready endpoints for health checking. Terminating endpoints may still serve traffic 339 // but the health check signal should fail if there are only terminating endpoints on a node. 340 if !ep.IsReady() { 341 continue 342 } 343 344 if ep.IsLocal() { 345 nsn := svcPortName.NamespacedName 346 if localIPs[nsn] == nil { 347 localIPs[nsn] = sets.New[string]() 348 } 349 localIPs[nsn].Insert(ep.IP()) 350 } 351 } 352 } 353 return localIPs 354 } 355 356 // LocalReadyEndpoints returns a map of Service names to the number of local ready 357 // endpoints for that service. 358 func (em EndpointsMap) LocalReadyEndpoints() map[types.NamespacedName]int { 359 // TODO: If this will appear to be computationally expensive, consider 360 // computing this incrementally similarly to endpointsMap. 361 362 // (Note that we need to call getLocalEndpointIPs first to squash the data by IP, 363 // because the EndpointsMap is sorted by IP+port, not just IP, and we want to 364 // consider a Service pointing to 10.0.0.1:80 and 10.0.0.1:443 to have 1 endpoint, 365 // not 2.) 366 367 eps := make(map[types.NamespacedName]int) 368 localIPs := em.getLocalReadyEndpointIPs() 369 for nsn, ips := range localIPs { 370 eps[nsn] = len(ips) 371 } 372 return eps 373 } 374 375 // detectStaleConntrackEntries detects services that may be associated with stale conntrack entries. 376 // (See UpdateEndpointsMapResult.DeletedUDPEndpoints and .NewlyActiveUDPServices.) 377 func detectStaleConntrackEntries(oldEndpointsMap, newEndpointsMap EndpointsMap, deletedUDPEndpoints *[]ServiceEndpoint, newlyActiveUDPServices *[]ServicePortName) { 378 // Find the UDP endpoints that we were sending traffic to in oldEndpointsMap, but 379 // are no longer sending to newEndpointsMap. The proxier should make sure that 380 // conntrack does not accidentally route any new connections to them. 381 for svcPortName, epList := range oldEndpointsMap { 382 if svcPortName.Protocol != v1.ProtocolUDP { 383 continue 384 } 385 386 for _, ep := range epList { 387 // If the old endpoint wasn't Serving then there can't be stale 388 // conntrack entries since there was no traffic sent to it. 389 if !ep.IsServing() { 390 continue 391 } 392 393 deleted := true 394 // Check if the endpoint has changed, including if it went from 395 // serving to not serving. If it did change stale entries for the old 396 // endpoint have to be cleared. 397 for i := range newEndpointsMap[svcPortName] { 398 if newEndpointsMap[svcPortName][i].String() == ep.String() { 399 deleted = false 400 break 401 } 402 } 403 if deleted { 404 klog.V(4).InfoS("Deleted endpoint may have stale conntrack entries", "portName", svcPortName, "endpoint", ep) 405 *deletedUDPEndpoints = append(*deletedUDPEndpoints, ServiceEndpoint{Endpoint: ep.String(), ServicePortName: svcPortName}) 406 } 407 } 408 } 409 410 // Detect services that have gone from 0 to non-0 ready endpoints. If there were 411 // previously 0 endpoints, but someone tried to connect to it, then a conntrack 412 // entry may have been created blackholing traffic to that IP, which should be 413 // deleted now. 414 for svcPortName, epList := range newEndpointsMap { 415 if svcPortName.Protocol != v1.ProtocolUDP { 416 continue 417 } 418 419 epServing := 0 420 for _, ep := range epList { 421 if ep.IsServing() { 422 epServing++ 423 } 424 } 425 426 oldEpServing := 0 427 for _, ep := range oldEndpointsMap[svcPortName] { 428 if ep.IsServing() { 429 oldEpServing++ 430 } 431 } 432 433 if epServing > 0 && oldEpServing == 0 { 434 *newlyActiveUDPServices = append(*newlyActiveUDPServices, svcPortName) 435 } 436 } 437 }