k8s.io/kubernetes@v1.29.3/pkg/proxy/endpoints.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package proxy
    18  
    19  import (
    20  	"net"
    21  	"strconv"
    22  	"sync"
    23  	"time"
    24  
    25  	"k8s.io/client-go/tools/events"
    26  	"k8s.io/klog/v2"
    27  
    28  	v1 "k8s.io/api/core/v1"
    29  	discovery "k8s.io/api/discovery/v1"
    30  	"k8s.io/apimachinery/pkg/types"
    31  	"k8s.io/apimachinery/pkg/util/sets"
    32  	"k8s.io/kubernetes/pkg/proxy/metrics"
    33  )
    34  
    35  var supportedEndpointSliceAddressTypes = sets.New[string](
    36  	string(discovery.AddressTypeIPv4),
    37  	string(discovery.AddressTypeIPv6),
    38  )
    39  
// BaseEndpointInfo contains base information that defines an endpoint.
// This could be used directly by proxier while processing endpoints,
// or can be used for constructing a more specific EndpointInfo struct
// defined by the proxier if needed.
type BaseEndpointInfo struct {
	// ip and port are cached individually, next to the joined endpoint string,
	// to improve performance.
	ip   string
	port int
	// endpoint is the same as net.JoinHostPort(ip, port); it is what String()
	// returns.
	endpoint string

	// isLocal indicates whether the endpoint is running on same host as kube-proxy.
	isLocal bool

	// ready indicates whether this endpoint is ready and NOT terminating, unless
	// PublishNotReadyAddresses is set on the service, in which case it will just
	// always be true.
	ready bool
	// serving indicates whether this endpoint is ready regardless of its terminating state.
	// For pods this is true if it has a ready status regardless of its deletion timestamp.
	serving bool
	// terminating indicates whether this endpoint is terminating.
	// For pods this is true if it has a non-nil deletion timestamp.
	terminating bool

	// zoneHints represent the zone hints for the endpoint. This is based on
	// endpoint.hints.forZones[*].name in the EndpointSlice API.
	zoneHints sets.Set[string]
}
    69  
    70  var _ Endpoint = &BaseEndpointInfo{}
    71  
// String returns the cached endpoint string, which newBaseEndpointInfo builds
// with net.JoinHostPort(ip, port). It is part of the proxy.Endpoint interface.
func (info *BaseEndpointInfo) String() string {
	return info.endpoint
}
    76  
// IP returns just the IP part of the endpoint, without the port. It is part of
// the proxy.Endpoint interface.
func (info *BaseEndpointInfo) IP() string {
	return info.ip
}
    81  
// Port returns just the port part of the endpoint, as an integer.
func (info *BaseEndpointInfo) Port() int {
	return info.port
}
    86  
// IsLocal reports whether the endpoint is running on the same host as
// kube-proxy. It is part of the proxy.Endpoint interface.
func (info *BaseEndpointInfo) IsLocal() bool {
	return info.isLocal
}
    91  
// IsReady returns the endpoint's ready flag. Per the documentation of the
// ready field, that means the endpoint is ready and not terminating, unless
// PublishNotReadyAddresses is set on the service, in which case it is always
// true.
func (info *BaseEndpointInfo) IsReady() bool {
	return info.ready
}
    96  
// IsServing returns true if an endpoint is ready, regardless of whether the
// endpoint is terminating.
func (info *BaseEndpointInfo) IsServing() bool {
	return info.serving
}
   102  
// IsTerminating returns true if an endpoint is terminating. For pods,
// that is any pod with a deletion timestamp.
func (info *BaseEndpointInfo) IsTerminating() bool {
	return info.terminating
}
   108  
// ZoneHints returns the set of zone hints for the endpoint. The stored set
// itself is returned, not a copy.
func (info *BaseEndpointInfo) ZoneHints() sets.Set[string] {
	return info.zoneHints
}
   113  
   114  func newBaseEndpointInfo(ip string, port int, isLocal, ready, serving, terminating bool, zoneHints sets.Set[string]) *BaseEndpointInfo {
   115  	return &BaseEndpointInfo{
   116  		ip:          ip,
   117  		port:        port,
   118  		endpoint:    net.JoinHostPort(ip, strconv.Itoa(port)),
   119  		isLocal:     isLocal,
   120  		ready:       ready,
   121  		serving:     serving,
   122  		terminating: terminating,
   123  		zoneHints:   zoneHints,
   124  	}
   125  }
   126  
// makeEndpointFunc builds an Endpoint from a BaseEndpointInfo and the service
// port it belongs to. NewEndpointsChangeTracker hands the function it receives
// to NewEndpointSliceCache.
type makeEndpointFunc func(info *BaseEndpointInfo, svcPortName *ServicePortName) Endpoint
   128  
// processEndpointsMapChangeFunc is a handler that EndpointsMap.Update invokes
// for every checked-out change, passing the previous and the current
// EndpointsMap of that change. The handler should not modify either map; it
// should only use the change for any Proxier-specific cleanup.
type processEndpointsMapChangeFunc func(oldEndpointsMap, newEndpointsMap EndpointsMap)
   132  
// EndpointsChangeTracker carries state about uncommitted changes to an arbitrary number of
// Endpoints, keyed by their namespace and name.
type EndpointsChangeTracker struct {
	// lock protects lastChangeTriggerTimes. EndpointSliceUpdate also holds it
	// while calling endpointSliceCache.updatePending.
	lock sync.Mutex

	// processEndpointsMapChange, when non-nil, is called by EndpointsMap.Update
	// for every checked-out change.
	processEndpointsMapChange processEndpointsMapChangeFunc
	// endpointSliceCache holds a simplified version of endpoint slices.
	endpointSliceCache *EndpointSliceCache
	// Map from the Endpoints namespaced-name to the times of the triggers that caused the endpoints
	// object to change. Used to calculate the network-programming-latency.
	lastChangeTriggerTimes map[types.NamespacedName][]time.Time
	// trackerStartTime records when the EndpointsChangeTracker was created so that endpoints
	// generated before then can be ignored, because the network-programming-latency cannot be
	// estimated for those. This is especially problematic on restarts, because all endpoints are
	// processed, including ones that may have been created hours or days before.
	trackerStartTime time.Time
}
   151  
   152  // NewEndpointsChangeTracker initializes an EndpointsChangeTracker
   153  func NewEndpointsChangeTracker(hostname string, makeEndpointInfo makeEndpointFunc, ipFamily v1.IPFamily, recorder events.EventRecorder, processEndpointsMapChange processEndpointsMapChangeFunc) *EndpointsChangeTracker {
   154  	return &EndpointsChangeTracker{
   155  		lastChangeTriggerTimes:    make(map[types.NamespacedName][]time.Time),
   156  		trackerStartTime:          time.Now(),
   157  		processEndpointsMapChange: processEndpointsMapChange,
   158  		endpointSliceCache:        NewEndpointSliceCache(hostname, ipFamily, recorder, makeEndpointInfo),
   159  	}
   160  }
   161  
   162  // EndpointSliceUpdate updates given service's endpoints change map based on the <previous, current> endpoints pair.
   163  // It returns true if items changed, otherwise return false. Will add/update/delete items of EndpointsChangeTracker.
   164  // If removeSlice is true, slice will be removed, otherwise it will be added or updated.
   165  func (ect *EndpointsChangeTracker) EndpointSliceUpdate(endpointSlice *discovery.EndpointSlice, removeSlice bool) bool {
   166  	if !supportedEndpointSliceAddressTypes.Has(string(endpointSlice.AddressType)) {
   167  		klog.V(4).InfoS("EndpointSlice address type not supported by kube-proxy", "addressType", endpointSlice.AddressType)
   168  		return false
   169  	}
   170  
   171  	// This should never happen
   172  	if endpointSlice == nil {
   173  		klog.ErrorS(nil, "Nil endpointSlice passed to EndpointSliceUpdate")
   174  		return false
   175  	}
   176  
   177  	namespacedName, _, err := endpointSliceCacheKeys(endpointSlice)
   178  	if err != nil {
   179  		klog.InfoS("Error getting endpoint slice cache keys", "err", err)
   180  		return false
   181  	}
   182  
   183  	metrics.EndpointChangesTotal.Inc()
   184  
   185  	ect.lock.Lock()
   186  	defer ect.lock.Unlock()
   187  
   188  	changeNeeded := ect.endpointSliceCache.updatePending(endpointSlice, removeSlice)
   189  
   190  	if changeNeeded {
   191  		metrics.EndpointChangesPending.Inc()
   192  		// In case of Endpoints deletion, the LastChangeTriggerTime annotation is
   193  		// by-definition coming from the time of last update, which is not what
   194  		// we want to measure. So we simply ignore it in this cases.
   195  		// TODO(wojtek-t, robscott): Address the problem for EndpointSlice deletion
   196  		// when other EndpointSlice for that service still exist.
   197  		if removeSlice {
   198  			delete(ect.lastChangeTriggerTimes, namespacedName)
   199  		} else if t := getLastChangeTriggerTime(endpointSlice.Annotations); !t.IsZero() && t.After(ect.trackerStartTime) {
   200  			ect.lastChangeTriggerTimes[namespacedName] =
   201  				append(ect.lastChangeTriggerTimes[namespacedName], t)
   202  		}
   203  	}
   204  
   205  	return changeNeeded
   206  }
   207  
   208  // checkoutChanges returns a map of pending endpointsChanges and marks them as
   209  // applied.
   210  func (ect *EndpointsChangeTracker) checkoutChanges() map[types.NamespacedName]*endpointsChange {
   211  	metrics.EndpointChangesPending.Set(0)
   212  
   213  	return ect.endpointSliceCache.checkoutChanges()
   214  }
   215  
   216  // checkoutTriggerTimes applies the locally cached trigger times to a map of
   217  // trigger times that have been passed in and empties the local cache.
   218  func (ect *EndpointsChangeTracker) checkoutTriggerTimes(lastChangeTriggerTimes *map[types.NamespacedName][]time.Time) {
   219  	ect.lock.Lock()
   220  	defer ect.lock.Unlock()
   221  
   222  	for k, v := range ect.lastChangeTriggerTimes {
   223  		prev, ok := (*lastChangeTriggerTimes)[k]
   224  		if !ok {
   225  			(*lastChangeTriggerTimes)[k] = v
   226  		} else {
   227  			(*lastChangeTriggerTimes)[k] = append(prev, v...)
   228  		}
   229  	}
   230  	ect.lastChangeTriggerTimes = make(map[types.NamespacedName][]time.Time)
   231  }
   232  
   233  // getLastChangeTriggerTime returns the time.Time value of the
   234  // EndpointsLastChangeTriggerTime annotation stored in the given endpoints
   235  // object or the "zero" time if the annotation wasn't set or was set
   236  // incorrectly.
   237  func getLastChangeTriggerTime(annotations map[string]string) time.Time {
   238  	// TODO(#81360): ignore case when Endpoint is deleted.
   239  	if _, ok := annotations[v1.EndpointsLastChangeTriggerTime]; !ok {
   240  		// It's possible that the Endpoints object won't have the
   241  		// EndpointsLastChangeTriggerTime annotation set. In that case return
   242  		// the 'zero value', which is ignored in the upstream code.
   243  		return time.Time{}
   244  	}
   245  	val, err := time.Parse(time.RFC3339Nano, annotations[v1.EndpointsLastChangeTriggerTime])
   246  	if err != nil {
   247  		klog.ErrorS(err, "Error while parsing EndpointsLastChangeTriggerTimeAnnotation",
   248  			"value", annotations[v1.EndpointsLastChangeTriggerTime])
   249  		// In case of error val = time.Zero, which is ignored in the upstream code.
   250  	}
   251  	return val
   252  }
   253  
// endpointsChange contains all changes to endpoints that happened since proxy
// rules were synced. For a single object, changes are accumulated, i.e.
// previous is the state from before applying the changes and current is the
// state after applying them.
type endpointsChange struct {
	// previous is the EndpointsMap before the accumulated changes.
	previous EndpointsMap
	// current is the EndpointsMap after the accumulated changes.
	current EndpointsMap
}
   262  
// UpdateEndpointsMapResult is the result of applying endpoints changes with
// EndpointsMap.Update.
type UpdateEndpointsMapResult struct {
	// UpdatedServices lists the names of all services with added/updated/deleted
	// endpoints since the last Update.
	UpdatedServices sets.Set[types.NamespacedName]

	// DeletedUDPEndpoints identifies UDP endpoints that have just been deleted.
	// Existing conntrack NAT entries pointing to these endpoints must be deleted to
	// ensure that no further traffic for the Service gets delivered to them.
	DeletedUDPEndpoints []ServiceEndpoint

	// NewlyActiveUDPServices identifies UDP Services that have just gone from 0 to
	// non-0 endpoints. Existing conntrack entries caching the fact that these
	// services are black holes must be deleted to ensure that traffic can immediately
	// begin flowing to the new endpoints.
	NewlyActiveUDPServices []ServicePortName

	// LastChangeTriggerTimes holds the trigger times for all endpoints objects that
	// changed, keyed by namespaced name. It's used to export the network programming
	// latency.
	// NOTE(oxddr): this can be simplified to []time.Time if memory consumption becomes an issue.
	LastChangeTriggerTimes map[types.NamespacedName][]time.Time
}
   285  
// EndpointsMap maps a service port name to the list of all Endpoints for that
// service port.
type EndpointsMap map[ServicePortName][]Endpoint
   288  
   289  // Update updates em based on the changes in ect, returns information about the diff since
   290  // the last Update, triggers processEndpointsMapChange on every change, and clears the
   291  // changes map.
   292  func (em EndpointsMap) Update(ect *EndpointsChangeTracker) UpdateEndpointsMapResult {
   293  	result := UpdateEndpointsMapResult{
   294  		UpdatedServices:        sets.New[types.NamespacedName](),
   295  		DeletedUDPEndpoints:    make([]ServiceEndpoint, 0),
   296  		NewlyActiveUDPServices: make([]ServicePortName, 0),
   297  		LastChangeTriggerTimes: make(map[types.NamespacedName][]time.Time),
   298  	}
   299  	if ect == nil {
   300  		return result
   301  	}
   302  
   303  	changes := ect.checkoutChanges()
   304  	for nn, change := range changes {
   305  		if ect.processEndpointsMapChange != nil {
   306  			ect.processEndpointsMapChange(change.previous, change.current)
   307  		}
   308  		result.UpdatedServices.Insert(nn)
   309  
   310  		em.unmerge(change.previous)
   311  		em.merge(change.current)
   312  		detectStaleConntrackEntries(change.previous, change.current, &result.DeletedUDPEndpoints, &result.NewlyActiveUDPServices)
   313  	}
   314  	ect.checkoutTriggerTimes(&result.LastChangeTriggerTimes)
   315  
   316  	return result
   317  }
   318  
   319  // Merge ensures that the current EndpointsMap contains all <service, endpoints> pairs from the EndpointsMap passed in.
   320  func (em EndpointsMap) merge(other EndpointsMap) {
   321  	for svcPortName := range other {
   322  		em[svcPortName] = other[svcPortName]
   323  	}
   324  }
   325  
// unmerge removes from em every service port that appears as a key in other.
// Only the keys of other are consulted; the endpoint lists stored under them
// are not compared.
func (em EndpointsMap) unmerge(other EndpointsMap) {
	for svcPortName := range other {
		delete(em, svcPortName)
	}
}
   332  
   333  // getLocalEndpointIPs returns endpoints IPs if given endpoint is local - local means the endpoint is running in same host as kube-proxy.
   334  func (em EndpointsMap) getLocalReadyEndpointIPs() map[types.NamespacedName]sets.Set[string] {
   335  	localIPs := make(map[types.NamespacedName]sets.Set[string])
   336  	for svcPortName, epList := range em {
   337  		for _, ep := range epList {
   338  			// Only add ready endpoints for health checking. Terminating endpoints may still serve traffic
   339  			// but the health check signal should fail if there are only terminating endpoints on a node.
   340  			if !ep.IsReady() {
   341  				continue
   342  			}
   343  
   344  			if ep.IsLocal() {
   345  				nsn := svcPortName.NamespacedName
   346  				if localIPs[nsn] == nil {
   347  					localIPs[nsn] = sets.New[string]()
   348  				}
   349  				localIPs[nsn].Insert(ep.IP())
   350  			}
   351  		}
   352  	}
   353  	return localIPs
   354  }
   355  
   356  // LocalReadyEndpoints returns a map of Service names to the number of local ready
   357  // endpoints for that service.
   358  func (em EndpointsMap) LocalReadyEndpoints() map[types.NamespacedName]int {
   359  	// TODO: If this will appear to be computationally expensive, consider
   360  	// computing this incrementally similarly to endpointsMap.
   361  
   362  	// (Note that we need to call getLocalEndpointIPs first to squash the data by IP,
   363  	// because the EndpointsMap is sorted by IP+port, not just IP, and we want to
   364  	// consider a Service pointing to 10.0.0.1:80 and 10.0.0.1:443 to have 1 endpoint,
   365  	// not 2.)
   366  
   367  	eps := make(map[types.NamespacedName]int)
   368  	localIPs := em.getLocalReadyEndpointIPs()
   369  	for nsn, ips := range localIPs {
   370  		eps[nsn] = len(ips)
   371  	}
   372  	return eps
   373  }
   374  
   375  // detectStaleConntrackEntries detects services that may be associated with stale conntrack entries.
   376  // (See UpdateEndpointsMapResult.DeletedUDPEndpoints and .NewlyActiveUDPServices.)
   377  func detectStaleConntrackEntries(oldEndpointsMap, newEndpointsMap EndpointsMap, deletedUDPEndpoints *[]ServiceEndpoint, newlyActiveUDPServices *[]ServicePortName) {
   378  	// Find the UDP endpoints that we were sending traffic to in oldEndpointsMap, but
   379  	// are no longer sending to newEndpointsMap. The proxier should make sure that
   380  	// conntrack does not accidentally route any new connections to them.
   381  	for svcPortName, epList := range oldEndpointsMap {
   382  		if svcPortName.Protocol != v1.ProtocolUDP {
   383  			continue
   384  		}
   385  
   386  		for _, ep := range epList {
   387  			// If the old endpoint wasn't Serving then there can't be stale
   388  			// conntrack entries since there was no traffic sent to it.
   389  			if !ep.IsServing() {
   390  				continue
   391  			}
   392  
   393  			deleted := true
   394  			// Check if the endpoint has changed, including if it went from
   395  			// serving to not serving. If it did change stale entries for the old
   396  			// endpoint have to be cleared.
   397  			for i := range newEndpointsMap[svcPortName] {
   398  				if newEndpointsMap[svcPortName][i].String() == ep.String() {
   399  					deleted = false
   400  					break
   401  				}
   402  			}
   403  			if deleted {
   404  				klog.V(4).InfoS("Deleted endpoint may have stale conntrack entries", "portName", svcPortName, "endpoint", ep)
   405  				*deletedUDPEndpoints = append(*deletedUDPEndpoints, ServiceEndpoint{Endpoint: ep.String(), ServicePortName: svcPortName})
   406  			}
   407  		}
   408  	}
   409  
   410  	// Detect services that have gone from 0 to non-0 ready endpoints. If there were
   411  	// previously 0 endpoints, but someone tried to connect to it, then a conntrack
   412  	// entry may have been created blackholing traffic to that IP, which should be
   413  	// deleted now.
   414  	for svcPortName, epList := range newEndpointsMap {
   415  		if svcPortName.Protocol != v1.ProtocolUDP {
   416  			continue
   417  		}
   418  
   419  		epServing := 0
   420  		for _, ep := range epList {
   421  			if ep.IsServing() {
   422  				epServing++
   423  			}
   424  		}
   425  
   426  		oldEpServing := 0
   427  		for _, ep := range oldEndpointsMap[svcPortName] {
   428  			if ep.IsServing() {
   429  				oldEpServing++
   430  			}
   431  		}
   432  
   433  		if epServing > 0 && oldEpServing == 0 {
   434  			*newlyActiveUDPServices = append(*newlyActiveUDPServices, svcPortName)
   435  		}
   436  	}
   437  }