github.com/cilium/cilium@v1.16.2/pkg/maps/ctmap/ctmap.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package ctmap
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"fmt"
    10  	"io"
    11  	"math"
    12  	"net/netip"
    13  	"os"
    14  	"reflect"
    15  	"strings"
    16  
    17  	"github.com/cilium/ebpf"
    18  	"github.com/sirupsen/logrus"
    19  
    20  	"github.com/cilium/cilium/api/v1/models"
    21  	"github.com/cilium/cilium/pkg/bpf"
    22  	"github.com/cilium/cilium/pkg/controller"
    23  	"github.com/cilium/cilium/pkg/defaults"
    24  	"github.com/cilium/cilium/pkg/lock"
    25  	"github.com/cilium/cilium/pkg/logging"
    26  	"github.com/cilium/cilium/pkg/logging/logfields"
    27  	"github.com/cilium/cilium/pkg/maps/nat"
    28  	"github.com/cilium/cilium/pkg/maps/timestamp"
    29  	"github.com/cilium/cilium/pkg/metrics"
    30  	"github.com/cilium/cilium/pkg/option"
    31  	"github.com/cilium/cilium/pkg/time"
    32  	"github.com/cilium/cilium/pkg/tuple"
    33  	"github.com/cilium/cilium/pkg/u8proto"
    34  )
    35  
    36  var (
    37  	log = logging.DefaultLogger.WithField(logfields.LogSubsys, "map-ct")
    38  
    39  	// labelIPv6CTDumpInterrupts marks the count for conntrack dump resets (IPv6).
    40  	labelIPv6CTDumpInterrupts = map[string]string{
    41  		metrics.LabelDatapathArea:   "conntrack",
    42  		metrics.LabelDatapathName:   "dump_interrupts",
    43  		metrics.LabelDatapathFamily: "ipv6",
    44  	}
    45  	// labelIPv4CTDumpInterrupts marks the count for conntrack dump resets (IPv4).
    46  	labelIPv4CTDumpInterrupts = map[string]string{
    47  		metrics.LabelDatapathArea:   "conntrack",
    48  		metrics.LabelDatapathName:   "dump_interrupts",
    49  		metrics.LabelDatapathFamily: "ipv4",
    50  	}
    51  
    52  	mapInfo map[mapType]mapAttributes
    53  )
    54  
    55  const (
    56  	// mapCount counts the maximum number of CT maps that one endpoint may
    57  	// access at once.
    58  	mapCount = 4
    59  
    60  	// Map names for TCP CT tables are retained from Cilium 1.0 naming
    61  	// scheme to minimize disruption of ongoing connections during upgrade.
    62  	MapNamePrefix     = "cilium_ct"
    63  	MapNameTCP6       = MapNamePrefix + "6_"
    64  	MapNameTCP4       = MapNamePrefix + "4_"
    65  	MapNameTCP6Global = MapNameTCP6 + "global"
    66  	MapNameTCP4Global = MapNameTCP4 + "global"
    67  
    68  	// Map names for "any" protocols indicate CT for non-TCP protocols.
    69  	MapNameAny6       = MapNamePrefix + "_any6_"
    70  	MapNameAny4       = MapNamePrefix + "_any4_"
    71  	MapNameAny6Global = MapNameAny6 + "global"
    72  	MapNameAny4Global = MapNameAny4 + "global"
    73  
    74  	mapNumEntriesLocal = 64000
    75  
    76  	TUPLE_F_OUT     = 0
    77  	TUPLE_F_IN      = 1
    78  	TUPLE_F_RELATED = 2
    79  	TUPLE_F_SERVICE = 4
    80  
    81  	// MaxTime specifies the last possible time for GCFilter.Time
    82  	MaxTime = math.MaxUint32
    83  
    84  	metricsAlive   = "alive"
    85  	metricsDeleted = "deleted"
    86  
    87  	metricsIngress = "ingress"
    88  	metricsEgress  = "egress"
    89  )
    90  
    91  type action int
    92  
    93  const (
    94  	noAction action = iota
    95  	deleteEntry
    96  )
    97  
    98  var globalDeleteLock [mapTypeMax]lock.Mutex
    99  
   100  type mapAttributes struct {
   101  	natMapLock *lock.Mutex // Serializes concurrent accesses to natMap
   102  	natMap     *nat.Map
   103  }
   104  
   105  // CtMap interface represents a CT map, and can be reused to implement mock
   106  // maps for unit tests.
   107  type CtMap interface {
   108  	Open() error
   109  	Close() error
   110  	Path() (string, error)
   111  	DumpEntries() (string, error)
   112  	DumpWithCallback(bpf.DumpCallback) error
   113  	Count() (int, error)
   114  	Update(key bpf.MapKey, value bpf.MapValue) error
   115  }
   116  
   117  // A "Record" designates a map entry (key + value), but avoid "entry" because of
   118  // possible confusion with "CtEntry" (actually the value part).
   119  // This type is used for JSON dump and mock maps.
   120  type CtMapRecord struct {
   121  	Key   CtKey
   122  	Value CtEntry
   123  }
   124  
   125  // InitMapInfo builds the information about different CT maps for the
   126  // combination of L3/L4 protocols.
   127  func InitMapInfo(v4, v6, nodeport bool) {
   128  	global4Map, global6Map := nat.GlobalMaps(v4, v6, nodeport)
   129  	global4MapLock := &lock.Mutex{}
   130  	global6MapLock := &lock.Mutex{}
   131  
   132  	// SNAT also only works if the CT map is global so all local maps will be nil
   133  	mapInfo = map[mapType]mapAttributes{
   134  		mapTypeIPv4TCPGlobal: {natMap: global4Map, natMapLock: global4MapLock},
   135  		mapTypeIPv6TCPGlobal: {natMap: global6Map, natMapLock: global6MapLock},
   136  		mapTypeIPv4AnyGlobal: {natMap: global4Map, natMapLock: global4MapLock},
   137  		mapTypeIPv6AnyGlobal: {natMap: global6Map, natMapLock: global6MapLock},
   138  	}
   139  }
   140  
   141  // CtEndpoint represents an endpoint for the functions required to manage
   142  // conntrack maps for the endpoint.
   143  type CtEndpoint interface {
   144  	GetID() uint64
   145  }
   146  
   147  // Map represents an instance of a BPF connection tracking map.
   148  // It also implements the CtMap interface.
   149  type Map struct {
   150  	bpf.Map
   151  
   152  	mapType mapType
   153  	// define maps to the macro used in the datapath portion for the map
   154  	// name, for example 'CT_MAP4'.
   155  	define string
   156  
   157  	// This field indicates which cluster this ctmap is. Zero for global
   158  	// maps and non-zero for per-cluster maps.
   159  	clusterID uint32
   160  }
   161  
   162  // GCFilter contains the necessary fields to filter the CT maps.
   163  // Filtering by endpoint requires both EndpointID to be > 0 and
   164  // EndpointIP to be not nil.
   165  type GCFilter struct {
   166  	// RemoveExpired enables removal of all entries that have expired
   167  	RemoveExpired bool
   168  
   169  	// Time is the reference timestamp to remove expired entries. If
   170  	// RemoveExpired is true and lifetime is lesser than Time, the entry is
   171  	// removed
   172  	Time uint32
   173  
   174  	// ValidIPs is the list of valid IPs to scrub all entries for which the
   175  	// source or destination IP is *not* matching one of the valid IPs.
   176  	ValidIPs map[netip.Addr]struct{}
   177  
   178  	// MatchIPs is the list of IPs to remove from the conntrack table
   179  	MatchIPs map[netip.Addr]struct{}
   180  
   181  	// EmitCTEntry is called, when non-nil, if filtering by ValidIPs and MatchIPs
   182  	// passes. It has no impact on CT GC, but can be used to iterate over valid
   183  	// CT entries.
   184  	EmitCTEntryCB EmitCTEntryCBFunc
   185  }
   186  
   187  // EmitCTEntryCBFunc is the type used for the EmitCTEntryCB callback in GCFilter
   188  type EmitCTEntryCBFunc func(srcIP, dstIP netip.Addr, srcPort, dstPort uint16, nextHdr, flags uint8, entry *CtEntry)
   189  
   190  // DumpEntriesWithTimeDiff iterates through Map m and writes the values of the
   191  // ct entries in m to a string. If clockSource is not nil, it uses it to
   192  // compute the time difference of each entry from now and prints that too.
   193  func DumpEntriesWithTimeDiff(m CtMap, clockSource *models.ClockSource) (string, error) {
   194  	var toRemSecs func(uint32) string
   195  
   196  	if clockSource == nil {
   197  		toRemSecs = nil
   198  	} else {
   199  		now, err := timestamp.GetCTCurTime(clockSource)
   200  		if err != nil {
   201  			return "", err
   202  		}
   203  		tsConverter, err := timestamp.NewCTTimeToSecConverter(clockSource)
   204  		if err != nil {
   205  			return "", err
   206  		}
   207  		tsecNow := tsConverter(now)
   208  		toRemSecs = func(t uint32) string {
   209  			tsec := tsConverter(uint64(t))
   210  			diff := int64(tsec) - int64(tsecNow)
   211  			return fmt.Sprintf("remaining: %d sec(s)", diff)
   212  		}
   213  	}
   214  
   215  	var sb strings.Builder
   216  	cb := func(k bpf.MapKey, v bpf.MapValue) {
   217  		// No need to deep copy as the values are used to create new strings
   218  		key := k.(CtKey)
   219  		if !key.ToHost().Dump(&sb, true) {
   220  			return
   221  		}
   222  		value := v.(*CtEntry)
   223  		sb.WriteString(value.StringWithTimeDiff(toRemSecs))
   224  	}
   225  	// DumpWithCallback() must be called before sb.String().
   226  	err := m.DumpWithCallback(cb)
   227  	if err != nil {
   228  		return "", err
   229  	}
   230  	return sb.String(), err
   231  }
   232  
   233  // DoDumpEntries iterates through Map m and writes the values of the ct entries
   234  // in m to a string.
   235  func DoDumpEntries(m CtMap) (string, error) {
   236  	return DumpEntriesWithTimeDiff(m, nil)
   237  }
   238  
   239  // DumpEntries iterates through Map m and writes the values of the ct entries
   240  // in m to a string.
   241  func (m *Map) DumpEntries() (string, error) {
   242  	return DoDumpEntries(m)
   243  }
   244  
   245  // Count batch dumps the Map m and returns the count of the entries.
   246  func (m *Map) Count() (count int, err error) {
   247  	global := m.mapType.isGlobal()
   248  	v4 := m.mapType.isIPv4()
   249  	switch {
   250  	case global && v4:
   251  		return countBatch[CtKey4Global](m)
   252  	case global && !v4:
   253  		return countBatch[CtKey6Global](m)
   254  	case !global && v4:
   255  		return countBatch[CtKey4](m)
   256  	case !global && !v4:
   257  		return countBatch[CtKey6](m)
   258  	}
   259  	return
   260  }
   261  
   262  func countBatch[T any](m *Map) (count int, err error) {
   263  	// If we have a hash map of N = 2^n elements, then the first collision is
   264  	// expected [at random] when we insert around sqrt(2*N) elements. For
   265  	// example, for a map of size 1024, this is around 45 elements. In normal
   266  	// life input is not uniformly distributed, so there could be more
   267  	// collisions.
   268  	//
   269  	// In practice, we can expect maximum collision lengths (# of elements in a
   270  	// bucket ~= chunkSize) to be around 30-40. So anything like chunk_size=10%
   271  	// of map size should be pretty safe. If the chunkSize is not enough, then
   272  	// the kernel returns ENOSPC. In this case, it is possible to just set
   273  	// chunkSize *= 2 and try again. However, with the current chunkSize of
   274  	// 4096, we observe no issues dumping the maximum size of a CT map. As
   275  	// explained a bit below, 4096 was an optimal number considering idle
   276  	// memory usage and benchmarks (see commit msg).
   277  	//
   278  	// Credits to Anton for the above explanation of htab maps.
   279  	const chunkSize uint32 = 4096
   280  
   281  	// We can reuse the following buffers as the batch lookup does not care for
   282  	// the contents of the map. This saves on redundant memory allocations.
   283  	//
   284  	// The following is the number of KiB total that is allocated by Go for the
   285  	// following buffers based on the data type:
   286  	//   >>> (14*4096) / 1024 # CT IPv4 map key
   287  	//   56.0
   288  	//   >>> (38*4096) / 1024 # CT IPv6 map key
   289  	//   152.0
   290  	//   >>> (56*4096) / 1024 # CT map value
   291  	//   224.0
   292  	kout := make([]T, chunkSize)
   293  	vout := make([]CtEntry, chunkSize)
   294  
   295  	var cursor ebpf.MapBatchCursor
   296  	for {
   297  		c, batchErr := m.BatchLookup(&cursor, kout, vout, nil)
   298  		count += c
   299  		if batchErr != nil {
   300  			if errors.Is(batchErr, ebpf.ErrKeyNotExist) {
   301  				return count, nil // end of map, we're done iterating
   302  			}
   303  			return count, batchErr
   304  		}
   305  	}
   306  }
   307  
   308  // OpenCTMap is a convenience function to open CT maps. It is the
   309  // responsibility of the caller to ensure that m.Close() is called after this
   310  // function.
   311  func OpenCTMap(m CtMap) (path string, err error) {
   312  	path, err = m.Path()
   313  	if err == nil {
   314  		err = m.Open()
   315  	}
   316  	return
   317  }
   318  
   319  // newMap creates a new CT map of the specified type with the specified name.
   320  func newMap(mapName string, m mapType) *Map {
   321  	result := &Map{
   322  		Map: *bpf.NewMap(mapName,
   323  			ebpf.LRUHash,
   324  			m.key(),
   325  			m.value(),
   326  			m.maxEntries(),
   327  			0,
   328  		).WithPressureMetric(),
   329  		mapType: m,
   330  		define:  m.bpfDefine(),
   331  	}
   332  	return result
   333  }
   334  
   335  func purgeCtEntry6(m *Map, key CtKey, entry *CtEntry, natMap *nat.Map) error {
   336  	err := m.Delete(key)
   337  	if err != nil || natMap == nil {
   338  		return err
   339  	}
   340  
   341  	t := key.GetTupleKey()
   342  	tupleType := t.GetFlags()
   343  
   344  	if tupleType == tuple.TUPLE_F_OUT {
   345  		if entry.isDsrInternalEntry() {
   346  			// To delete NAT entries created by DSR
   347  			nat.DeleteSwappedMapping6(natMap, t.(*tuple.TupleKey6Global))
   348  		} else {
   349  			// To delete NAT entries created for SNAT
   350  			nat.DeleteMapping6(natMap, t.(*tuple.TupleKey6Global))
   351  
   352  		}
   353  	}
   354  
   355  	return nil
   356  }
   357  
   358  // doGC6 iterates through a CTv6 map and drops entries based on the given
   359  // filter.
   360  func doGC6(m *Map, filter *GCFilter) gcStats {
   361  	var natMap *nat.Map
   362  
   363  	if m.clusterID == 0 {
   364  		// global map handling
   365  		ctMap := mapInfo[m.mapType]
   366  		if ctMap.natMapLock != nil {
   367  			ctMap.natMapLock.Lock()
   368  			defer ctMap.natMapLock.Unlock()
   369  		}
   370  		natMap = ctMap.natMap
   371  	} else {
   372  		// per-cluster map handling
   373  		natm, err := nat.GetClusterNATMap(m.clusterID, nat.IPv6)
   374  		if err != nil {
   375  			log.WithError(err).Error("Unable to get per-cluster NAT map")
   376  		} else {
   377  			natMap = natm
   378  		}
   379  	}
   380  
   381  	stats := statStartGc(m)
   382  	defer stats.finish()
   383  
   384  	if natMap != nil {
   385  		err := natMap.Open()
   386  		if err == nil {
   387  			defer natMap.Close()
   388  		} else {
   389  			natMap = nil
   390  		}
   391  	}
   392  
   393  	filterCallback := func(key bpf.MapKey, value bpf.MapValue) {
   394  		entry := value.(*CtEntry)
   395  
   396  		switch obj := key.(type) {
   397  		case *CtKey6Global:
   398  			currentKey6Global := obj
   399  			// In CT entries, the source address of the conntrack entry (`SourceAddr`) is
   400  			// the destination of the packet received, therefore it's the packet's
   401  			// destination IP
   402  			action := filter.doFiltering(currentKey6Global.DestAddr.Addr(), currentKey6Global.SourceAddr.Addr(),
   403  				currentKey6Global.DestPort, currentKey6Global.SourcePort,
   404  				uint8(currentKey6Global.NextHeader), currentKey6Global.Flags, entry)
   405  
   406  			switch action {
   407  			case deleteEntry:
   408  				err := purgeCtEntry6(m, currentKey6Global, entry, natMap)
   409  				if err != nil {
   410  					log.WithError(err).WithField(logfields.Key, currentKey6Global.String()).Error("Unable to delete CT entry")
   411  				} else {
   412  					stats.deleted++
   413  				}
   414  			default:
   415  				stats.aliveEntries++
   416  			}
   417  		case *CtKey6:
   418  			currentKey6 := obj
   419  			// In CT entries, the source address of the conntrack entry (`SourceAddr`) is
   420  			// the destination of the packet received, therefore it's the packet's
   421  			// destination IP
   422  			action := filter.doFiltering(currentKey6.DestAddr.Addr(), currentKey6.SourceAddr.Addr(),
   423  				currentKey6.DestPort, currentKey6.SourcePort,
   424  				uint8(currentKey6.NextHeader), currentKey6.Flags, entry)
   425  
   426  			switch action {
   427  			case deleteEntry:
   428  				err := purgeCtEntry6(m, currentKey6, entry, natMap)
   429  				if err != nil {
   430  					log.WithError(err).WithField(logfields.Key, currentKey6.String()).Error("Unable to delete CT entry")
   431  				} else {
   432  					stats.deleted++
   433  				}
   434  			default:
   435  				stats.aliveEntries++
   436  			}
   437  		default:
   438  			log.Warningf("Encountered unknown type while scanning conntrack table: %v", reflect.TypeOf(key))
   439  		}
   440  	}
   441  
   442  	// See doGC4() comment.
   443  	globalDeleteLock[m.mapType].Lock()
   444  	stats.dumpError = m.DumpReliablyWithCallback(filterCallback, stats.DumpStats)
   445  	globalDeleteLock[m.mapType].Unlock()
   446  	return stats
   447  }
   448  
   449  func purgeCtEntry4(m *Map, key CtKey, entry *CtEntry, natMap *nat.Map) error {
   450  	err := m.Delete(key)
   451  	if err != nil || natMap == nil {
   452  		return err
   453  	}
   454  
   455  	t := key.GetTupleKey()
   456  	tupleType := t.GetFlags()
   457  
   458  	if tupleType == tuple.TUPLE_F_OUT {
   459  		if entry.isDsrInternalEntry() {
   460  			// To delete NAT entries created by DSR
   461  			nat.DeleteSwappedMapping4(natMap, t.(*tuple.TupleKey4Global))
   462  		} else {
   463  			// To delete NAT entries created for SNAT
   464  			nat.DeleteMapping4(natMap, t.(*tuple.TupleKey4Global))
   465  		}
   466  	}
   467  
   468  	return nil
   469  }
   470  
   471  // doGC4 iterates through a CTv4 map and drops entries based on the given
   472  // filter.
   473  func doGC4(m *Map, filter *GCFilter) gcStats {
   474  	var natMap *nat.Map
   475  
   476  	if m.clusterID == 0 {
   477  		// global map handling
   478  		ctMap := mapInfo[m.mapType]
   479  		if ctMap.natMapLock != nil {
   480  			ctMap.natMapLock.Lock()
   481  			defer ctMap.natMapLock.Unlock()
   482  		}
   483  		natMap = ctMap.natMap
   484  	} else {
   485  		// per-cluster map handling
   486  		natm, err := nat.GetClusterNATMap(m.clusterID, nat.IPv4)
   487  		if err != nil {
   488  			log.WithError(err).Error("Unable to get per-cluster NAT map")
   489  		} else {
   490  			natMap = natm
   491  		}
   492  	}
   493  
   494  	stats := statStartGc(m)
   495  	defer stats.finish()
   496  
   497  	if natMap != nil {
   498  		if err := natMap.Open(); err == nil {
   499  			defer natMap.Close()
   500  		} else {
   501  			natMap = nil
   502  		}
   503  	}
   504  
   505  	filterCallback := func(key bpf.MapKey, value bpf.MapValue) {
   506  		entry := value.(*CtEntry)
   507  
   508  		switch obj := key.(type) {
   509  		case *CtKey4Global:
   510  			currentKey4Global := obj
   511  			// In CT entries, the source address of the conntrack entry (`SourceAddr`) is
   512  			// the destination of the packet received, therefore it's the packet's
   513  			// destination IP
   514  			action := filter.doFiltering(currentKey4Global.DestAddr.Addr(), currentKey4Global.SourceAddr.Addr(),
   515  				currentKey4Global.DestPort, currentKey4Global.SourcePort,
   516  				uint8(currentKey4Global.NextHeader), currentKey4Global.Flags, entry)
   517  
   518  			switch action {
   519  			case deleteEntry:
   520  				err := purgeCtEntry4(m, currentKey4Global, entry, natMap)
   521  				if err != nil {
   522  					log.WithError(err).WithField(logfields.Key, currentKey4Global.String()).Error("Unable to delete CT entry")
   523  				} else {
   524  					stats.deleted++
   525  				}
   526  			default:
   527  				stats.aliveEntries++
   528  			}
   529  		case *CtKey4:
   530  			currentKey4 := obj
   531  			// In CT entries, the source address of the conntrack entry (`SourceAddr`) is
   532  			// the destination of the packet received, therefore it's the packet's
   533  			// destination IP
   534  			action := filter.doFiltering(currentKey4.DestAddr.Addr(), currentKey4.SourceAddr.Addr(),
   535  				currentKey4.DestPort, currentKey4.SourcePort,
   536  				uint8(currentKey4.NextHeader), currentKey4.Flags, entry)
   537  
   538  			switch action {
   539  			case deleteEntry:
   540  				err := purgeCtEntry4(m, currentKey4, entry, natMap)
   541  				if err != nil {
   542  					log.WithError(err).WithField(logfields.Key, currentKey4.String()).Error("Unable to delete CT entry")
   543  				} else {
   544  					stats.deleted++
   545  				}
   546  			default:
   547  				stats.aliveEntries++
   548  			}
   549  		default:
   550  			log.Warningf("Encountered unknown type while scanning conntrack table: %v", reflect.TypeOf(key))
   551  		}
   552  	}
   553  
   554  	// We serialize the deletions in order to avoid forced map walk restarts
   555  	// when keys are being evicted underneath us from concurrent goroutines.
   556  	globalDeleteLock[m.mapType].Lock()
   557  	stats.dumpError = m.DumpReliablyWithCallback(filterCallback, stats.DumpStats)
   558  	globalDeleteLock[m.mapType].Unlock()
   559  	return stats
   560  }
   561  
   562  func (f *GCFilter) doFiltering(srcIP, dstIP netip.Addr, srcPort, dstPort uint16, nextHdr, flags uint8, entry *CtEntry) action {
   563  	if f.RemoveExpired && entry.Lifetime < f.Time {
   564  		return deleteEntry
   565  	}
   566  	if f.ValidIPs != nil {
   567  		_, srcIPExists := f.ValidIPs[srcIP]
   568  		_, dstIPExists := f.ValidIPs[dstIP]
   569  		if !srcIPExists && !dstIPExists {
   570  			return deleteEntry
   571  		}
   572  	}
   573  
   574  	if f.MatchIPs != nil {
   575  		_, srcIPExists := f.MatchIPs[srcIP]
   576  		_, dstIPExists := f.MatchIPs[dstIP]
   577  		if srcIPExists || dstIPExists {
   578  			return deleteEntry
   579  		}
   580  	}
   581  
   582  	if f.EmitCTEntryCB != nil {
   583  		f.EmitCTEntryCB(srcIP, dstIP, srcPort, dstPort, nextHdr, flags, entry)
   584  	}
   585  
   586  	return noAction
   587  }
   588  
   589  func doGC(m *Map, filter *GCFilter) (int, error) {
   590  	if m.mapType.isIPv6() {
   591  		stats := doGC6(m, filter)
   592  		return int(stats.deleted), stats.dumpError
   593  	} else if m.mapType.isIPv4() {
   594  		stats := doGC4(m, filter)
   595  		return int(stats.deleted), stats.dumpError
   596  	}
   597  	log.Fatalf("Unsupported ct map type: %s", m.mapType.String())
   598  	return 0, fmt.Errorf("unsupported ct map type: %s", m.mapType.String())
   599  }
   600  
   601  // GC runs garbage collection for map m with name mapType with the given filter.
   602  // It returns how many items were deleted from m.
   603  func GC(m *Map, filter *GCFilter) (int, error) {
   604  	if filter.RemoveExpired {
   605  		t, _ := timestamp.GetCTCurTime(timestamp.GetClockSourceFromOptions())
   606  		filter.Time = uint32(t)
   607  	}
   608  
   609  	return doGC(m, filter)
   610  }
   611  
   612  // PurgeOrphanNATEntries removes orphan SNAT entries. We call an SNAT entry
   613  // orphan if it does not have a corresponding CT entry.
   614  //
   615  // Typically NAT entries should get removed along with their owning CT entry,
   616  // as part of purgeCtEntry*(). But stale NAT entries can get left behind if the
   617  // CT entry disappears for other reasons - for instance by LRU eviction, or
   618  // when the datapath re-purposes the CT entry.
   619  //
   620  // PurgeOrphanNATEntries() is triggered by the datapath via the GC signaling
   621  // mechanism. When the datapath SNAT fails to find free mapping after
   622  // SNAT_SIGNAL_THRES attempts, it sends the signal via the perf ring buffer.
   623  // The consumer of the buffer invokes the function.
   624  //
   625  // The SNAT is being used for the following cases:
   626  //  1. By NodePort BPF on an intermediate node before fwd'ing request from outside
   627  //     to a destination node.
   628  //  2. A packet from local endpoint sent to outside (BPF-masq).
   629  //  3. A packet from a host local application (i.e. running in the host netns)
   630  //     This is needed to prevent SNAT from hijacking such connections.
   631  //  4. By DSR on a backend node to SNAT responses with service IP+port before
   632  //     sending to a client.
   633  //
   634  // In all 4 cases we create a CT_EGRESS CT entry. This allows the
   635  // CT GC to remove corresponding SNAT entries.
   636  // See the unit test TestOrphanNatGC for more examples.
   637  func PurgeOrphanNATEntries(ctMapTCP, ctMapAny *Map) *NatGCStats {
   638  	// Both CT maps should point to the same natMap, so use the first one
   639  	// to determine natMap
   640  	ctMap := mapInfo[ctMapTCP.mapType]
   641  	if ctMap.natMapLock != nil {
   642  		ctMap.natMapLock.Lock()
   643  		defer ctMap.natMapLock.Unlock()
   644  	}
   645  	natMap := ctMap.natMap
   646  	if natMap == nil {
   647  		return nil
   648  	}
   649  
   650  	family := gcFamilyIPv4
   651  	if ctMapTCP.mapType.isIPv6() {
   652  		family = gcFamilyIPv6
   653  	}
   654  	stats := newNatGCStats(natMap, family)
   655  	defer stats.finish()
   656  
   657  	cb := func(key bpf.MapKey, value bpf.MapValue) {
   658  		natKey := key.(nat.NatKey)
   659  		natVal := value.(nat.NatEntry)
   660  
   661  		ctMap := ctMapAny
   662  		if natKey.GetNextHeader() == u8proto.TCP {
   663  			ctMap = ctMapTCP
   664  		}
   665  
   666  		if natKey.GetFlags()&tuple.TUPLE_F_IN == tuple.TUPLE_F_IN { // natKey is r(everse)tuple
   667  			ctKey := egressCTKeyFromIngressNatKeyAndVal(natKey, natVal)
   668  
   669  			if !ctEntryExist(ctMap, ctKey, nil) {
   670  				// No egress CT entry is found, delete the orphan ingress SNAT entry
   671  				if deleted, _ := natMap.Delete(natKey); deleted {
   672  					stats.IngressDeleted++
   673  				}
   674  			} else {
   675  				stats.IngressAlive++
   676  			}
   677  		} else if natKey.GetFlags()&tuple.TUPLE_F_OUT == tuple.TUPLE_F_OUT {
   678  			checkDsr := func(entry *CtEntry) bool {
   679  				return entry.isDsrInternalEntry()
   680  			}
   681  
   682  			egressCTKey := egressCTKeyFromEgressNatKey(natKey)
   683  			dsrCTKey := dsrCTKeyFromEgressNatKey(natKey)
   684  
   685  			if !ctEntryExist(ctMap, egressCTKey, nil) &&
   686  				!ctEntryExist(ctMap, dsrCTKey, checkDsr) {
   687  				// No relevant CT entries were found, delete the orphan egress NAT entry
   688  				if deleted, _ := natMap.Delete(natKey); deleted {
   689  					stats.EgressDeleted++
   690  				}
   691  			} else {
   692  				stats.EgressAlive++
   693  			}
   694  		}
   695  	}
   696  
   697  	if err := natMap.DumpReliablyWithCallback(cb, stats.DumpStats); err != nil {
   698  		log.WithError(err).Error("NATmap dump failed during GC")
   699  	} else {
   700  		natMap.UpdatePressureMetricWithSize(int32(stats.IngressAlive + stats.EgressAlive))
   701  	}
   702  
   703  	return &stats
   704  }
   705  
   706  // Flush runs garbage collection for map m with the name mapType, deleting all
   707  // entries. The specified map must be already opened using bpf.OpenMap().
   708  func (m *Map) Flush() int {
   709  	d, _ := doGC(m, &GCFilter{
   710  		RemoveExpired: true,
   711  		Time:          MaxTime,
   712  	})
   713  	return d
   714  }
   715  
   716  // DeleteIfUpgradeNeeded attempts to open the conntrack maps associated with
   717  // the specified endpoint, and delete the maps from the filesystem if any
   718  // properties do not match the properties defined in this package.
   719  //
   720  // The typical trigger for this is when, for example, the CT entry size changes
   721  // from one version of Cilium to the next. When Cilium restarts, it may opt
   722  // to restore endpoints from the prior life. Existing endpoints that use the
   723  // old map style are incompatible with the new version, so the CT map must be
   724  // destroyed and recreated during upgrade. By removing the old map location
   725  // from the filesystem, we ensure that the next time that the endpoint is
   726  // regenerated, it will recreate a new CT map with the new properties.
   727  //
   728  // Note that if an existing BPF program refers to the map at the canonical
   729  // paths (as fetched via the getMapPathsToKeySize() call below), then that BPF
   730  // program will continue to operate on the old map, even once the map is
   731  // removed from the filesystem. The old map will only be completely cleaned up
   732  // once all referenced to the map are cleared - that is, all BPF programs which
   733  // refer to the old map and removed/reloaded.
   734  func DeleteIfUpgradeNeeded(e CtEndpoint) {
   735  	for _, newMap := range maps(e, true, true) {
   736  		path, err := newMap.Path()
   737  		if err != nil {
   738  			log.WithError(err).Warning("Failed to get path for CT map")
   739  			continue
   740  		}
   741  		scopedLog := log.WithField(logfields.Path, path)
   742  
   743  		// Pass nil key and value types since we're not intending on accessing the
   744  		// map's contents.
   745  		oldMap, err := bpf.OpenMap(path, nil, nil)
   746  		if err != nil {
   747  			scopedLog.WithError(err).Debug("Couldn't open CT map for upgrade")
   748  			continue
   749  		}
   750  		defer oldMap.Close()
   751  
   752  		if oldMap.CheckAndUpgrade(&newMap.Map) {
   753  			scopedLog.Warning("CT Map upgraded, expect brief disruption of ongoing connections")
   754  		}
   755  	}
   756  }
   757  
   758  // maps returns all connecting tracking maps associated with endpoint 'e' (or
   759  // the global maps if 'e' is nil).
   760  func maps(e CtEndpoint, ipv4, ipv6 bool) []*Map {
   761  	result := make([]*Map, 0, mapCount)
   762  	if e == nil {
   763  		if ipv4 {
   764  			result = append(result, newMap(MapNameTCP4Global, mapTypeIPv4TCPGlobal))
   765  			result = append(result, newMap(MapNameAny4Global, mapTypeIPv4AnyGlobal))
   766  		}
   767  		if ipv6 {
   768  			result = append(result, newMap(MapNameTCP6Global, mapTypeIPv6TCPGlobal))
   769  			result = append(result, newMap(MapNameAny6Global, mapTypeIPv6AnyGlobal))
   770  		}
   771  	} else {
   772  		if ipv4 {
   773  			result = append(result, newMap(bpf.LocalMapName(MapNameTCP4, uint16(e.GetID())),
   774  				mapTypeIPv4TCPLocal))
   775  			result = append(result, newMap(bpf.LocalMapName(MapNameAny4, uint16(e.GetID())),
   776  				mapTypeIPv4AnyLocal))
   777  		}
   778  		if ipv6 {
   779  			result = append(result, newMap(bpf.LocalMapName(MapNameTCP6, uint16(e.GetID())),
   780  				mapTypeIPv6TCPLocal))
   781  			result = append(result, newMap(bpf.LocalMapName(MapNameAny6, uint16(e.GetID())),
   782  				mapTypeIPv6AnyLocal))
   783  		}
   784  	}
   785  	return result
   786  }
   787  
   788  // LocalMaps returns a slice of CT maps for the endpoint, which are local to
   789  // the endpoint and not shared with other endpoints. If ipv4 or ipv6 are false,
   790  // the maps for that protocol will not be returned.
   791  //
   792  // The returned maps are not yet opened.
   793  func LocalMaps(e CtEndpoint, ipv4, ipv6 bool) []*Map {
   794  	return maps(e, ipv4, ipv6)
   795  }
   796  
   797  // GlobalMaps returns a slice of CT maps that are used globally by all
   798  // endpoints that are not otherwise configured to use their own local maps.
   799  // If ipv4 or ipv6 are false, the maps for that protocol will not be returned.
   800  //
   801  // The returned maps are not yet opened.
   802  func GlobalMaps(ipv4, ipv6 bool) []*Map {
   803  	return maps(nil, ipv4, ipv6)
   804  }
   805  
   806  // NameIsGlobal returns true if the specified filename (basename) denotes a
   807  // global conntrack map.
   808  func NameIsGlobal(filename string) bool {
   809  	switch filename {
   810  	case MapNameTCP4Global, MapNameAny4Global, MapNameTCP6Global, MapNameAny6Global:
   811  		return true
   812  	}
   813  	return false
   814  }
   815  
   816  // WriteBPFMacros writes the map names for conntrack maps into the specified
   817  // writer, defining usage of the global map or local maps depending on whether
   818  // the specified CtEndpoint is nil.
   819  func WriteBPFMacros(fw io.Writer, e CtEndpoint) {
   820  	var mapEntriesTCP, mapEntriesAny int
   821  	for _, m := range maps(e, true, true) {
   822  		fmt.Fprintf(fw, "#define %s %s\n", m.define, m.Name())
   823  		if m.mapType.isTCP() {
   824  			mapEntriesTCP = m.mapType.maxEntries()
   825  		} else {
   826  			mapEntriesAny = m.mapType.maxEntries()
   827  		}
   828  	}
   829  	fmt.Fprintf(fw, "#define CT_MAP_SIZE_TCP %d\n", mapEntriesTCP)
   830  	fmt.Fprintf(fw, "#define CT_MAP_SIZE_ANY %d\n", mapEntriesAny)
   831  }
   832  
   833  // Exists returns false if the CT maps for the specified endpoint (or global
   834  // maps if nil) are not pinned to the filesystem, or true if they exist or
   835  // an internal error occurs.
   836  func Exists(e CtEndpoint, ipv4, ipv6 bool) bool {
   837  	result := true
   838  	for _, m := range maps(e, ipv4, ipv6) {
   839  		path, err := m.Path()
   840  		if err != nil {
   841  			// Catch this error early
   842  			return true
   843  		}
   844  		if _, err = os.Stat(path); os.IsNotExist(err) {
   845  			result = false
   846  		}
   847  	}
   848  
   849  	return result
   850  }
   851  
// cachedGCInterval is the interval most recently stored by calculateInterval.
// It is zero until calculateInterval has run with a non-zero delete ratio;
// GetInterval substitutes defaults.ConntrackGCStartingInterval in that case.
//
// NOTE(review): the functions that read and write this variable take no lock;
// confirm that callers never run them concurrently.
var cachedGCInterval time.Duration
   853  
   854  // GetInterval returns the interval adjusted based on the deletion ratio of the
   855  // last run
   856  func GetInterval(actualPrevInterval time.Duration, maxDeleteRatio float64) time.Duration {
   857  	if val := option.Config.ConntrackGCInterval; val != time.Duration(0) {
   858  		return val
   859  	}
   860  
   861  	expectedPrevInterval := cachedGCInterval
   862  	adjustedDeleteRatio := maxDeleteRatio
   863  	if expectedPrevInterval == time.Duration(0) {
   864  		expectedPrevInterval = defaults.ConntrackGCStartingInterval
   865  	} else if actualPrevInterval < expectedPrevInterval && actualPrevInterval > 0 {
   866  		adjustedDeleteRatio *= float64(expectedPrevInterval) / float64(actualPrevInterval)
   867  	}
   868  
   869  	newInterval := calculateInterval(expectedPrevInterval, adjustedDeleteRatio)
   870  	if val := option.Config.ConntrackGCMaxInterval; val != time.Duration(0) && newInterval > val {
   871  		newInterval = val
   872  	}
   873  
   874  	if newInterval != expectedPrevInterval {
   875  		log.WithFields(logrus.Fields{
   876  			"expectedPrevInterval": expectedPrevInterval,
   877  			"actualPrevInterval":   actualPrevInterval,
   878  			"newInterval":          newInterval,
   879  			"deleteRatio":          maxDeleteRatio,
   880  			"adjustedDeleteRatio":  adjustedDeleteRatio,
   881  		}).Info("Conntrack garbage collector interval recalculated")
   882  	}
   883  
   884  	return newInterval
   885  }
   886  
   887  func calculateInterval(prevInterval time.Duration, maxDeleteRatio float64) (interval time.Duration) {
   888  	interval = prevInterval
   889  
   890  	if maxDeleteRatio == 0.0 {
   891  		return
   892  	}
   893  
   894  	switch {
   895  	case maxDeleteRatio > 0.25:
   896  		if maxDeleteRatio > 0.9 {
   897  			maxDeleteRatio = 0.9
   898  		}
   899  		// 25%..90% => 1.3x..10x shorter
   900  		interval = time.Duration(float64(interval) * (1.0 - maxDeleteRatio)).Round(time.Second)
   901  
   902  		if interval < defaults.ConntrackGCMinInterval {
   903  			interval = defaults.ConntrackGCMinInterval
   904  		}
   905  
   906  	case maxDeleteRatio < 0.05:
   907  		// When less than 5% of entries were deleted, increase the
   908  		// interval. Use a simple 1.5x multiplier to start growing slowly
   909  		// as a new node may not be seeing workloads yet and thus the
   910  		// scan will return a low deletion ratio at first.
   911  		interval = time.Duration(float64(interval) * 1.5).Round(time.Second)
   912  		if interval > defaults.ConntrackGCMaxLRUInterval {
   913  			interval = defaults.ConntrackGCMaxLRUInterval
   914  		}
   915  	}
   916  
   917  	cachedGCInterval = interval
   918  
   919  	return
   920  }
   921  
   922  // CalculateCTMapPressure is a controller that calculates the BPF CT map
   923  // pressure and pubishes it as part of the BPF map pressure metric.
   924  func CalculateCTMapPressure(mgr *controller.Manager, allMaps ...*Map) {
   925  	ctx, cancel := context.WithCancelCause(context.Background())
   926  	mgr.UpdateController("ct-map-pressure", controller.ControllerParams{
   927  		Group: controller.Group{
   928  			Name: "ct-map-pressure",
   929  		},
   930  		DoFunc: func(context.Context) error {
   931  			var errs error
   932  			for _, m := range allMaps {
   933  				path, err := OpenCTMap(m)
   934  				if err != nil {
   935  					msg := "Skipping CT map pressure calculation"
   936  					scopedLog := log.WithError(err).WithField(logfields.Path, path)
   937  					if os.IsNotExist(err) {
   938  						scopedLog.Debug(msg)
   939  					} else {
   940  						scopedLog.Warn(msg)
   941  					}
   942  					continue
   943  				}
   944  				defer m.Close()
   945  
   946  				count, err := m.Count()
   947  				if errors.Is(err, ebpf.ErrNotSupported) {
   948  					// We don't have batch ops, so cancel context to kill this
   949  					// controller.
   950  					cancel(err)
   951  					return err
   952  				}
   953  				if err != nil {
   954  					errs = errors.Join(errs, fmt.Errorf("failed to dump CT map %v: %w", m.Name(), err))
   955  				}
   956  				m.UpdatePressureMetricWithSize(int32(count))
   957  			}
   958  			return errs
   959  		},
   960  		RunInterval: 30 * time.Second,
   961  		Context:     ctx,
   962  	})
   963  }