github.com/cilium/cilium@v1.16.2/pkg/maps/nat/nat.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package nat
     5  
     6  import (
     7  	"errors"
     8  	"fmt"
     9  	"math"
    10  	"strings"
    11  
    12  	"github.com/cilium/ebpf"
    13  
    14  	"golang.org/x/sys/unix"
    15  
    16  	"github.com/cilium/cilium/api/v1/models"
    17  	"github.com/cilium/cilium/pkg/bpf"
    18  	"github.com/cilium/cilium/pkg/logging"
    19  	"github.com/cilium/cilium/pkg/logging/logfields"
    20  	"github.com/cilium/cilium/pkg/maps/timestamp"
    21  	"github.com/cilium/cilium/pkg/option"
    22  	"github.com/cilium/cilium/pkg/tuple"
    23  )
    24  
var (
	// log is the package-level logger; every record it emits carries the
	// "map-nat" subsystem field.
	log = logging.DefaultLogger.WithField(logfields.LogSubsys, "map-nat")
)
    28  
const (
	// MapNameSnat4Global is the name of the global IPv4 NAT table.
	MapNameSnat4Global = "cilium_snat_v4_external"
	// MapNameSnat6Global is the name of the global IPv6 NAT table.
	MapNameSnat6Global = "cilium_snat_v6_external"

	// MinPortSnatDefault is the default lower bound of the SNAT port range.
	MinPortSnatDefault = 1024
	// MaxPortSnatDefault is the default upper bound of the SNAT port range.
	MaxPortSnatDefault = 65535
)
    40  
// Map represents a NAT map.
// It also implements the NatMap interface.
type Map struct {
	// Map is the embedded generic BPF map backing this NAT map.
	bpf.Map
	// family is the IP family of the map; NewMap uses it to pick the key and
	// value types, and the ApplyBatch*/Flush methods dispatch on it.
	family IPFamily
}
    47  
// NatEntry is the interface describing values to the NAT map.
type NatEntry interface {
	bpf.MapValue

	// ToHost converts fields to host byte order.
	ToHost() NatEntry

	// Dump renders the NAT entry as a string. key is the map key the entry
	// is stored under and toDeltaSecs formats a raw uint64 timestamp for
	// display.
	Dump(key NatKey, toDeltaSecs func(uint64) string) string
}
    58  
// NatMapRecord designates a map entry (key + value). It is called a "record"
// rather than an "entry" to avoid confusion with NatEntry, which is only the
// value part.
// This type is used for JSON dump and mock maps.
type NatMapRecord struct {
	// Key is the key part of the record.
	Key NatKey
	// Value is the value part of the record.
	Value NatEntry
}
    66  
// NatMap interface represents a NAT map, and can be reused to implement mock
// maps for unit tests.
type NatMap interface {
	// Open opens the map.
	Open() error
	// Close closes the map.
	Close() error
	// Path returns the path of the map.
	// NOTE(review): presumably the bpffs pin path; confirm against bpf.Map.
	Path() (string, error)
	// DumpEntries renders the entries of the map into a string.
	DumpEntries() (string, error)
	// DumpWithCallback walks the map and hands entries to the callback.
	DumpWithCallback(bpf.DumpCallback) error
}
    76  
    77  // NewMap instantiates a Map.
    78  func NewMap(name string, family IPFamily, entries int) *Map {
    79  	var mapKey bpf.MapKey
    80  	var mapValue bpf.MapValue
    81  
    82  	if family == IPv4 {
    83  		mapKey = &NatKey4{}
    84  		mapValue = &NatEntry4{}
    85  	} else {
    86  		mapKey = &NatKey6{}
    87  		mapValue = &NatEntry6{}
    88  	}
    89  
    90  	return &Map{
    91  		Map: *bpf.NewMap(
    92  			name,
    93  			ebpf.LRUHash,
    94  			mapKey,
    95  			mapValue,
    96  			entries,
    97  			0,
    98  		).WithCache().
    99  			WithEvents(option.Config.GetEventBufferConfig(name)).
   100  			WithPressureMetric(),
   101  		family: family,
   102  	}
   103  }
   104  
   105  func startingChunkSize(maxEntries int) int {
   106  	bucketSize := math.Sqrt(float64(maxEntries * 2))
   107  	nearest2 := math.Log2(bucketSize)
   108  	return int(math.Pow(2, math.Ceil(nearest2)))
   109  }
   110  
   111  // ApplyBatch4 uses batch iteration to walk the map and applies fn for each batch of entries.
   112  func (m *Map) ApplyBatch4(fn func([]tuple.TupleKey4, []NatEntry4, int)) (count int, err error) {
   113  	if m.family != IPv4 {
   114  		return 0, fmt.Errorf("not implemented: wrong ip family: %s", m.family)
   115  	}
   116  	return applyBatchReliably(m, fn)
   117  }
   118  
   119  // ApplyBatch4 uses batch iteration to walk the map and applies fn for each batch of entries.
   120  func (m *Map) ApplyBatch6(fn func([]tuple.TupleKey6, []NatEntry6, int)) (count int, err error) {
   121  	if m.family != IPv6 {
   122  		return 0, fmt.Errorf("not implemented: wrong ip family: %s", m.family)
   123  	}
   124  	return applyBatchReliably(m, fn)
   125  }
   126  
   127  func applyBatchReliably[KeyType, EntryType any](m *Map, fn func([]KeyType, []EntryType, int)) (count int, err error) {
   128  	var chunkSize = uint32(startingChunkSize(int(m.MaxEntries())))
   129  	const maxRetries = 3
   130  	for i := 0; i < maxRetries; i++ {
   131  		count, err = applyBatch(m, fn, chunkSize)
   132  		if err != nil {
   133  			// Lookup batch on LRU hash map may fail if the buffer passed is not big enough to
   134  			// accommodate the largest bucket size in the LRU map [1]
   135  			// Because bucket size, in general, cannot be known, we take the number of entries until
   136  			// we expect to see a hash map collision: sqrt(max_entries * 2)
   137  			// Default NAT map size is 262144 -> 2^ceil(log2(sqrt(262144 * 2))) = 1024, with key + entry size
   138  			// being ~ 432 bits, this means we'll need to allocate 55kb to accommodate this iteration.
   139  			// To avoid unbounded growth, each ENOSPC will result in a doubling of the chuck chunkSize
   140  			// which will persist into subsequent calls of Stats, up to a maximum of 3 (fold-increase).
   141  			//
   142  			// [1] https://elixir.bootlin.com/linux/latest/source/kernel/bpf/hashtab.c#L1776
   143  			if errors.Is(err, unix.ENOSPC) {
   144  				chunkSize *= 2
   145  				continue
   146  			}
   147  			return 0, fmt.Errorf("failed to count nat map: %w", err)
   148  		}
   149  		break
   150  	}
   151  	return count, err
   152  }
   153  
   154  func applyBatch[TupleType any, EntryType any](m *Map, fn func([]TupleType, []EntryType, int), chunkSize uint32) (count int, err error) {
   155  	kout := make([]TupleType, chunkSize)
   156  	vout := make([]EntryType, chunkSize)
   157  
   158  	var cursor ebpf.MapBatchCursor
   159  	for {
   160  		c, batchErr := m.BatchLookup(&cursor, kout, vout, nil)
   161  		count += c
   162  		fn(kout, vout, c)
   163  		if batchErr != nil {
   164  			if errors.Is(batchErr, ebpf.ErrKeyNotExist) {
   165  				return count, nil // end of map, we're done iterating
   166  			}
   167  			return count, batchErr
   168  		}
   169  	}
   170  }
   171  
   172  func (m *Map) Delete(k bpf.MapKey) (deleted bool, err error) {
   173  	deleted, err = (&m.Map).SilentDelete(k)
   174  	return
   175  }
   176  
   177  func (m *Map) DumpStats() *bpf.DumpStats {
   178  	return bpf.NewDumpStats(&m.Map)
   179  }
   180  
   181  func (m *Map) DumpReliablyWithCallback(cb bpf.DumpCallback, stats *bpf.DumpStats) error {
   182  	return (&m.Map).DumpReliablyWithCallback(cb, stats)
   183  }
   184  
   185  // DumpEntriesWithTimeDiff iterates through Map m and writes the values of the
   186  // nat entries in m to a string. If clockSource is not nil, it uses it to
   187  // compute the time difference of each entry from now and prints that too.
   188  func DumpEntriesWithTimeDiff(m NatMap, clockSource *models.ClockSource) (string, error) {
   189  	var toDeltaSecs func(uint64) string
   190  	var sb strings.Builder
   191  
   192  	if clockSource == nil {
   193  		toDeltaSecs = func(t uint64) string {
   194  			return fmt.Sprintf("? (raw %d)", t)
   195  		}
   196  	} else {
   197  		now, err := timestamp.GetCTCurTime(clockSource)
   198  		if err != nil {
   199  			return "", err
   200  		}
   201  		tsConverter, err := timestamp.NewCTTimeToSecConverter(clockSource)
   202  		if err != nil {
   203  			return "", err
   204  		}
   205  		tsecNow := tsConverter(now)
   206  		toDeltaSecs = func(t uint64) string {
   207  			tsec := tsConverter(uint64(t))
   208  			diff := int64(tsecNow) - int64(tsec)
   209  			return fmt.Sprintf("%dsec ago", diff)
   210  		}
   211  	}
   212  
   213  	cb := func(k bpf.MapKey, v bpf.MapValue) {
   214  		key := k.(NatKey)
   215  		if !key.ToHost().Dump(&sb, false) {
   216  			return
   217  		}
   218  		val := v.(NatEntry)
   219  		sb.WriteString(val.ToHost().Dump(key, toDeltaSecs))
   220  	}
   221  	err := m.DumpWithCallback(cb)
   222  	return sb.String(), err
   223  }
   224  
   225  // DoDumpEntries iterates through Map m and writes the values of the
   226  // nat entries in m to a string.
   227  func DoDumpEntries(m NatMap) (string, error) {
   228  	return DumpEntriesWithTimeDiff(m, nil)
   229  }
   230  
   231  // DumpEntries iterates through Map m and writes the values of the
   232  // nat entries in m to a string.
   233  func (m *Map) DumpEntries() (string, error) {
   234  	return DoDumpEntries(m)
   235  }
   236  
// gcStats collects the outcome of one deleting pass over a NAT map.
type gcStats struct {
	// DumpStats is the dump bookkeeping handed to the map dump.
	*bpf.DumpStats

	// deleted is the number of keys deleted
	deleted uint32

	// dumpError records any error that occurred during the dump.
	dumpError error
}
   246  
   247  func statStartGc(m *Map) gcStats {
   248  	return gcStats{
   249  		DumpStats: bpf.NewDumpStats(&m.Map),
   250  	}
   251  }
   252  
   253  func doFlush4(m *Map) gcStats {
   254  	stats := statStartGc(m)
   255  	filterCallback := func(key bpf.MapKey, _ bpf.MapValue) {
   256  		err := (&m.Map).Delete(key)
   257  		if err != nil {
   258  			log.WithError(err).WithField(logfields.Key, key.String()).Error("Unable to delete NAT entry")
   259  		} else {
   260  			stats.deleted++
   261  		}
   262  	}
   263  	stats.dumpError = m.DumpReliablyWithCallback(filterCallback, stats.DumpStats)
   264  	return stats
   265  }
   266  
   267  func doFlush6(m *Map) gcStats {
   268  	stats := statStartGc(m)
   269  	filterCallback := func(key bpf.MapKey, _ bpf.MapValue) {
   270  		err := (&m.Map).Delete(key)
   271  		if err != nil {
   272  			log.WithError(err).WithField(logfields.Key, key.String()).Error("Unable to delete NAT entry")
   273  		} else {
   274  			stats.deleted++
   275  		}
   276  	}
   277  	stats.dumpError = m.DumpReliablyWithCallback(filterCallback, stats.DumpStats)
   278  	return stats
   279  }
   280  
   281  // Flush deletes all NAT mappings from the given table.
   282  func (m *Map) Flush() int {
   283  	if m.family == IPv4 {
   284  		return int(doFlush4(m).deleted)
   285  	}
   286  
   287  	return int(doFlush6(m).deleted)
   288  }
   289  
   290  func DeleteMapping4(m *Map, ctKey *tuple.TupleKey4Global) error {
   291  	key := NatKey4{
   292  		TupleKey4Global: *ctKey,
   293  	}
   294  	// Workaround #5848.
   295  	addr := key.SourceAddr
   296  	key.SourceAddr = key.DestAddr
   297  	key.DestAddr = addr
   298  	valMap, err := m.Lookup(&key)
   299  	if err == nil {
   300  		val := *(valMap.(*NatEntry4))
   301  		rkey := key
   302  		rkey.SourceAddr = key.DestAddr
   303  		rkey.SourcePort = key.DestPort
   304  		rkey.DestAddr = val.Addr
   305  		rkey.DestPort = val.Port
   306  		rkey.Flags = tuple.TUPLE_F_IN
   307  
   308  		m.SilentDelete(&key)
   309  		m.SilentDelete(&rkey)
   310  	}
   311  	return nil
   312  }
   313  
   314  func DeleteMapping6(m *Map, ctKey *tuple.TupleKey6Global) error {
   315  	key := NatKey6{
   316  		TupleKey6Global: *ctKey,
   317  	}
   318  	// Workaround #5848.
   319  	addr := key.SourceAddr
   320  	key.SourceAddr = key.DestAddr
   321  	key.DestAddr = addr
   322  	valMap, err := m.Lookup(&key)
   323  	if err == nil {
   324  		val := *(valMap.(*NatEntry6))
   325  		rkey := key
   326  		rkey.SourceAddr = key.DestAddr
   327  		rkey.SourcePort = key.DestPort
   328  		rkey.DestAddr = val.Addr
   329  		rkey.DestPort = val.Port
   330  		rkey.Flags = tuple.TUPLE_F_IN
   331  
   332  		m.SilentDelete(&key)
   333  		m.SilentDelete(&rkey)
   334  	}
   335  	return nil
   336  }
   337  
   338  // Expects ingress tuple
   339  func DeleteSwappedMapping4(m *Map, ctKey *tuple.TupleKey4Global) error {
   340  	key := NatKey4{TupleKey4Global: *ctKey}
   341  	// Because of #5848, we need to reverse only ports
   342  	port := key.SourcePort
   343  	key.SourcePort = key.DestPort
   344  	key.DestPort = port
   345  	key.Flags = tuple.TUPLE_F_OUT
   346  	m.SilentDelete(&key)
   347  
   348  	return nil
   349  }
   350  
   351  // Expects ingress tuple
   352  func DeleteSwappedMapping6(m *Map, ctKey *tuple.TupleKey6Global) error {
   353  	key := NatKey6{TupleKey6Global: *ctKey}
   354  	// Because of #5848, we need to reverse only ports
   355  	port := key.SourcePort
   356  	key.SourcePort = key.DestPort
   357  	key.DestPort = port
   358  	key.Flags = tuple.TUPLE_F_OUT
   359  	m.SilentDelete(&key)
   360  
   361  	return nil
   362  }
   363  
   364  // GlobalMaps returns all global NAT maps.
   365  func GlobalMaps(ipv4, ipv6, nodeport bool) (ipv4Map, ipv6Map *Map) {
   366  	if !nodeport {
   367  		return
   368  	}
   369  	if ipv4 {
   370  		ipv4Map = NewMap(MapNameSnat4Global, IPv4, maxEntries())
   371  	}
   372  	if ipv6 {
   373  		ipv6Map = NewMap(MapNameSnat6Global, IPv6, maxEntries())
   374  	}
   375  	return
   376  }
   377  
   378  // ClusterMaps returns all NAT maps for given clusters
   379  func ClusterMaps(clusterID uint32, ipv4, ipv6 bool) (ipv4Map, ipv6Map *Map, err error) {
   380  	if ipv4 {
   381  		ipv4Map, err = GetClusterNATMap(clusterID, IPv4)
   382  		if err != nil {
   383  			return
   384  		}
   385  	}
   386  	if ipv6 {
   387  		ipv6Map, err = GetClusterNATMap(clusterID, IPv6)
   388  		if err != nil {
   389  			return
   390  		}
   391  	}
   392  	return
   393  }
   394  
   395  func maxEntries() int {
   396  	if option.Config.NATMapEntriesGlobal != 0 {
   397  		return option.Config.NATMapEntriesGlobal
   398  	}
   399  	return option.LimitTableMax
   400  }