github.com/cilium/cilium@v1.16.2/pkg/maps/nat/nat.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package nat 5 6 import ( 7 "errors" 8 "fmt" 9 "math" 10 "strings" 11 12 "github.com/cilium/ebpf" 13 14 "golang.org/x/sys/unix" 15 16 "github.com/cilium/cilium/api/v1/models" 17 "github.com/cilium/cilium/pkg/bpf" 18 "github.com/cilium/cilium/pkg/logging" 19 "github.com/cilium/cilium/pkg/logging/logfields" 20 "github.com/cilium/cilium/pkg/maps/timestamp" 21 "github.com/cilium/cilium/pkg/option" 22 "github.com/cilium/cilium/pkg/tuple" 23 ) 24 25 var ( 26 log = logging.DefaultLogger.WithField(logfields.LogSubsys, "map-nat") 27 ) 28 29 const ( 30 // MapNameSnat4Global represents global IPv4 NAT table. 31 MapNameSnat4Global = "cilium_snat_v4_external" 32 // MapNameSnat6Global represents global IPv6 NAT table. 33 MapNameSnat6Global = "cilium_snat_v6_external" 34 35 // MinPortSnatDefault represents default min port from range. 36 MinPortSnatDefault = 1024 37 // MaxPortSnatDefault represents default max port from range. 38 MaxPortSnatDefault = 65535 39 ) 40 41 // Map represents a NAT map. 42 // It also implements the NatMap interface. 43 type Map struct { 44 bpf.Map 45 family IPFamily 46 } 47 48 // NatEntry is the interface describing values to the NAT map. 49 type NatEntry interface { 50 bpf.MapValue 51 52 // ToHost converts fields to host byte order. 53 ToHost() NatEntry 54 55 // Dumps the Nat entry as string. 56 Dump(key NatKey, toDeltaSecs func(uint64) string) string 57 } 58 59 // A "Record" designates a map entry (key + value), but avoid "entry" because of 60 // possible confusion with "NatEntry" (actually the value part). 61 // This type is used for JSON dump and mock maps. 62 type NatMapRecord struct { 63 Key NatKey 64 Value NatEntry 65 } 66 67 // NatMap interface represents a NAT map, and can be reused to implement mock 68 // maps for unit tests. 69 type NatMap interface { 70 Open() error 71 Close() error 72 Path() (string, error) 73 DumpEntries() (string, error) 74 DumpWithCallback(bpf.DumpCallback) error 75 } 76 77 // NewMap instantiates a Map. 78 func NewMap(name string, family IPFamily, entries int) *Map { 79 var mapKey bpf.MapKey 80 var mapValue bpf.MapValue 81 82 if family == IPv4 { 83 mapKey = &NatKey4{} 84 mapValue = &NatEntry4{} 85 } else { 86 mapKey = &NatKey6{} 87 mapValue = &NatEntry6{} 88 } 89 90 return &Map{ 91 Map: *bpf.NewMap( 92 name, 93 ebpf.LRUHash, 94 mapKey, 95 mapValue, 96 entries, 97 0, 98 ).WithCache(). 99 WithEvents(option.Config.GetEventBufferConfig(name)). 100 WithPressureMetric(), 101 family: family, 102 } 103 } 104 105 func startingChunkSize(maxEntries int) int { 106 bucketSize := math.Sqrt(float64(maxEntries * 2)) 107 nearest2 := math.Log2(bucketSize) 108 return int(math.Pow(2, math.Ceil(nearest2))) 109 } 110 111 // ApplyBatch4 uses batch iteration to walk the map and applies fn for each batch of entries. 112 func (m *Map) ApplyBatch4(fn func([]tuple.TupleKey4, []NatEntry4, int)) (count int, err error) { 113 if m.family != IPv4 { 114 return 0, fmt.Errorf("not implemented: wrong ip family: %s", m.family) 115 } 116 return applyBatchReliably(m, fn) 117 } 118 119 // ApplyBatch4 uses batch iteration to walk the map and applies fn for each batch of entries. 120 func (m *Map) ApplyBatch6(fn func([]tuple.TupleKey6, []NatEntry6, int)) (count int, err error) { 121 if m.family != IPv6 { 122 return 0, fmt.Errorf("not implemented: wrong ip family: %s", m.family) 123 } 124 return applyBatchReliably(m, fn) 125 } 126 127 func applyBatchReliably[KeyType, EntryType any](m *Map, fn func([]KeyType, []EntryType, int)) (count int, err error) { 128 var chunkSize = uint32(startingChunkSize(int(m.MaxEntries()))) 129 const maxRetries = 3 130 for i := 0; i < maxRetries; i++ { 131 count, err = applyBatch(m, fn, chunkSize) 132 if err != nil { 133 // Lookup batch on LRU hash map may fail if the buffer passed is not big enough to 134 // accommodate the largest bucket size in the LRU map [1] 135 // Because bucket size, in general, cannot be known, we take the number of entries until 136 // we expect to see a hash map collision: sqrt(max_entries * 2) 137 // Default NAT map size is 262144 -> 2^ceil(log2(sqrt(262144 * 2))) = 1024, with key + entry size 138 // being ~ 432 bits, this means we'll need to allocate 55kb to accommodate this iteration. 139 // To avoid unbounded growth, each ENOSPC will result in a doubling of the chuck chunkSize 140 // which will persist into subsequent calls of Stats, up to a maximum of 3 (fold-increase). 141 // 142 // [1] https://elixir.bootlin.com/linux/latest/source/kernel/bpf/hashtab.c#L1776 143 if errors.Is(err, unix.ENOSPC) { 144 chunkSize *= 2 145 continue 146 } 147 return 0, fmt.Errorf("failed to count nat map: %w", err) 148 } 149 break 150 } 151 return count, err 152 } 153 154 func applyBatch[TupleType any, EntryType any](m *Map, fn func([]TupleType, []EntryType, int), chunkSize uint32) (count int, err error) { 155 kout := make([]TupleType, chunkSize) 156 vout := make([]EntryType, chunkSize) 157 158 var cursor ebpf.MapBatchCursor 159 for { 160 c, batchErr := m.BatchLookup(&cursor, kout, vout, nil) 161 count += c 162 fn(kout, vout, c) 163 if batchErr != nil { 164 if errors.Is(batchErr, ebpf.ErrKeyNotExist) { 165 return count, nil // end of map, we're done iterating 166 } 167 return count, batchErr 168 } 169 } 170 } 171 172 func (m *Map) Delete(k bpf.MapKey) (deleted bool, err error) { 173 deleted, err = (&m.Map).SilentDelete(k) 174 return 175 } 176 177 func (m *Map) DumpStats() *bpf.DumpStats { 178 return bpf.NewDumpStats(&m.Map) 179 } 180 181 func (m *Map) DumpReliablyWithCallback(cb bpf.DumpCallback, stats *bpf.DumpStats) error { 182 return (&m.Map).DumpReliablyWithCallback(cb, stats) 183 } 184 185 // DumpEntriesWithTimeDiff iterates through Map m and writes the values of the 186 // nat entries in m to a string. If clockSource is not nil, it uses it to 187 // compute the time difference of each entry from now and prints that too. 188 func DumpEntriesWithTimeDiff(m NatMap, clockSource *models.ClockSource) (string, error) { 189 var toDeltaSecs func(uint64) string 190 var sb strings.Builder 191 192 if clockSource == nil { 193 toDeltaSecs = func(t uint64) string { 194 return fmt.Sprintf("? (raw %d)", t) 195 } 196 } else { 197 now, err := timestamp.GetCTCurTime(clockSource) 198 if err != nil { 199 return "", err 200 } 201 tsConverter, err := timestamp.NewCTTimeToSecConverter(clockSource) 202 if err != nil { 203 return "", err 204 } 205 tsecNow := tsConverter(now) 206 toDeltaSecs = func(t uint64) string { 207 tsec := tsConverter(uint64(t)) 208 diff := int64(tsecNow) - int64(tsec) 209 return fmt.Sprintf("%dsec ago", diff) 210 } 211 } 212 213 cb := func(k bpf.MapKey, v bpf.MapValue) { 214 key := k.(NatKey) 215 if !key.ToHost().Dump(&sb, false) { 216 return 217 } 218 val := v.(NatEntry) 219 sb.WriteString(val.ToHost().Dump(key, toDeltaSecs)) 220 } 221 err := m.DumpWithCallback(cb) 222 return sb.String(), err 223 } 224 225 // DoDumpEntries iterates through Map m and writes the values of the 226 // nat entries in m to a string. 227 func DoDumpEntries(m NatMap) (string, error) { 228 return DumpEntriesWithTimeDiff(m, nil) 229 } 230 231 // DumpEntries iterates through Map m and writes the values of the 232 // nat entries in m to a string. 233 func (m *Map) DumpEntries() (string, error) { 234 return DoDumpEntries(m) 235 } 236 237 type gcStats struct { 238 *bpf.DumpStats 239 240 // deleted is the number of keys deleted 241 deleted uint32 242 243 // dumpError records any error that occurred during the dump. 244 dumpError error 245 } 246 247 func statStartGc(m *Map) gcStats { 248 return gcStats{ 249 DumpStats: bpf.NewDumpStats(&m.Map), 250 } 251 } 252 253 func doFlush4(m *Map) gcStats { 254 stats := statStartGc(m) 255 filterCallback := func(key bpf.MapKey, _ bpf.MapValue) { 256 err := (&m.Map).Delete(key) 257 if err != nil { 258 log.WithError(err).WithField(logfields.Key, key.String()).Error("Unable to delete NAT entry") 259 } else { 260 stats.deleted++ 261 } 262 } 263 stats.dumpError = m.DumpReliablyWithCallback(filterCallback, stats.DumpStats) 264 return stats 265 } 266 267 func doFlush6(m *Map) gcStats { 268 stats := statStartGc(m) 269 filterCallback := func(key bpf.MapKey, _ bpf.MapValue) { 270 err := (&m.Map).Delete(key) 271 if err != nil { 272 log.WithError(err).WithField(logfields.Key, key.String()).Error("Unable to delete NAT entry") 273 } else { 274 stats.deleted++ 275 } 276 } 277 stats.dumpError = m.DumpReliablyWithCallback(filterCallback, stats.DumpStats) 278 return stats 279 } 280 281 // Flush deletes all NAT mappings from the given table. 282 func (m *Map) Flush() int { 283 if m.family == IPv4 { 284 return int(doFlush4(m).deleted) 285 } 286 287 return int(doFlush6(m).deleted) 288 } 289 290 func DeleteMapping4(m *Map, ctKey *tuple.TupleKey4Global) error { 291 key := NatKey4{ 292 TupleKey4Global: *ctKey, 293 } 294 // Workaround #5848. 295 addr := key.SourceAddr 296 key.SourceAddr = key.DestAddr 297 key.DestAddr = addr 298 valMap, err := m.Lookup(&key) 299 if err == nil { 300 val := *(valMap.(*NatEntry4)) 301 rkey := key 302 rkey.SourceAddr = key.DestAddr 303 rkey.SourcePort = key.DestPort 304 rkey.DestAddr = val.Addr 305 rkey.DestPort = val.Port 306 rkey.Flags = tuple.TUPLE_F_IN 307 308 m.SilentDelete(&key) 309 m.SilentDelete(&rkey) 310 } 311 return nil 312 } 313 314 func DeleteMapping6(m *Map, ctKey *tuple.TupleKey6Global) error { 315 key := NatKey6{ 316 TupleKey6Global: *ctKey, 317 } 318 // Workaround #5848. 319 addr := key.SourceAddr 320 key.SourceAddr = key.DestAddr 321 key.DestAddr = addr 322 valMap, err := m.Lookup(&key) 323 if err == nil { 324 val := *(valMap.(*NatEntry6)) 325 rkey := key 326 rkey.SourceAddr = key.DestAddr 327 rkey.SourcePort = key.DestPort 328 rkey.DestAddr = val.Addr 329 rkey.DestPort = val.Port 330 rkey.Flags = tuple.TUPLE_F_IN 331 332 m.SilentDelete(&key) 333 m.SilentDelete(&rkey) 334 } 335 return nil 336 } 337 338 // Expects ingress tuple 339 func DeleteSwappedMapping4(m *Map, ctKey *tuple.TupleKey4Global) error { 340 key := NatKey4{TupleKey4Global: *ctKey} 341 // Because of #5848, we need to reverse only ports 342 port := key.SourcePort 343 key.SourcePort = key.DestPort 344 key.DestPort = port 345 key.Flags = tuple.TUPLE_F_OUT 346 m.SilentDelete(&key) 347 348 return nil 349 } 350 351 // Expects ingress tuple 352 func DeleteSwappedMapping6(m *Map, ctKey *tuple.TupleKey6Global) error { 353 key := NatKey6{TupleKey6Global: *ctKey} 354 // Because of #5848, we need to reverse only ports 355 port := key.SourcePort 356 key.SourcePort = key.DestPort 357 key.DestPort = port 358 key.Flags = tuple.TUPLE_F_OUT 359 m.SilentDelete(&key) 360 361 return nil 362 } 363 364 // GlobalMaps returns all global NAT maps. 365 func GlobalMaps(ipv4, ipv6, nodeport bool) (ipv4Map, ipv6Map *Map) { 366 if !nodeport { 367 return 368 } 369 if ipv4 { 370 ipv4Map = NewMap(MapNameSnat4Global, IPv4, maxEntries()) 371 } 372 if ipv6 { 373 ipv6Map = NewMap(MapNameSnat6Global, IPv6, maxEntries()) 374 } 375 return 376 } 377 378 // ClusterMaps returns all NAT maps for given clusters 379 func ClusterMaps(clusterID uint32, ipv4, ipv6 bool) (ipv4Map, ipv6Map *Map, err error) { 380 if ipv4 { 381 ipv4Map, err = GetClusterNATMap(clusterID, IPv4) 382 if err != nil { 383 return 384 } 385 } 386 if ipv6 { 387 ipv6Map, err = GetClusterNATMap(clusterID, IPv6) 388 if err != nil { 389 return 390 } 391 } 392 return 393 } 394 395 func maxEntries() int { 396 if option.Config.NATMapEntriesGlobal != 0 { 397 return option.Config.NATMapEntriesGlobal 398 } 399 return option.LimitTableMax 400 }