github.com/cilium/cilium@v1.16.2/pkg/maps/ctmap/ctmap.go

// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package ctmap

import (
	"context"
	"errors"
	"fmt"
	"io"
	"math"
	"net/netip"
	"os"
	"reflect"
	"strings"

	"github.com/cilium/ebpf"
	"github.com/sirupsen/logrus"

	"github.com/cilium/cilium/api/v1/models"
	"github.com/cilium/cilium/pkg/bpf"
	"github.com/cilium/cilium/pkg/controller"
	"github.com/cilium/cilium/pkg/defaults"
	"github.com/cilium/cilium/pkg/lock"
	"github.com/cilium/cilium/pkg/logging"
	"github.com/cilium/cilium/pkg/logging/logfields"
	"github.com/cilium/cilium/pkg/maps/nat"
	"github.com/cilium/cilium/pkg/maps/timestamp"
	"github.com/cilium/cilium/pkg/metrics"
	"github.com/cilium/cilium/pkg/option"
	"github.com/cilium/cilium/pkg/time"
	"github.com/cilium/cilium/pkg/tuple"
	"github.com/cilium/cilium/pkg/u8proto"
)

var (
	log = logging.DefaultLogger.WithField(logfields.LogSubsys, "map-ct")

	// labelIPv6CTDumpInterrupts marks the count for conntrack dump resets (IPv6).
	labelIPv6CTDumpInterrupts = map[string]string{
		metrics.LabelDatapathArea:   "conntrack",
		metrics.LabelDatapathName:   "dump_interrupts",
		metrics.LabelDatapathFamily: "ipv6",
	}
	// labelIPv4CTDumpInterrupts marks the count for conntrack dump resets (IPv4).
	labelIPv4CTDumpInterrupts = map[string]string{
		metrics.LabelDatapathArea:   "conntrack",
		metrics.LabelDatapathName:   "dump_interrupts",
		metrics.LabelDatapathFamily: "ipv4",
	}

	mapInfo map[mapType]mapAttributes
)

const (
	// mapCount is the maximum number of CT maps that one endpoint may
	// access at once.
	mapCount = 4

	// Map names for TCP CT tables are retained from the Cilium 1.0 naming
	// scheme to minimize disruption of ongoing connections during upgrade.
	MapNamePrefix     = "cilium_ct"
	MapNameTCP6       = MapNamePrefix + "6_"
	MapNameTCP4       = MapNamePrefix + "4_"
	MapNameTCP6Global = MapNameTCP6 + "global"
	MapNameTCP4Global = MapNameTCP4 + "global"

	// Map names for "any" protocols indicate CT for non-TCP protocols.
	MapNameAny6       = MapNamePrefix + "_any6_"
	MapNameAny4       = MapNamePrefix + "_any4_"
	MapNameAny6Global = MapNameAny6 + "global"
	MapNameAny4Global = MapNameAny4 + "global"

	mapNumEntriesLocal = 64000

	TUPLE_F_OUT     = 0
	TUPLE_F_IN      = 1
	TUPLE_F_RELATED = 2
	TUPLE_F_SERVICE = 4

	// MaxTime specifies the last possible time for GCFilter.Time
	MaxTime = math.MaxUint32

	metricsAlive   = "alive"
	metricsDeleted = "deleted"

	metricsIngress = "ingress"
	metricsEgress  = "egress"
)

type action int

const (
	noAction action = iota
	deleteEntry
)

var globalDeleteLock [mapTypeMax]lock.Mutex

type mapAttributes struct {
	natMapLock *lock.Mutex // Serializes concurrent accesses to natMap
	natMap     *nat.Map
}

// CtMap interface represents a CT map, and can be reused to implement mock
// maps for unit tests.
type CtMap interface {
	Open() error
	Close() error
	Path() (string, error)
	DumpEntries() (string, error)
	DumpWithCallback(bpf.DumpCallback) error
	Count() (int, error)
	Update(key bpf.MapKey, value bpf.MapValue) error
}
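
// A minimal sketch of the kind of mock map the CtMap interface allows in unit
// tests. This is illustrative only: mockCtMap and its field names are
// assumptions, not part of this package, and the sketch assumes a mock backed
// by a slice of CtMapRecord values (declared just below).
//
//	type mockCtMap struct{ records []CtMapRecord }
//
//	func (m *mockCtMap) Open() error                  { return nil }
//	func (m *mockCtMap) Close() error                 { return nil }
//	func (m *mockCtMap) Path() (string, error)        { return "mock", nil }
//	func (m *mockCtMap) Count() (int, error)          { return len(m.records), nil }
//	func (m *mockCtMap) DumpEntries() (string, error) { return DoDumpEntries(m) }
//	func (m *mockCtMap) Update(key bpf.MapKey, value bpf.MapValue) error { return nil }
//	func (m *mockCtMap) DumpWithCallback(cb bpf.DumpCallback) error {
//		// Replay the stored records through the callback, as a real map dump would.
//		for i := range m.records {
//			cb(m.records[i].Key, &m.records[i].Value)
//		}
//		return nil
//	}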

// CtMapRecord designates a map entry (key + value). The name avoids "entry"
// because of possible confusion with CtEntry (which is actually the value
// part). This type is used for the JSON dump and for mock maps.
type CtMapRecord struct {
	Key   CtKey
	Value CtEntry
}

// InitMapInfo builds the information about different CT maps for the
// combination of L3/L4 protocols.
func InitMapInfo(v4, v6, nodeport bool) {
	global4Map, global6Map := nat.GlobalMaps(v4, v6, nodeport)
	global4MapLock := &lock.Mutex{}
	global6MapLock := &lock.Mutex{}

	// SNAT also only works if the CT map is global, so all local maps will be nil
	mapInfo = map[mapType]mapAttributes{
		mapTypeIPv4TCPGlobal: {natMap: global4Map, natMapLock: global4MapLock},
		mapTypeIPv6TCPGlobal: {natMap: global6Map, natMapLock: global6MapLock},
		mapTypeIPv4AnyGlobal: {natMap: global4Map, natMapLock: global4MapLock},
		mapTypeIPv6AnyGlobal: {natMap: global6Map, natMapLock: global6MapLock},
	}
}

// CtEndpoint represents an endpoint for the functions required to manage
// conntrack maps for the endpoint.
type CtEndpoint interface {
	GetID() uint64
}

// Map represents an instance of a BPF connection tracking map.
// It also implements the CtMap interface.
type Map struct {
	bpf.Map

	mapType mapType
	// define is the name of the datapath macro used for this map, for
	// example 'CT_MAP4'.
	define string

	// clusterID indicates which cluster this ctmap belongs to. Zero for
	// global maps and non-zero for per-cluster maps.
	clusterID uint32
}

// GCFilter contains the necessary fields to filter the CT maps.
type GCFilter struct {
	// RemoveExpired enables removal of all entries that have expired
	RemoveExpired bool

	// Time is the reference timestamp to remove expired entries. If
	// RemoveExpired is true and an entry's lifetime is less than Time, the
	// entry is removed.
	Time uint32

	// ValidIPs is the set of valid IPs; all entries whose source or
	// destination IP does *not* match one of these IPs are scrubbed.
	ValidIPs map[netip.Addr]struct{}

	// MatchIPs is the set of IPs to remove from the conntrack table
	MatchIPs map[netip.Addr]struct{}

	// EmitCTEntryCB is called, when non-nil, if filtering by ValidIPs and
	// MatchIPs passes. It has no impact on CT GC, but can be used to iterate
	// over valid CT entries.
	EmitCTEntryCB EmitCTEntryCBFunc
}

// EmitCTEntryCBFunc is the type used for the EmitCTEntryCB callback in GCFilter
type EmitCTEntryCBFunc func(srcIP, dstIP netip.Addr, srcPort, dstPort uint16, nextHdr, flags uint8, entry *CtEntry)
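
// A minimal illustrative sketch of a GCFilter that uses the callback above
// purely to iterate over CT entries. The variable name below is an
// assumption for illustration; nothing in this filter marks entries for
// deletion.
//
//	filter := &GCFilter{
//		// Keep every entry: no expiry check, no IP-based scrubbing.
//		RemoveExpired: false,
//		EmitCTEntryCB: func(srcIP, dstIP netip.Addr, srcPort, dstPort uint16,
//			nextHdr, flags uint8, entry *CtEntry) {
//			log.Debugf("CT entry %s:%d -> %s:%d proto=%d flags=%d lifetime=%d",
//				srcIP, srcPort, dstIP, dstPort, nextHdr, flags, entry.Lifetime)
//		},
//	}
//
// Passing such a filter to GC() (or doGC()) walks the map and invokes the
// callback for every entry that survives filtering, without deleting anything.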

// DumpEntriesWithTimeDiff iterates through Map m and writes the values of the
// ct entries in m to a string. If clockSource is not nil, it uses it to
// compute the time difference of each entry from now and prints that too.
func DumpEntriesWithTimeDiff(m CtMap, clockSource *models.ClockSource) (string, error) {
	var toRemSecs func(uint32) string

	if clockSource == nil {
		toRemSecs = nil
	} else {
		now, err := timestamp.GetCTCurTime(clockSource)
		if err != nil {
			return "", err
		}
		tsConverter, err := timestamp.NewCTTimeToSecConverter(clockSource)
		if err != nil {
			return "", err
		}
		tsecNow := tsConverter(now)
		toRemSecs = func(t uint32) string {
			tsec := tsConverter(uint64(t))
			diff := int64(tsec) - int64(tsecNow)
			return fmt.Sprintf("remaining: %d sec(s)", diff)
		}
	}

	var sb strings.Builder
	cb := func(k bpf.MapKey, v bpf.MapValue) {
		// No need to deep copy as the values are used to create new strings
		key := k.(CtKey)
		if !key.ToHost().Dump(&sb, true) {
			return
		}
		value := v.(*CtEntry)
		sb.WriteString(value.StringWithTimeDiff(toRemSecs))
	}
	// DumpWithCallback() must be called before sb.String().
	err := m.DumpWithCallback(cb)
	if err != nil {
		return "", err
	}
	return sb.String(), err
}

// DoDumpEntries iterates through Map m and writes the values of the ct entries
// in m to a string.
func DoDumpEntries(m CtMap) (string, error) {
	return DumpEntriesWithTimeDiff(m, nil)
}

// DumpEntries iterates through Map m and writes the values of the ct entries
// in m to a string.
func (m *Map) DumpEntries() (string, error) {
	return DoDumpEntries(m)
}

// Count batch dumps the Map m and returns the count of the entries.
func (m *Map) Count() (count int, err error) {
	global := m.mapType.isGlobal()
	v4 := m.mapType.isIPv4()
	switch {
	case global && v4:
		return countBatch[CtKey4Global](m)
	case global && !v4:
		return countBatch[CtKey6Global](m)
	case !global && v4:
		return countBatch[CtKey4](m)
	case !global && !v4:
		return countBatch[CtKey6](m)
	}
	return
}

func countBatch[T any](m *Map) (count int, err error) {
	// If we have a hash map of N = 2^n elements, then the first collision is
	// expected [at random] when we insert around sqrt(2*N) elements. For
	// example, for a map of size 1024, this is around 45 elements. In real
	// life, input is not uniformly distributed, so there could be more
	// collisions.
	//
	// In practice, we can expect maximum collision lengths (# of elements in a
	// bucket ~= chunkSize) to be around 30-40. So anything like chunkSize=10%
	// of map size should be pretty safe. If the chunkSize is not enough, then
	// the kernel returns ENOSPC. In this case, it is possible to just set
	// chunkSize *= 2 and try again. However, with the current chunkSize of
	// 4096, we observe no issues dumping the maximum size of a CT map. As
	// explained a bit below, 4096 was an optimal number considering idle
	// memory usage and benchmarks (see commit msg).
	//
	// Credits to Anton for the above explanation of htab maps.
	const chunkSize uint32 = 4096

	// We can reuse the following buffers as the batch lookup does not care for
	// the contents of the map. This saves on redundant memory allocations.
	//
	// The following is the number of KiB total that is allocated by Go for the
	// following buffers based on the data type:
	// >>> (14*4096) / 1024  # CT IPv4 map key
	// 56.0
	// >>> (38*4096) / 1024  # CT IPv6 map key
	// 152.0
	// >>> (56*4096) / 1024  # CT map value
	// 224.0
	kout := make([]T, chunkSize)
	vout := make([]CtEntry, chunkSize)

	var cursor ebpf.MapBatchCursor
	for {
		c, batchErr := m.BatchLookup(&cursor, kout, vout, nil)
		count += c
		if batchErr != nil {
			if errors.Is(batchErr, ebpf.ErrKeyNotExist) {
				return count, nil // end of map, we're done iterating
			}
			return count, batchErr
		}
	}
}

// OpenCTMap is a convenience function to open CT maps. It is the
// responsibility of the caller to ensure that m.Close() is called after this
// function.
func OpenCTMap(m CtMap) (path string, err error) {
	path, err = m.Path()
	if err == nil {
		err = m.Open()
	}
	return
}

// newMap creates a new CT map of the specified type with the specified name.
func newMap(mapName string, m mapType) *Map {
	result := &Map{
		Map: *bpf.NewMap(mapName,
			ebpf.LRUHash,
			m.key(),
			m.value(),
			m.maxEntries(),
			0,
		).WithPressureMetric(),
		mapType: m,
		define:  m.bpfDefine(),
	}
	return result
}

func purgeCtEntry6(m *Map, key CtKey, entry *CtEntry, natMap *nat.Map) error {
	err := m.Delete(key)
	if err != nil || natMap == nil {
		return err
	}

	t := key.GetTupleKey()
	tupleType := t.GetFlags()

	if tupleType == tuple.TUPLE_F_OUT {
		if entry.isDsrInternalEntry() {
			// To delete NAT entries created by DSR
			nat.DeleteSwappedMapping6(natMap, t.(*tuple.TupleKey6Global))
		} else {
			// To delete NAT entries created for SNAT
			nat.DeleteMapping6(natMap, t.(*tuple.TupleKey6Global))
		}
	}

	return nil
}

// doGC6 iterates through a CTv6 map and drops entries based on the given
// filter.
func doGC6(m *Map, filter *GCFilter) gcStats {
	var natMap *nat.Map

	if m.clusterID == 0 {
		// global map handling
		ctMap := mapInfo[m.mapType]
		if ctMap.natMapLock != nil {
			ctMap.natMapLock.Lock()
			defer ctMap.natMapLock.Unlock()
		}
		natMap = ctMap.natMap
	} else {
		// per-cluster map handling
		natm, err := nat.GetClusterNATMap(m.clusterID, nat.IPv6)
		if err != nil {
			log.WithError(err).Error("Unable to get per-cluster NAT map")
		} else {
			natMap = natm
		}
	}

	stats := statStartGc(m)
	defer stats.finish()

	if natMap != nil {
		err := natMap.Open()
		if err == nil {
			defer natMap.Close()
		} else {
			natMap = nil
		}
	}

	filterCallback := func(key bpf.MapKey, value bpf.MapValue) {
		entry := value.(*CtEntry)

		switch obj := key.(type) {
		case *CtKey6Global:
			currentKey6Global := obj
			// In CT entries, the source address of the conntrack entry (`SourceAddr`) is
			// the destination of the packet received, therefore it's the packet's
			// destination IP
			action := filter.doFiltering(currentKey6Global.DestAddr.Addr(), currentKey6Global.SourceAddr.Addr(),
				currentKey6Global.DestPort, currentKey6Global.SourcePort,
				uint8(currentKey6Global.NextHeader), currentKey6Global.Flags, entry)

			switch action {
			case deleteEntry:
				err := purgeCtEntry6(m, currentKey6Global, entry, natMap)
				if err != nil {
					log.WithError(err).WithField(logfields.Key, currentKey6Global.String()).Error("Unable to delete CT entry")
				} else {
					stats.deleted++
				}
			default:
				stats.aliveEntries++
			}
		case *CtKey6:
			currentKey6 := obj
			// In CT entries, the source address of the conntrack entry (`SourceAddr`) is
			// the destination of the packet received, therefore it's the packet's
			// destination IP
			action := filter.doFiltering(currentKey6.DestAddr.Addr(), currentKey6.SourceAddr.Addr(),
				currentKey6.DestPort, currentKey6.SourcePort,
				uint8(currentKey6.NextHeader), currentKey6.Flags, entry)

			switch action {
			case deleteEntry:
				err := purgeCtEntry6(m, currentKey6, entry, natMap)
				if err != nil {
					log.WithError(err).WithField(logfields.Key, currentKey6.String()).Error("Unable to delete CT entry")
				} else {
					stats.deleted++
				}
			default:
				stats.aliveEntries++
			}
		default:
			log.Warningf("Encountered unknown type while scanning conntrack table: %v", reflect.TypeOf(key))
		}
	}

	// See doGC4() comment.
	globalDeleteLock[m.mapType].Lock()
	stats.dumpError = m.DumpReliablyWithCallback(filterCallback, stats.DumpStats)
	globalDeleteLock[m.mapType].Unlock()
	return stats
}

func purgeCtEntry4(m *Map, key CtKey, entry *CtEntry, natMap *nat.Map) error {
	err := m.Delete(key)
	if err != nil || natMap == nil {
		return err
	}

	t := key.GetTupleKey()
	tupleType := t.GetFlags()

	if tupleType == tuple.TUPLE_F_OUT {
		if entry.isDsrInternalEntry() {
			// To delete NAT entries created by DSR
			nat.DeleteSwappedMapping4(natMap, t.(*tuple.TupleKey4Global))
		} else {
			// To delete NAT entries created for SNAT
			nat.DeleteMapping4(natMap, t.(*tuple.TupleKey4Global))
		}
	}

	return nil
}

// doGC4 iterates through a CTv4 map and drops entries based on the given
// filter.
func doGC4(m *Map, filter *GCFilter) gcStats {
	var natMap *nat.Map

	if m.clusterID == 0 {
		// global map handling
		ctMap := mapInfo[m.mapType]
		if ctMap.natMapLock != nil {
			ctMap.natMapLock.Lock()
			defer ctMap.natMapLock.Unlock()
		}
		natMap = ctMap.natMap
	} else {
		// per-cluster map handling
		natm, err := nat.GetClusterNATMap(m.clusterID, nat.IPv4)
		if err != nil {
			log.WithError(err).Error("Unable to get per-cluster NAT map")
		} else {
			natMap = natm
		}
	}

	stats := statStartGc(m)
	defer stats.finish()

	if natMap != nil {
		if err := natMap.Open(); err == nil {
			defer natMap.Close()
		} else {
			natMap = nil
		}
	}

	filterCallback := func(key bpf.MapKey, value bpf.MapValue) {
		entry := value.(*CtEntry)

		switch obj := key.(type) {
		case *CtKey4Global:
			currentKey4Global := obj
			// In CT entries, the source address of the conntrack entry (`SourceAddr`) is
			// the destination of the packet received, therefore it's the packet's
			// destination IP
			action := filter.doFiltering(currentKey4Global.DestAddr.Addr(), currentKey4Global.SourceAddr.Addr(),
				currentKey4Global.DestPort, currentKey4Global.SourcePort,
				uint8(currentKey4Global.NextHeader), currentKey4Global.Flags, entry)

			switch action {
			case deleteEntry:
				err := purgeCtEntry4(m, currentKey4Global, entry, natMap)
				if err != nil {
					log.WithError(err).WithField(logfields.Key, currentKey4Global.String()).Error("Unable to delete CT entry")
				} else {
					stats.deleted++
				}
			default:
				stats.aliveEntries++
			}
		case *CtKey4:
			currentKey4 := obj
			// In CT entries, the source address of the conntrack entry (`SourceAddr`) is
			// the destination of the packet received, therefore it's the packet's
			// destination IP
			action := filter.doFiltering(currentKey4.DestAddr.Addr(), currentKey4.SourceAddr.Addr(),
				currentKey4.DestPort, currentKey4.SourcePort,
				uint8(currentKey4.NextHeader), currentKey4.Flags, entry)

			switch action {
			case deleteEntry:
				err := purgeCtEntry4(m, currentKey4, entry, natMap)
				if err != nil {
					log.WithError(err).WithField(logfields.Key, currentKey4.String()).Error("Unable to delete CT entry")
				} else {
					stats.deleted++
				}
			default:
				stats.aliveEntries++
			}
		default:
			log.Warningf("Encountered unknown type while scanning conntrack table: %v", reflect.TypeOf(key))
		}
	}

	// We serialize the deletions in order to avoid forced map walk restarts
	// when keys are being evicted underneath us from concurrent goroutines.
	globalDeleteLock[m.mapType].Lock()
	stats.dumpError = m.DumpReliablyWithCallback(filterCallback, stats.DumpStats)
	globalDeleteLock[m.mapType].Unlock()
	return stats
}

func (f *GCFilter) doFiltering(srcIP, dstIP netip.Addr, srcPort, dstPort uint16, nextHdr, flags uint8, entry *CtEntry) action {
	if f.RemoveExpired && entry.Lifetime < f.Time {
		return deleteEntry
	}
	if f.ValidIPs != nil {
		_, srcIPExists := f.ValidIPs[srcIP]
		_, dstIPExists := f.ValidIPs[dstIP]
		if !srcIPExists && !dstIPExists {
			return deleteEntry
		}
	}

	if f.MatchIPs != nil {
		_, srcIPExists := f.MatchIPs[srcIP]
		_, dstIPExists := f.MatchIPs[dstIP]
		if srcIPExists || dstIPExists {
			return deleteEntry
		}
	}

	if f.EmitCTEntryCB != nil {
		f.EmitCTEntryCB(srcIP, dstIP, srcPort, dstPort, nextHdr, flags, entry)
	}

	return noAction
}

func doGC(m *Map, filter *GCFilter) (int, error) {
	if m.mapType.isIPv6() {
		stats := doGC6(m, filter)
		return int(stats.deleted), stats.dumpError
	} else if m.mapType.isIPv4() {
		stats := doGC4(m, filter)
		return int(stats.deleted), stats.dumpError
	}
	log.Fatalf("Unsupported ct map type: %s", m.mapType.String())
	return 0, fmt.Errorf("unsupported ct map type: %s", m.mapType.String())
}

// GC runs garbage collection for map m of type mapType with the given filter.
// It returns how many items were deleted from m.
func GC(m *Map, filter *GCFilter) (int, error) {
	if filter.RemoveExpired {
		t, _ := timestamp.GetCTCurTime(timestamp.GetClockSourceFromOptions())
		filter.Time = uint32(t)
	}

	return doGC(m, filter)
}
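
// Illustrative sketch of how the exported GC entry point is typically driven
// by a caller inside this package. The loop and variable names are
// assumptions for illustration; the per-run deletion ratio could then be fed
// into GetInterval() further below to pick the next GC interval.
//
//	for _, m := range GlobalMaps(true, true) {
//		if _, err := OpenCTMap(m); err != nil {
//			continue
//		}
//		// GC fills in filter.Time from the datapath clock source because
//		// RemoveExpired is set, then walks the map and purges expired entries.
//		deleted, err := GC(m, &GCFilter{RemoveExpired: true})
//		if err == nil {
//			log.Debugf("Removed %d expired CT entries from %s", deleted, m.Name())
//		}
//		m.Close()
//	}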

// PurgeOrphanNATEntries removes orphan SNAT entries. We call an SNAT entry
// orphan if it does not have a corresponding CT entry.
//
// Typically NAT entries should get removed along with their owning CT entry,
// as part of purgeCtEntry*(). But stale NAT entries can get left behind if the
// CT entry disappears for other reasons - for instance by LRU eviction, or
// when the datapath re-purposes the CT entry.
//
// PurgeOrphanNATEntries() is triggered by the datapath via the GC signaling
// mechanism. When the datapath SNAT fails to find a free mapping after
// SNAT_SIGNAL_THRES attempts, it sends the signal via the perf ring buffer.
// The consumer of the buffer invokes the function.
//
// The SNAT is being used for the following cases:
//  1. By NodePort BPF on an intermediate node before fwd'ing request from outside
//     to a destination node.
//  2. A packet from local endpoint sent to outside (BPF-masq).
//  3. A packet from a host local application (i.e. running in the host netns).
//     This is needed to prevent SNAT from hijacking such connections.
//  4. By DSR on a backend node to SNAT responses with service IP+port before
//     sending to a client.
//
// In all 4 cases we create a CT_EGRESS CT entry. This allows the
// CT GC to remove corresponding SNAT entries.
// See the unit test TestOrphanNatGC for more examples.
func PurgeOrphanNATEntries(ctMapTCP, ctMapAny *Map) *NatGCStats {
	// Both CT maps should point to the same natMap, so use the first one
	// to determine natMap
	ctMap := mapInfo[ctMapTCP.mapType]
	if ctMap.natMapLock != nil {
		ctMap.natMapLock.Lock()
		defer ctMap.natMapLock.Unlock()
	}
	natMap := ctMap.natMap
	if natMap == nil {
		return nil
	}

	family := gcFamilyIPv4
	if ctMapTCP.mapType.isIPv6() {
		family = gcFamilyIPv6
	}
	stats := newNatGCStats(natMap, family)
	defer stats.finish()

	cb := func(key bpf.MapKey, value bpf.MapValue) {
		natKey := key.(nat.NatKey)
		natVal := value.(nat.NatEntry)

		ctMap := ctMapAny
		if natKey.GetNextHeader() == u8proto.TCP {
			ctMap = ctMapTCP
		}

		if natKey.GetFlags()&tuple.TUPLE_F_IN == tuple.TUPLE_F_IN { // natKey is r(everse)tuple
			ctKey := egressCTKeyFromIngressNatKeyAndVal(natKey, natVal)

			if !ctEntryExist(ctMap, ctKey, nil) {
				// No egress CT entry is found, delete the orphan ingress SNAT entry
				if deleted, _ := natMap.Delete(natKey); deleted {
					stats.IngressDeleted++
				}
			} else {
				stats.IngressAlive++
			}
		} else if natKey.GetFlags()&tuple.TUPLE_F_OUT == tuple.TUPLE_F_OUT {
			checkDsr := func(entry *CtEntry) bool {
				return entry.isDsrInternalEntry()
			}

			egressCTKey := egressCTKeyFromEgressNatKey(natKey)
			dsrCTKey := dsrCTKeyFromEgressNatKey(natKey)

			if !ctEntryExist(ctMap, egressCTKey, nil) &&
				!ctEntryExist(ctMap, dsrCTKey, checkDsr) {
				// No relevant CT entries were found, delete the orphan egress NAT entry
				if deleted, _ := natMap.Delete(natKey); deleted {
					stats.EgressDeleted++
				}
			} else {
				stats.EgressAlive++
			}
		}
	}

	if err := natMap.DumpReliablyWithCallback(cb, stats.DumpStats); err != nil {
		log.WithError(err).Error("NATmap dump failed during GC")
	} else {
		natMap.UpdatePressureMetricWithSize(int32(stats.IngressAlive + stats.EgressAlive))
	}

	return &stats
}

// Flush runs garbage collection on map m, deleting all entries. The specified
// map must already be opened using bpf.OpenMap().
func (m *Map) Flush() int {
	d, _ := doGC(m, &GCFilter{
		RemoveExpired: true,
		Time:          MaxTime,
	})
	return d
}
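
// Illustrative package-internal sketch, not code from this file: flushing the
// global IPv4 TCP CT map wipes every entry, because GC runs with Time set to
// MaxTime so every entry's lifetime compares as expired.
//
//	m := newMap(MapNameTCP4Global, mapTypeIPv4TCPGlobal)
//	if _, err := OpenCTMap(m); err == nil {
//		defer m.Close()
//		log.Infof("Flushed %d CT entries", m.Flush())
//	}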

// DeleteIfUpgradeNeeded attempts to open the conntrack maps associated with
// the specified endpoint, and delete the maps from the filesystem if any
// properties do not match the properties defined in this package.
//
// The typical trigger for this is when, for example, the CT entry size changes
// from one version of Cilium to the next. When Cilium restarts, it may opt
// to restore endpoints from the prior life. Existing endpoints that use the
// old map style are incompatible with the new version, so the CT map must be
// destroyed and recreated during upgrade. By removing the old map location
// from the filesystem, we ensure that the next time that the endpoint is
// regenerated, it will recreate a new CT map with the new properties.
//
// Note that if an existing BPF program refers to the map at the canonical
// paths (as fetched via the getMapPathsToKeySize() call below), then that BPF
// program will continue to operate on the old map, even once the map is
// removed from the filesystem. The old map will only be completely cleaned up
// once all references to the map are cleared - that is, once all BPF programs
// which refer to the old map are removed or reloaded.
func DeleteIfUpgradeNeeded(e CtEndpoint) {
	for _, newMap := range maps(e, true, true) {
		path, err := newMap.Path()
		if err != nil {
			log.WithError(err).Warning("Failed to get path for CT map")
			continue
		}
		scopedLog := log.WithField(logfields.Path, path)

		// Pass nil key and value types since we're not intending on accessing the
		// map's contents.
		oldMap, err := bpf.OpenMap(path, nil, nil)
		if err != nil {
			scopedLog.WithError(err).Debug("Couldn't open CT map for upgrade")
			continue
		}
		defer oldMap.Close()

		if oldMap.CheckAndUpgrade(&newMap.Map) {
			scopedLog.Warning("CT Map upgraded, expect brief disruption of ongoing connections")
		}
	}
}

// maps returns all connection tracking maps associated with endpoint 'e' (or
// the global maps if 'e' is nil).
func maps(e CtEndpoint, ipv4, ipv6 bool) []*Map {
	result := make([]*Map, 0, mapCount)
	if e == nil {
		if ipv4 {
			result = append(result, newMap(MapNameTCP4Global, mapTypeIPv4TCPGlobal))
			result = append(result, newMap(MapNameAny4Global, mapTypeIPv4AnyGlobal))
		}
		if ipv6 {
			result = append(result, newMap(MapNameTCP6Global, mapTypeIPv6TCPGlobal))
			result = append(result, newMap(MapNameAny6Global, mapTypeIPv6AnyGlobal))
		}
	} else {
		if ipv4 {
			result = append(result, newMap(bpf.LocalMapName(MapNameTCP4, uint16(e.GetID())),
				mapTypeIPv4TCPLocal))
			result = append(result, newMap(bpf.LocalMapName(MapNameAny4, uint16(e.GetID())),
				mapTypeIPv4AnyLocal))
		}
		if ipv6 {
			result = append(result, newMap(bpf.LocalMapName(MapNameTCP6, uint16(e.GetID())),
				mapTypeIPv6TCPLocal))
			result = append(result, newMap(bpf.LocalMapName(MapNameAny6, uint16(e.GetID())),
				mapTypeIPv6AnyLocal))
		}
	}
	return result
}

// LocalMaps returns a slice of CT maps for the endpoint, which are local to
// the endpoint and not shared with other endpoints. If ipv4 or ipv6 are false,
// the maps for that protocol will not be returned.
//
// The returned maps are not yet opened.
func LocalMaps(e CtEndpoint, ipv4, ipv6 bool) []*Map {
	return maps(e, ipv4, ipv6)
}

// GlobalMaps returns a slice of CT maps that are used globally by all
// endpoints that are not otherwise configured to use their own local maps.
// If ipv4 or ipv6 are false, the maps for that protocol will not be returned.
//
// The returned maps are not yet opened.
func GlobalMaps(ipv4, ipv6 bool) []*Map {
	return maps(nil, ipv4, ipv6)
}

// NameIsGlobal returns true if the specified filename (basename) denotes a
// global conntrack map.
func NameIsGlobal(filename string) bool {
	switch filename {
	case MapNameTCP4Global, MapNameAny4Global, MapNameTCP6Global, MapNameAny6Global:
		return true
	}
	return false
}
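
// Illustrative sketch of how the naming helpers above fit together. Per the
// constants at the top of this file, the IPv4 global maps are named
// "cilium_ct4_global" and "cilium_ct_any4_global", while per-endpoint maps
// get an endpoint-specific suffix via bpf.LocalMapName(), so NameIsGlobal()
// distinguishes the two. The loop below is an assumption for illustration:
//
//	for _, m := range GlobalMaps(true, false) {
//		// Prints: cilium_ct4_global=true and cilium_ct_any4_global=true
//		fmt.Printf("%s=%t\n", m.Name(), NameIsGlobal(m.Name()))
//	}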

// WriteBPFMacros writes the map names for conntrack maps into the specified
// writer, defining usage of the global map or local maps depending on whether
// the specified CtEndpoint is nil.
func WriteBPFMacros(fw io.Writer, e CtEndpoint) {
	var mapEntriesTCP, mapEntriesAny int
	for _, m := range maps(e, true, true) {
		fmt.Fprintf(fw, "#define %s %s\n", m.define, m.Name())
		if m.mapType.isTCP() {
			mapEntriesTCP = m.mapType.maxEntries()
		} else {
			mapEntriesAny = m.mapType.maxEntries()
		}
	}
	fmt.Fprintf(fw, "#define CT_MAP_SIZE_TCP %d\n", mapEntriesTCP)
	fmt.Fprintf(fw, "#define CT_MAP_SIZE_ANY %d\n", mapEntriesAny)
}

// Exists returns false if the CT maps for the specified endpoint (or global
// maps if nil) are not pinned to the filesystem, or true if they exist or
// an internal error occurs.
func Exists(e CtEndpoint, ipv4, ipv6 bool) bool {
	result := true
	for _, m := range maps(e, ipv4, ipv6) {
		path, err := m.Path()
		if err != nil {
			// Catch this error early
			return true
		}
		if _, err = os.Stat(path); os.IsNotExist(err) {
			result = false
		}
	}

	return result
}

var cachedGCInterval time.Duration

// GetInterval returns the interval adjusted based on the deletion ratio of the
// last run.
func GetInterval(actualPrevInterval time.Duration, maxDeleteRatio float64) time.Duration {
	if val := option.Config.ConntrackGCInterval; val != time.Duration(0) {
		return val
	}

	expectedPrevInterval := cachedGCInterval
	adjustedDeleteRatio := maxDeleteRatio
	if expectedPrevInterval == time.Duration(0) {
		expectedPrevInterval = defaults.ConntrackGCStartingInterval
	} else if actualPrevInterval < expectedPrevInterval && actualPrevInterval > 0 {
		adjustedDeleteRatio *= float64(expectedPrevInterval) / float64(actualPrevInterval)
	}

	newInterval := calculateInterval(expectedPrevInterval, adjustedDeleteRatio)
	if val := option.Config.ConntrackGCMaxInterval; val != time.Duration(0) && newInterval > val {
		newInterval = val
	}

	if newInterval != expectedPrevInterval {
		log.WithFields(logrus.Fields{
			"expectedPrevInterval": expectedPrevInterval,
			"actualPrevInterval":   actualPrevInterval,
			"newInterval":          newInterval,
			"deleteRatio":          maxDeleteRatio,
			"adjustedDeleteRatio":  adjustedDeleteRatio,
		}).Info("Conntrack garbage collector interval recalculated")
	}

	return newInterval
}
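
// Worked example for the adjustment above and the scaling in
// calculateInterval() below (the concrete numbers are illustrative, not
// defaults from this package): suppose the previous interval was expected to
// be 5m, the run actually started after 2m30s, and the max deletion ratio was
// 0.20. The ratio is scaled up to compensate for the shorter observation
// window:
//
//	adjustedDeleteRatio = 0.20 * (5m / 2m30s) = 0.40
//
// 0.40 exceeds the 0.25 threshold, so the interval shrinks:
//
//	newInterval = 5m * (1.0 - 0.40) = 3m (rounded to the second)
//
// A ratio below 0.05 would instead grow the interval by 1.5x, capped at
// defaults.ConntrackGCMaxLRUInterval, and any result is further capped by
// option.Config.ConntrackGCMaxInterval when that option is set.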

func calculateInterval(prevInterval time.Duration, maxDeleteRatio float64) (interval time.Duration) {
	interval = prevInterval

	if maxDeleteRatio == 0.0 {
		return
	}

	switch {
	case maxDeleteRatio > 0.25:
		if maxDeleteRatio > 0.9 {
			maxDeleteRatio = 0.9
		}
		// 25%..90% => 1.3x..10x shorter
		interval = time.Duration(float64(interval) * (1.0 - maxDeleteRatio)).Round(time.Second)

		if interval < defaults.ConntrackGCMinInterval {
			interval = defaults.ConntrackGCMinInterval
		}

	case maxDeleteRatio < 0.05:
		// When less than 5% of entries were deleted, increase the
		// interval. Use a simple 1.5x multiplier to start growing slowly
		// as a new node may not be seeing workloads yet and thus the
		// scan will return a low deletion ratio at first.
		interval = time.Duration(float64(interval) * 1.5).Round(time.Second)
		if interval > defaults.ConntrackGCMaxLRUInterval {
			interval = defaults.ConntrackGCMaxLRUInterval
		}
	}

	cachedGCInterval = interval

	return
}

// CalculateCTMapPressure registers a controller that periodically calculates
// the BPF CT map pressure and publishes it as part of the BPF map pressure
// metric.
func CalculateCTMapPressure(mgr *controller.Manager, allMaps ...*Map) {
	ctx, cancel := context.WithCancelCause(context.Background())
	mgr.UpdateController("ct-map-pressure", controller.ControllerParams{
		Group: controller.Group{
			Name: "ct-map-pressure",
		},
		DoFunc: func(context.Context) error {
			var errs error
			for _, m := range allMaps {
				path, err := OpenCTMap(m)
				if err != nil {
					msg := "Skipping CT map pressure calculation"
					scopedLog := log.WithError(err).WithField(logfields.Path, path)
					if os.IsNotExist(err) {
						scopedLog.Debug(msg)
					} else {
						scopedLog.Warn(msg)
					}
					continue
				}
				defer m.Close()

				count, err := m.Count()
				if errors.Is(err, ebpf.ErrNotSupported) {
					// We don't have batch ops, so cancel context to kill this
					// controller.
					cancel(err)
					return err
				}
				if err != nil {
					errs = errors.Join(errs, fmt.Errorf("failed to dump CT map %v: %w", m.Name(), err))
				}
				m.UpdatePressureMetricWithSize(int32(count))
			}
			return errs
		},
		RunInterval: 30 * time.Second,
		Context:     ctx,
	})
}
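
// Illustrative wiring sketch (assumed caller-side code; the controller
// manager construction is an assumption, not something this file defines):
// register the pressure controller for all global CT maps and let it refresh
// the pressure metric on its 30 second RunInterval.
//
//	mgr := controller.NewManager()
//	CalculateCTMapPressure(mgr, GlobalMaps(true, true)...)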