github.com/cilium/cilium@v1.16.2/pkg/datapath/linux/ipsec/ipsec_linux.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 //go:build linux 5 6 package ipsec 7 8 import ( 9 "bufio" 10 "bytes" 11 "context" 12 "crypto/sha256" 13 "crypto/sha512" 14 "encoding/hex" 15 "errors" 16 "fmt" 17 "io" 18 "log/slog" 19 "net" 20 "os" 21 "strconv" 22 "strings" 23 "sync" 24 25 "github.com/cilium/hive/cell" 26 "github.com/cilium/hive/job" 27 "github.com/fsnotify/fsnotify" 28 "github.com/prometheus/procfs" 29 "github.com/vishvananda/netlink" 30 31 "github.com/cilium/cilium/pkg/common/ipsec" 32 "github.com/cilium/cilium/pkg/datapath/linux/linux_defaults" 33 "github.com/cilium/cilium/pkg/datapath/linux/route" 34 datapath "github.com/cilium/cilium/pkg/datapath/types" 35 "github.com/cilium/cilium/pkg/fswatcher" 36 "github.com/cilium/cilium/pkg/lock" 37 "github.com/cilium/cilium/pkg/logging/logfields" 38 "github.com/cilium/cilium/pkg/maps/encrypt" 39 "github.com/cilium/cilium/pkg/node" 40 "github.com/cilium/cilium/pkg/option" 41 "github.com/cilium/cilium/pkg/resiliency" 42 "github.com/cilium/cilium/pkg/time" 43 ) 44 45 type IPSecDir string 46 47 const ( 48 IPSecDirIn IPSecDir = "IPSEC_IN" 49 IPSecDirOut IPSecDir = "IPSEC_OUT" 50 IPSecDirBoth IPSecDir = "IPSEC_BOTH" 51 IPSecDirOutNode IPSecDir = "IPSEC_OUT_NODE" 52 53 // Constants used to decode the IPsec secret in both formats: 54 // 1. [spi] aead-algo aead-key icv-len 55 // 2. [spi] auth-algo auth-key enc-algo enc-key [IP] 56 offsetSPI = 0 57 offsetAeadAlgo = 1 58 offsetAeadKey = 2 59 offsetICV = 3 60 offsetAuthAlgo = 1 61 offsetAuthKey = 2 62 offsetEncAlgo = 3 63 offsetEncKey = 4 64 offsetIP = 5 65 maxOffset = offsetIP 66 67 defaultDropPriority = 100 68 oldXFRMOutPolicyPriority = 50 69 70 // DefaultReqID is the default reqid used for all IPSec rules. 71 DefaultReqID = 1 72 73 // EncryptedOverlayReqID is the reqid used for encrypting overlay traffic. 74 EncryptedOverlayReqID = 2 75 ) 76 77 type dir string 78 79 const ( 80 dirUnspec dir = "unspecified" 81 dirIngress dir = "ingress" 82 dirEgress dir = "egress" 83 ) 84 85 type ipSecKey struct { 86 Spi uint8 87 ESN bool 88 ReqID int 89 Auth *netlink.XfrmStateAlgo 90 Crypt *netlink.XfrmStateAlgo 91 Aead *netlink.XfrmStateAlgo 92 } 93 94 type oldXfrmStateKey struct { 95 Spi int 96 Dst [16]byte 97 } 98 99 var ( 100 ipSecLock lock.RWMutex 101 102 // ipSecKeysGlobal can be accessed by multiple subsystems concurrently, 103 // so it should be accessed only through the getIPSecKeys and 104 // LoadIPSecKeys functions, which will ensure the proper lock is held 105 ipSecKeysGlobal = make(map[string]*ipSecKey) 106 107 // ipSecCurrentKeySPI is the SPI of the IPSec currently in use 108 ipSecCurrentKeySPI uint8 109 110 // ipSecKeysRemovalTime is used to track at which time a given key is 111 // replaced with a newer one, allowing to reclaim old keys only after 112 // enough time has passed since their replacement 113 ipSecKeysRemovalTime = make(map[uint8]time.Time) 114 115 wildcardIPv4 = net.ParseIP("0.0.0.0") 116 wildcardCIDRv4 = &net.IPNet{ 117 IP: wildcardIPv4, 118 Mask: net.IPv4Mask(0, 0, 0, 0), 119 } 120 wildcardIPv6 = net.ParseIP("0::0") 121 wildcardCIDRv6 = &net.IPNet{ 122 IP: wildcardIPv6, 123 Mask: net.CIDRMask(0, 128), 124 } 125 126 defaultDropMark = &netlink.XfrmMark{ 127 Value: linux_defaults.RouteMarkEncrypt, 128 Mask: linux_defaults.IPsecMarkBitMask, 129 } 130 defaultDropPolicyIPv4 = &netlink.XfrmPolicy{ 131 Dir: netlink.XFRM_DIR_OUT, 132 Src: wildcardCIDRv4, 133 Dst: wildcardCIDRv4, 134 Mark: defaultDropMark, 135 Action: netlink.XFRM_POLICY_BLOCK, 136 Priority: defaultDropPriority, 137 } 138 defaultDropPolicyIPv6 = &netlink.XfrmPolicy{ 139 Dir: netlink.XFRM_DIR_OUT, 140 Src: wildcardCIDRv6, 141 Dst: wildcardCIDRv6, 142 Mark: defaultDropMark, 143 Action: netlink.XFRM_POLICY_BLOCK, 144 Priority: defaultDropPriority, 145 } 146 147 // To attempt to remove any stale XFRM configs once at startup, after 148 // we've added the catch-all default-drop policy. 149 removeStaleIPv4XFRMOnce sync.Once 150 removeStaleIPv6XFRMOnce sync.Once 151 152 oldXFRMInMark *netlink.XfrmMark = &netlink.XfrmMark{ 153 Value: linux_defaults.RouteMarkDecrypt, 154 Mask: linux_defaults.IPsecMarkBitMask, 155 } 156 // xfrmStateCache is a cache of XFRM states to avoid querying each time. 157 // This is especially important for backgroundSync that is used to validate 158 // if the XFRM state is correct, without usually modyfing anything. 159 // The cache is invalidated whenever a new XFRM state is added/updated/removed, 160 // but also in case of TTL expiration. 161 // It provides XfrmStateAdd/Update/Del wrappers that ensure cache 162 // is correctly invalidate. 163 xfrmStateCache = NewXfrmStateListCache(time.Minute) 164 ) 165 166 func getGlobalIPsecKey(ip net.IP) *ipSecKey { 167 ipSecLock.RLock() 168 defer ipSecLock.RUnlock() 169 170 key, scoped := ipSecKeysGlobal[ip.String()] 171 if !scoped { 172 key = ipSecKeysGlobal[""] 173 } 174 return key 175 } 176 177 // computeNodeIPsecKey computes per-node-pair IPsec keys from the global, 178 // pre-shared key. The per-node-pair keys are computed with a SHA256 hash of 179 // the global key, source node IP, destination node IP appended together. 180 func computeNodeIPsecKey(globalKey, srcNodeIP, dstNodeIP, srcBootID, dstBootID []byte) []byte { 181 inputLen := len(globalKey) + len(srcNodeIP) + len(dstNodeIP) + len(srcBootID) + len(dstBootID) 182 input := make([]byte, 0, inputLen) 183 input = append(input, globalKey...) 184 input = append(input, srcNodeIP...) 185 input = append(input, dstNodeIP...) 186 input = append(input, srcBootID[:36]...) 187 input = append(input, dstBootID[:36]...) 188 189 var hash []byte 190 if len(globalKey) <= 32 { 191 h := sha256.Sum256(input) 192 hash = h[:] 193 } else { 194 h := sha512.Sum512(input) 195 hash = h[:] 196 } 197 return hash[:len(globalKey)] 198 } 199 200 // canonicalIP returns a canonical IPv4 address (4 bytes) 201 // in case we were dealing with a v4 mapped V6 address. 202 func canonicalIP(ip net.IP) net.IP { 203 if v4 := ip.To4(); v4 != nil { 204 return v4 205 } 206 return ip 207 } 208 209 // deriveNodeIPsecKey builds a per-node-pair ipSecKey object from the global 210 // ipSecKey object. 211 func deriveNodeIPsecKey(globalKey *ipSecKey, srcNodeIP, dstNodeIP net.IP, srcBootID, dstBootID string) *ipSecKey { 212 nodeKey := &ipSecKey{ 213 Spi: globalKey.Spi, 214 ReqID: globalKey.ReqID, 215 ESN: globalKey.ESN, 216 } 217 218 srcNodeIP = canonicalIP(srcNodeIP) 219 dstNodeIP = canonicalIP(dstNodeIP) 220 221 if globalKey.Aead != nil { 222 nodeKey.Aead = &netlink.XfrmStateAlgo{ 223 Name: globalKey.Aead.Name, 224 Key: computeNodeIPsecKey(globalKey.Aead.Key, srcNodeIP, dstNodeIP, []byte(srcBootID), []byte(dstBootID)), 225 ICVLen: globalKey.Aead.ICVLen, 226 } 227 } else { 228 nodeKey.Auth = &netlink.XfrmStateAlgo{ 229 Name: globalKey.Auth.Name, 230 Key: computeNodeIPsecKey(globalKey.Auth.Key, srcNodeIP, dstNodeIP, []byte(srcBootID), []byte(dstBootID)), 231 } 232 233 nodeKey.Crypt = &netlink.XfrmStateAlgo{ 234 Name: globalKey.Crypt.Name, 235 Key: computeNodeIPsecKey(globalKey.Crypt.Key, srcNodeIP, dstNodeIP, []byte(srcBootID), []byte(dstBootID)), 236 } 237 } 238 239 return nodeKey 240 } 241 242 // We want one IPsec key per node pair. For a pair of nodes A and B with IP 243 // addresses a and b, and boot ids x and y respectively, we will therefore 244 // install two different keys: 245 // Node A <> Node B 246 // XFRM IN: key(b+a+y+x) XFRM IN: key(a+b+x+y) 247 // XFRM OUT: key(a+b+x+y) XFRM OUT: key(b+a+y+x) 248 // This is done such that, for each pair of nodes A, B, the key used for 249 // decryption on A (XFRM IN) is the same key used for encryption on B (XFRM 250 // OUT), and vice versa. And its ESN automatically resets on each node reboot. 251 func getNodeIPsecKey(localNodeIP, remoteNodeIP net.IP, localBootID, remoteBootID string, dir netlink.Dir) *ipSecKey { 252 globalKey := getGlobalIPsecKey(localNodeIP) 253 if globalKey == nil { 254 return nil 255 } 256 if !globalKey.ESN { 257 return globalKey 258 } 259 260 if dir == netlink.XFRM_DIR_OUT { 261 return deriveNodeIPsecKey(globalKey, localNodeIP, remoteNodeIP, localBootID, remoteBootID) 262 } 263 return deriveNodeIPsecKey(globalKey, remoteNodeIP, localNodeIP, remoteBootID, localBootID) 264 } 265 266 func ipSecNewState(keys *ipSecKey) *netlink.XfrmState { 267 state := netlink.XfrmState{ 268 Mode: netlink.XFRM_MODE_TUNNEL, 269 Proto: netlink.XFRM_PROTO_ESP, 270 ESN: keys.ESN, 271 Spi: int(keys.Spi), 272 Reqid: keys.ReqID, 273 } 274 if keys.ESN { 275 state.ReplayWindow = 1024 276 } 277 if keys.Aead != nil { 278 state.Aead = keys.Aead 279 } else { 280 state.Crypt = keys.Crypt 281 state.Auth = keys.Auth 282 } 283 return &state 284 } 285 286 func ipSecNewPolicy() *netlink.XfrmPolicy { 287 policy := netlink.XfrmPolicy{} 288 return &policy 289 } 290 291 func ipSecAttachPolicyTempl(policy *netlink.XfrmPolicy, keys *ipSecKey, srcIP, dstIP net.IP, spi bool, optional int) { 292 tmpl := netlink.XfrmPolicyTmpl{ 293 Proto: netlink.XFRM_PROTO_ESP, 294 Mode: netlink.XFRM_MODE_TUNNEL, 295 Reqid: keys.ReqID, 296 Dst: dstIP, 297 Src: srcIP, 298 Optional: optional, 299 } 300 301 if spi { 302 tmpl.Spi = int(keys.Spi) 303 } 304 305 policy.Tmpls = append(policy.Tmpls, tmpl) 306 } 307 308 // xfrmStateReplace attempts to add a new XFRM state only if one doesn't 309 // already exist. If it doesn't but some other XFRM state conflicts, then 310 // we attempt to remove the conflicting state before trying to add again. 311 func xfrmStateReplace(log *slog.Logger, new *netlink.XfrmState, remoteRebooted bool) error { 312 states, err := xfrmStateCache.XfrmStateList() 313 if err != nil { 314 return fmt.Errorf("Cannot get XFRM state: %w", err) 315 } 316 317 scopedLog := log.With( 318 logfields.SPI, new.Spi, 319 logfields.SourceIP, new.Src, 320 logfields.DestinationIP, new.Dst, 321 logfields.TrafficDirection, getDirFromXfrmMark(new.Mark), 322 logfields.NodeID, getNodeIDAsHexFromXfrmMark(new.Mark), 323 ) 324 325 // Check if the XFRM state already exists 326 for _, s := range states { 327 if xfrmIPEqual(s.Src, new.Src) && xfrmIPEqual(s.Dst, new.Dst) && 328 xfrmMarkEqual(s.Mark, new.Mark) && s.Spi == new.Spi { 329 if !xfrmKeyEqual(&s, new) { 330 // The states are the same, including the SPI, but the 331 // encryption key changed. This is expected on upgrade because 332 // we changed the way we compute the per-node-pair key. 333 scopedLog.Info("Removing XFRM state with old IPsec key") 334 xfrmStateCache.XfrmStateDel(&s) 335 break 336 } 337 if !xfrmMarkEqual(s.OutputMark, new.OutputMark) { 338 // If only the output-marks differ, then we should be able 339 // to simply update the XFRM state atomically. 340 return xfrmStateCache.XfrmStateUpdate(new) 341 } 342 if remoteRebooted && new.ESN { 343 // This should happen only when a node reboots when the boot ID 344 // is used to compute the key (i.e., if ESN is also enabled). 345 // We can safely perform a non-atomic swap of the XFRM state 346 // for both the IN and OUT directions because: 347 // - For the IN direction, we can't leak anything. At most 348 // we'll drop a few encrypted packets while updating. 349 // - For the OUT direction, we also can't leak anything due to 350 // having an existing XFRM policy which will match and drop 351 // packets if the state is missing. At most we will drop a 352 // few encrypted packets while updating. 353 scopedLog.Info("Non-atomically updating IPsec XFRM state due to remote boot ID change") 354 xfrmStateCache.XfrmStateDel(&s) 355 break 356 } 357 return nil 358 } 359 } 360 361 var ( 362 oldXFRMOutMark = &netlink.XfrmMark{ 363 Value: ipSecXfrmMarkSetSPI(linux_defaults.RouteMarkEncrypt, uint8(new.Spi)), 364 Mask: linux_defaults.IPsecOldMarkMaskOut, 365 } 366 errs = resiliency.NewErrorSet("failed to delete old xfrm states", len(states)) 367 ) 368 for _, s := range states { 369 // This is either the XFRM OUT state or the XFRM IN state from a 370 // previous Cilium version. Because their marks match the new mark 371 // (e.g., 0xXXXX3e00/0xffffff00 ∈ 0x3e00/0xff00), the kernel considers 372 // the two states conflict and we won't be able to add the new ones 373 // until the old one is removed. 374 // 375 // Thus, we temporarily remove the old, conflicting XFRM state and 376 // re-add it in a defer. In between the removal of the old state and 377 // the addition of the new, we can have a packet drops due to the 378 // missing state. These drops should be limited to the specific node 379 // pair we are handling here and the window during which they can 380 // happen should be really small. This is also specific to the upgrade 381 // and can be removed in v1.16. 382 if s.Spi == new.Spi && xfrmIPEqual(s.Dst, new.Dst) { 383 var dir string 384 // The old XFRM IN state matches on 0.0.0.0 so it conflicts even 385 // though the source IP addresses of old and new are different. 386 // Thus, we don't need to compare source IP addresses for the IN 387 // states. 388 if xfrmIPEqual(s.Src, new.Src) && xfrmMarkEqual(s.Mark, oldXFRMOutMark) { 389 dir = "OUT" 390 } else if xfrmMarkEqual(s.Mark, oldXFRMInMark) { 391 dir = "IN" 392 } else { 393 continue 394 } 395 396 err, deferFn := xfrmTemporarilyRemoveState(scopedLog, s, dir) 397 if err != nil { 398 errs.Add(fmt.Errorf("Failed to remove old XFRM %s state %s: %w", dir, s.String(), err)) 399 } else { 400 defer deferFn() 401 } 402 } 403 } 404 if err := errs.Error(); err != nil { 405 scopedLog.Error("Failed to clean up old XFRM state", logfields.Error, err) 406 return err 407 } 408 409 // It doesn't exist so let's attempt to add it. 410 firstAttemptErr := xfrmStateCache.XfrmStateAdd(new) 411 if !os.IsExist(firstAttemptErr) { 412 return firstAttemptErr 413 } 414 scopedLog.Error("Failed to add XFRM state due to conflicting state") 415 416 // An existing state conflicts with this one. We need to remove the 417 // existing one first. 418 deletedSomething, err := xfrmDeleteConflictingState(log, states, new) 419 if err != nil { 420 return err 421 } 422 423 // If no conflicting state was found and deleted, there's no point in 424 // attempting to add again. 425 if !deletedSomething { 426 return firstAttemptErr 427 } 428 return xfrmStateCache.XfrmStateAdd(new) 429 } 430 431 // Temporarily remove an XFRM state to allow the addition of another, 432 // conflicting XFRM state. This function removes the conflicting state and 433 // prepares a defer callback to re-add it with proper logging. 434 func xfrmTemporarilyRemoveState(scopedLog *slog.Logger, state netlink.XfrmState, dir string) (error, func()) { 435 stats, err := procfs.NewXfrmStat() 436 errorCnt := 0 437 if err != nil { 438 scopedLog.Error("Error while getting XFRM stats before state removal", logfields.Error, err) 439 } else { 440 if dir == "IN" { 441 errorCnt = stats.XfrmInNoStates 442 } else { 443 errorCnt = stats.XfrmOutNoStates 444 } 445 } 446 447 start := time.Now() 448 if err := xfrmStateCache.XfrmStateDel(&state); err != nil { 449 return err, nil 450 } 451 return nil, func() { 452 if err := xfrmStateCache.XfrmStateAdd(&state); err != nil { 453 scopedLog.Error("Failed to re-add old XFRM state", 454 "directory", dir, logfields.Error, err) 455 } 456 elapsed := time.Since(start) 457 458 stats, err := procfs.NewXfrmStat() 459 if err != nil { 460 scopedLog.Error("Error while getting XFRM stats after state removal", logfields.Error, err) 461 errorCnt = 0 462 } else { 463 if dir == "IN" { 464 errorCnt = stats.XfrmInNoStates - errorCnt 465 } else { 466 errorCnt = stats.XfrmOutNoStates - errorCnt 467 } 468 } 469 scopedLog.Info("Temporarily removed old XFRM state", 470 "directory", dir, "packetsDropped", errorCnt, logfields.Duration, elapsed) 471 } 472 } 473 474 // Attempt to remove any XFRM state that conflicts with the state we just tried 475 // to add. To find those conflicting states, we need to use the same logic that 476 // the kernel used to reject our check with EEXIST. That logic is upstream in 477 // __xfrm_state_lookup. 478 func xfrmDeleteConflictingState(log *slog.Logger, states []netlink.XfrmState, new *netlink.XfrmState) (bool, error) { 479 var ( 480 deletedSomething bool 481 errs = resiliency.NewErrorSet("failed to delete conflicting XFRM states", len(states)) 482 ) 483 for _, s := range states { 484 if new.Spi == s.Spi && (new.Mark == nil) == (s.Mark == nil) && 485 (new.Mark == nil || new.Mark.Value&new.Mark.Mask&s.Mark.Mask == s.Mark.Value) && 486 xfrmIPEqual(new.Src, s.Src) && xfrmIPEqual(new.Dst, s.Dst) { 487 if err := xfrmStateCache.XfrmStateDel(&s); err != nil { 488 errs.Add(err) 489 continue 490 } 491 deletedSomething = true 492 log.Info("Removed a conflicting XFRM state", 493 logfields.SPI, s.Spi, 494 logfields.SourceIP, s.Src, 495 logfields.DestinationIP, s.Dst, 496 logfields.TrafficDirection, getDirFromXfrmMark(s.Mark), 497 logfields.NodeID, getNodeIDAsHexFromXfrmMark(s.Mark), 498 ) 499 } 500 } 501 return deletedSomething, errs.Error() 502 } 503 504 // This function compares two IP addresses and returns true if they are equal. 505 // This is unfortunately necessary because our netlink library returns nil IPv6 506 // addresses as nil IPv4 addresses and net.IP.Equal rightfully considers those 507 // are different. 508 func xfrmIPEqual(ip1, ip2 net.IP) bool { 509 if ip1.IsUnspecified() && ip2.IsUnspecified() { 510 return true 511 } 512 return ip1.Equal(ip2) 513 } 514 515 // Returns true if two XFRM marks are identical. They should be either both nil 516 // or have the same mark value and mask. 517 func xfrmMarkEqual(mark1, mark2 *netlink.XfrmMark) bool { 518 if (mark1 == nil) != (mark2 == nil) { 519 return false 520 } 521 return mark1 == nil || (mark1.Value == mark2.Value && mark1.Mask == mark2.Mask) 522 } 523 524 // Returns true if the two XFRM states have the same encryption key. 525 func xfrmKeyEqual(s1, s2 *netlink.XfrmState) bool { 526 if (s1.Aead == nil) != (s2.Aead == nil) || 527 (s1.Crypt == nil) != (s2.Crypt == nil) || 528 (s1.Auth == nil) != (s2.Auth == nil) { 529 return false 530 } 531 if s1.Aead != nil { 532 return bytes.Equal(s1.Aead.Key, s2.Aead.Key) 533 } 534 return bytes.Equal(s1.Crypt.Key, s2.Crypt.Key) && 535 bytes.Equal(s1.Auth.Key, s2.Auth.Key) 536 } 537 538 func ipSecReplaceStateIn(log *slog.Logger, localIP, remoteIP net.IP, nodeID uint16, zeroMark bool, localBootID, remoteBootID string, remoteRebooted bool, reqID int) (uint8, error) { 539 key := getNodeIPsecKey(localIP, remoteIP, localBootID, remoteBootID, netlink.XFRM_DIR_IN) 540 if key == nil { 541 return 0, fmt.Errorf("IPSec key missing") 542 } 543 key.ReqID = reqID 544 state := ipSecNewState(key) 545 state.Src = remoteIP 546 state.Dst = localIP 547 state.Mark = generateDecryptMark(linux_defaults.RouteMarkDecrypt, nodeID) 548 if zeroMark { 549 state.OutputMark = &netlink.XfrmMark{ 550 Value: 0, 551 Mask: linux_defaults.OutputMarkMask, 552 } 553 } else if reqID == EncryptedOverlayReqID { 554 state.OutputMark = &netlink.XfrmMark{ 555 Value: linux_defaults.RouteMarkDecryptedOverlay, 556 Mask: linux_defaults.OutputMarkMask, 557 } 558 } else { 559 state.OutputMark = &netlink.XfrmMark{ 560 Value: linux_defaults.RouteMarkDecrypt, 561 Mask: linux_defaults.OutputMarkMask, 562 } 563 } 564 // We want to clear the node ID regardless of zeroMark parameter. That 565 // value is never needed after decryption. 566 state.OutputMark.Mask |= linux_defaults.IPsecMarkMaskNodeID 567 568 return key.Spi, xfrmStateReplace(log, state, remoteRebooted) 569 } 570 571 func ipSecReplaceStateOut(log *slog.Logger, localIP, remoteIP net.IP, nodeID uint16, localBootID, remoteBootID string, remoteRebooted bool, reqID int) (uint8, error) { 572 key := getNodeIPsecKey(localIP, remoteIP, localBootID, remoteBootID, netlink.XFRM_DIR_OUT) 573 if key == nil { 574 return 0, fmt.Errorf("IPSec key missing") 575 } 576 key.ReqID = reqID 577 state := ipSecNewState(key) 578 state.Src = localIP 579 state.Dst = remoteIP 580 state.Mark = generateEncryptMark(key.Spi, nodeID) 581 state.OutputMark = &netlink.XfrmMark{ 582 Value: linux_defaults.RouteMarkEncrypt, 583 Mask: linux_defaults.OutputMarkMask, 584 } 585 return key.Spi, xfrmStateReplace(log, state, remoteRebooted) 586 } 587 588 func _ipSecReplacePolicyInFwd(src, dst *net.IPNet, tmplSrc, tmplDst net.IP, proxyMark bool, dir netlink.Dir, reqID int) error { 589 optional := int(0) 590 // We can use the global IPsec key here because we are not going to 591 // actually use the secret itself. 592 key := getGlobalIPsecKey(dst.IP) 593 if key == nil { 594 return fmt.Errorf("IPSec key missing") 595 } 596 key.ReqID = reqID 597 598 wildcardIP := wildcardIPv4 599 wildcardCIDR := wildcardCIDRv4 600 if tmplDst.To4() == nil { 601 wildcardIP = wildcardIPv6 602 wildcardCIDR = wildcardCIDRv6 603 } 604 605 policy := ipSecNewPolicy() 606 policy.Dir = dir 607 if dir == netlink.XFRM_DIR_IN { 608 policy.Src = src 609 policy.Dst = dst 610 policy.Mark = &netlink.XfrmMark{ 611 Mask: linux_defaults.IPsecMarkBitMask, 612 } 613 if proxyMark { 614 // We require a policy to match on packets going to the proxy which are 615 // therefore carrying the proxy mark. We however don't need a policy 616 // for the encrypted packets because there is already a state matching 617 // them. 618 policy.Mark.Value = linux_defaults.RouteMarkToProxy 619 // We must mark the IN policy for the proxy optional simply because it 620 // is lacking a corresponding state. 621 optional = 1 622 // We set the source tmpl address to 0/0 to explicit that it 623 // doesn't matter. 624 tmplSrc = wildcardIP 625 } else { 626 policy.Mark.Value = linux_defaults.RouteMarkDecrypt 627 } 628 } 629 // We always make forward rules optional. The only reason we have these 630 // at all is to appease the XFRM route hooks, we don't really care about 631 // policy because Cilium BPF programs do that. 632 if dir == netlink.XFRM_DIR_FWD { 633 optional = 1 634 policy.Priority = linux_defaults.IPsecFwdPriority 635 // In case of fwd policies, we should tell the kernel the tmpl src 636 // doesn't matter; we want all fwd packets to go through. 637 policy.Src = wildcardCIDR 638 policy.Dst = wildcardCIDR 639 } 640 ipSecAttachPolicyTempl(policy, key, tmplSrc, tmplDst, false, optional) 641 return netlink.XfrmPolicyUpdate(policy) 642 } 643 644 func ipSecReplacePolicyIn(src, dst *net.IPNet, tmplSrc, tmplDst net.IP, reqID int) error { 645 if err := _ipSecReplacePolicyInFwd(src, dst, tmplSrc, tmplDst, true, netlink.XFRM_DIR_IN, reqID); err != nil { 646 return err 647 } 648 return _ipSecReplacePolicyInFwd(src, dst, tmplSrc, tmplDst, false, netlink.XFRM_DIR_IN, reqID) 649 } 650 651 func IpSecReplacePolicyFwd(dst *net.IPNet, tmplDst net.IP, reqID int) error { 652 // The source CIDR and IP aren't used in the case of FWD policies. 653 return _ipSecReplacePolicyInFwd(nil, dst, net.IP{}, tmplDst, false, netlink.XFRM_DIR_FWD, reqID) 654 } 655 656 // Installs a catch-all policy for outgoing traffic that has the encryption 657 // bit. The goal here is to catch any traffic that may passthrough our 658 // encryption while we are replacing XFRM policies & states. Those operations 659 // cannot always be performed atomically so we may have brief moments where 660 // there is no XFRM policy to encrypt a subset of traffic. This policy ensures 661 // we drop such traffic and don't let it flow in plain text. 662 // 663 // We do need to match on the mark because there is also traffic flowing 664 // through XFRM that we don't want to encrypt (e.g., hostns traffic). 665 func IPsecDefaultDropPolicy(log *slog.Logger, ipv6 bool) error { 666 log = log.With(logfields.LogSubsys, subsystem) 667 668 defaultDropPolicy := defaultDropPolicyIPv4 669 family := netlink.FAMILY_V4 670 if ipv6 { 671 defaultDropPolicy = defaultDropPolicyIPv6 672 family = netlink.FAMILY_V6 673 } 674 675 err := netlink.XfrmPolicyUpdate(defaultDropPolicy) 676 677 // We move the existing XFRM OUT policy to a lower priority to allow the 678 // new priorities to take precedence. 679 // This code can be removed in Cilium v1.15 to instead remove the old XFRM 680 // OUT policy and state. 681 removeStaleXFRMOnce := &removeStaleIPv4XFRMOnce 682 if ipv6 { 683 removeStaleXFRMOnce = &removeStaleIPv6XFRMOnce 684 } 685 removeStaleXFRMOnce.Do(func() { 686 deprioritizeOldOutPolicy(log, family) 687 }) 688 689 return err 690 } 691 692 // Lowers the priority of the old XFRM OUT policy. We rely on the mark mask to 693 // identify it. By lowering the priority, we will allow the new XFRM OUT 694 // policies to take precedence. We cannot simply remove and replace the old 695 // XFRM OUT configs because that would cause traffic interruptions on upgrades. 696 func deprioritizeOldOutPolicy(log *slog.Logger, family int) { 697 policies, err := netlink.XfrmPolicyList(family) 698 if err != nil { 699 log.Error("Cannot get XFRM policies", logfields.Error, err) 700 } 701 for _, p := range policies { 702 if p.Dir == netlink.XFRM_DIR_OUT && p.Mark.Mask == linux_defaults.IPsecOldMarkMaskOut { 703 p.Priority = oldXFRMOutPolicyPriority 704 if err := netlink.XfrmPolicyUpdate(&p); err != nil { 705 log.Error("Failed to deprioritize old XFRM policy", 706 logfields.Error, err, 707 logfields.SourceCIDR, p.Src, 708 logfields.DestinationCIDR, p.Dst, 709 logfields.TrafficDirection, getDirFromXfrmMark(p.Mark), 710 logfields.NodeID, getNodeIDAsHexFromXfrmMark(p.Mark), 711 ) 712 } 713 } 714 } 715 } 716 717 // ipSecXfrmMarkSetSPI takes a XfrmMark base value, an SPI, returns the mark 718 // value with the SPI value encoded in it 719 func ipSecXfrmMarkSetSPI(markValue uint32, spi uint8) uint32 { 720 return markValue | (uint32(spi) << linux_defaults.IPsecXFRMMarkSPIShift) 721 } 722 723 func getNodeIDAsHexFromXfrmMark(mark *netlink.XfrmMark) string { 724 return fmt.Sprintf("0x%x", ipsec.GetNodeIDFromXfrmMark(mark)) 725 } 726 727 func getDirFromXfrmMark(mark *netlink.XfrmMark) dir { 728 switch { 729 case mark == nil: 730 return dirUnspec 731 case mark.Value&linux_defaults.RouteMarkDecrypt != 0: 732 return dirIngress 733 case mark.Value&linux_defaults.RouteMarkEncrypt != 0: 734 return dirEgress 735 } 736 return dirUnspec 737 } 738 739 func generateEncryptMark(spi uint8, nodeID uint16) *netlink.XfrmMark { 740 val := ipSecXfrmMarkSetSPI(linux_defaults.RouteMarkEncrypt, spi) 741 val |= uint32(nodeID) << 16 742 return &netlink.XfrmMark{ 743 Value: val, 744 Mask: linux_defaults.IPsecMarkMaskOut, 745 } 746 } 747 748 func generateDecryptMark(decryptBit uint32, nodeID uint16) *netlink.XfrmMark { 749 val := decryptBit | (uint32(nodeID) << 16) 750 return &netlink.XfrmMark{ 751 Value: val, 752 Mask: linux_defaults.IPsecMarkMaskIn, 753 } 754 } 755 756 func ipSecReplacePolicyOut(src, dst *net.IPNet, tmplSrc, tmplDst net.IP, nodeID uint16, dir IPSecDir, reqID int) error { 757 // TODO: Remove old policy pointing to target net 758 759 // We can use the global IPsec key here because we are not going to 760 // actually use the secret itself. 761 key := getGlobalIPsecKey(dst.IP) 762 if key == nil { 763 return fmt.Errorf("IPSec key missing") 764 } 765 key.ReqID = reqID 766 767 policy := ipSecNewPolicy() 768 if dir == IPSecDirOutNode { 769 policy.Src = wildcardCIDRv4 770 } else { 771 policy.Src = src 772 } 773 policy.Dst = dst 774 policy.Dir = netlink.XFRM_DIR_OUT 775 policy.Mark = generateEncryptMark(key.Spi, nodeID) 776 ipSecAttachPolicyTempl(policy, key, tmplSrc, tmplDst, true, 0) 777 return netlink.XfrmPolicyUpdate(policy) 778 } 779 780 // Returns true if the given mark matches on the node ID. This works because 781 // the node ID match is always in the first 16 bits. 782 func matchesOnNodeID(mark *netlink.XfrmMark) bool { 783 return mark != nil && 784 mark.Mask&linux_defaults.IPsecMarkMaskNodeID == linux_defaults.IPsecMarkMaskNodeID 785 } 786 787 func ipsecDeleteXfrmState(log *slog.Logger, nodeID uint16) error { 788 scopedLog := log.With( 789 logfields.NodeID, nodeID, 790 ) 791 792 xfrmStateList, err := xfrmStateCache.XfrmStateList() 793 if err != nil { 794 scopedLog.Warn("Failed to list XFRM states for deletion", logfields.Error, err) 795 return err 796 } 797 798 xfrmStatesToDelete := []netlink.XfrmState{} 799 oldXfrmInStates := map[oldXfrmStateKey]netlink.XfrmState{} 800 for _, s := range xfrmStateList { 801 if matchesOnNodeID(s.Mark) && ipsec.GetNodeIDFromXfrmMark(s.Mark) == nodeID { 802 xfrmStatesToDelete = append(xfrmStatesToDelete, s) 803 } 804 if xfrmMarkEqual(s.Mark, oldXFRMInMark) { 805 key := oldXfrmStateKey{ 806 Spi: s.Spi, 807 Dst: [16]byte(s.Dst.To16()), 808 } 809 oldXfrmInStates[key] = s 810 } 811 } 812 813 errs := resiliency.NewErrorSet(fmt.Sprintf("failed to delete node (%d) xfrm states", nodeID), len(xfrmStateList)) 814 for _, s := range xfrmStatesToDelete { 815 key := oldXfrmStateKey{ 816 Spi: s.Spi, 817 Dst: [16]byte(s.Dst.To16()), 818 } 819 var oldXfrmInState *netlink.XfrmState = nil 820 old, ok := oldXfrmInStates[key] 821 if ok { 822 oldXfrmInState = &old 823 } 824 if err := safeDeleteXfrmState(log, &s, oldXfrmInState); err != nil { 825 errs.Add(fmt.Errorf("failed to delete xfrm state (%s): %w", s.String(), err)) 826 } 827 } 828 829 return errs.Error() 830 } 831 832 // safeDeleteXfrmState deletes the given XFRM state. Specifically, if the 833 // state is to catch ingress traffic marked with nodeID (0xXXXX0d00), we 834 // temporarily remove the old XFRM state that matches 0xd00/0xf00. This is to 835 // workaround a kernel issue that prevents us from deleting a specific XFRM 836 // state (e.g. catching 0xXXXX0d00/0xffff0f00) when there is also a general 837 // xfrm state (e.g. catching 0xd00/0xf00). When both XFRM states coexist, 838 // kernel deletes the general XFRM state instead of the specific one, even if 839 // the deleting request is for the specific one. 840 func safeDeleteXfrmState(log *slog.Logger, state *netlink.XfrmState, oldState *netlink.XfrmState) (err error) { 841 if getDirFromXfrmMark(state.Mark) == dirIngress && ipsec.GetNodeIDFromXfrmMark(state.Mark) != 0 && oldState != nil { 842 843 errs := resiliency.NewErrorSet("failed to delete old xfrm states", 1) 844 845 scopedLog := log.With( 846 logfields.SPI, state.Spi, 847 logfields.SourceIP, state.Src, 848 logfields.DestinationIP, state.Dst, 849 logfields.TrafficDirection, getDirFromXfrmMark(state.Mark), 850 logfields.NodeID, getNodeIDAsHexFromXfrmMark(state.Mark), 851 ) 852 853 err, deferFn := xfrmTemporarilyRemoveState(scopedLog, *oldState, string(dirIngress)) 854 if err != nil { 855 errs.Add(fmt.Errorf("Failed to remove old XFRM %s state %s: %w", string(dirIngress), oldState.String(), err)) 856 } else { 857 defer deferFn() 858 } 859 if err := errs.Error(); err != nil { 860 scopedLog.Error("Failed to clean up old XFRM state", logfields.Error, err) 861 return err 862 } 863 } 864 865 return xfrmStateCache.XfrmStateDel(state) 866 } 867 868 func ipsecDeleteXfrmPolicy(log *slog.Logger, nodeID uint16) error { 869 scopedLog := log.With( 870 logfields.NodeID, nodeID, 871 ) 872 873 xfrmPolicyList, err := netlink.XfrmPolicyList(netlink.FAMILY_ALL) 874 if err != nil { 875 scopedLog.Warn("Failed to list XFRM policies for deletion", logfields.Error, err) 876 return fmt.Errorf("failed to list xfrm policies: %w", err) 877 } 878 errs := resiliency.NewErrorSet("failed to delete xfrm policies", len(xfrmPolicyList)) 879 for _, p := range xfrmPolicyList { 880 if matchesOnNodeID(p.Mark) && ipsec.GetNodeIDFromXfrmMark(p.Mark) == nodeID { 881 if err := netlink.XfrmPolicyDel(&p); err != nil { 882 errs.Add(fmt.Errorf("unable to delete xfrm policy %s: %w", p.String(), err)) 883 } 884 } 885 } 886 if err := errs.Error(); err != nil { 887 scopedLog.Warn("Failed to delete XFRM policy", logfields.Error, err) 888 return err 889 } 890 891 return nil 892 } 893 894 /* UpsertIPsecEndpoint updates the IPSec context for a new endpoint inserted in 895 * the ipcache. Currently we support a global crypt/auth keyset that will encrypt 896 * all traffic between endpoints. An IPSec context consists of two pieces a policy 897 * and a state, the security policy database (SPD) and security association 898 * database (SAD). These are implemented using the Linux kernels XFRM implementation. 899 * 900 * For all traffic that matches a policy, the policy tuple used is 901 * (sip/mask, dip/mask, dev) with an optional mark field used in the Cilium implementation 902 * to ensure only expected traffic is encrypted. The state hashtable is searched for 903 * a matching state associated with that flow. The Linux kernel will do a series of 904 * hash lookups to find the most specific state (xfrm_dst) possible. The hash keys searched are 905 * the following, (daddr, saddr, reqid, encap_family), (daddr, wildcard, reqid, encap), 906 * (mark, daddr, spi, proto, encap). Any "hits" in the hash table will subsequently 907 * have the SPI checked to ensure it also matches. Encap is ignored in our case here 908 * and can be used with UDP encap if wanted. 909 * 910 * The implications of the (inflexible!) hash key implementation is that in-order 911 * to have a policy/state match we _must_ insert a state for each daddr. For Cilium 912 * this translates to a state entry per node. We learn the nodes/endpoints by 913 * listening to ipcache events. Finally, because IPSec is unidirectional a state 914 * is needed for both ingress and egress. Denoted by the DIR on the xfrm cmd line 915 * in the policy lookup. In the Cilium case, where we have IPSec between all 916 * endpoints this results in two policy rules per node, one for ingress 917 * and one for egress. 918 * 919 * For a concrete example consider two cluster nodes using transparent mode e.g. 920 * without an IPSec tunnel IP. Cluster Node A has host_ip 10.156.0.1 with an 921 * endpoint assigned to IP 10.156.2.2 and cluster Node B has host_ip 10.182.0.1 922 * with an endpoint using IP 10.182.3.3. Then on Node A there will be a two policy 923 * entries and a set of State entries, 924 * 925 * Policy1(src=10.182.0.0/16,dst=10.156.0.1/16,dir=in,tmpl(spi=#spi,reqid=#reqid)) 926 * Policy2(src=10.156.0.0/16,dst=10.182.0.1/16,dir=out,tmpl(spi=#spi,reqid=#reqid)) 927 * State1(src=*,dst=10.182.0.1,spi=#spi,reqid=#reqid,...) 928 * State2(src=*,dst=10.156.0.1,spi=#spi,reqid=#reqid,...) 929 * 930 * Design Note: For newer kernels a BPF xfrm interface would greatly simplify the 931 * state space. Basic idea would be to reference a state using any key generated 932 * from BPF program allowing for a single state per security ctx. 933 */ 934 func UpsertIPsecEndpoint(log *slog.Logger, local, remote *net.IPNet, outerLocal, outerRemote net.IP, remoteNodeID uint16, remoteBootID string, dir IPSecDir, outputMark, remoteRebooted bool, reqID int) (uint8, error) { 935 log = log.With(logfields.LogSubsys, subsystem) 936 937 var spi uint8 938 var err error 939 940 /* TODO: state reference ID is (dip,spi) which can be duplicated in the current global 941 * mode. The duplication is on _all_ ingress states because dst_ip == host_ip in this 942 * case and only a single spi entry is in use. Currently no check is done to avoid 943 * attempting to add duplicate (dip,spi) states and we get 'file exist' error. These 944 * errors are expected at the moment but perhaps it would be better to avoid calling 945 * netlink API at all when we "know" an entry is a duplicate. To do this the xfer 946 * state would need to be cached in the ipcache. 947 */ 948 if !outerLocal.Equal(outerRemote) { 949 localBootID := node.GetBootID() 950 if dir == IPSecDirIn || dir == IPSecDirBoth { 951 if spi, err = ipSecReplaceStateIn(log, outerLocal, outerRemote, remoteNodeID, outputMark, localBootID, remoteBootID, remoteRebooted, reqID); err != nil { 952 return 0, fmt.Errorf("unable to replace local state: %w", err) 953 } 954 if err = ipSecReplacePolicyIn(remote, local, outerRemote, outerLocal, reqID); err != nil { 955 if !os.IsExist(err) { 956 return 0, fmt.Errorf("unable to replace policy in: %w", err) 957 } 958 } 959 if err = IpSecReplacePolicyFwd(local, outerLocal, reqID); err != nil { 960 if !os.IsExist(err) { 961 return 0, fmt.Errorf("unable to replace policy fwd: %w", err) 962 } 963 } 964 } 965 966 if dir == IPSecDirOut || dir == IPSecDirOutNode || dir == IPSecDirBoth { 967 if spi, err = ipSecReplaceStateOut(log, outerLocal, outerRemote, remoteNodeID, localBootID, remoteBootID, remoteRebooted, reqID); err != nil { 968 return 0, fmt.Errorf("unable to replace remote state: %w", err) 969 } 970 971 if err = ipSecReplacePolicyOut(local, remote, outerLocal, outerRemote, remoteNodeID, dir, reqID); err != nil { 972 if !os.IsExist(err) { 973 return 0, fmt.Errorf("unable to replace policy out: %w", err) 974 } 975 } 976 } 977 } 978 return spi, nil 979 } 980 981 // UpsertIPsecEndpointPolicy adds a policy to the xfrm rules. Used to add a policy when the state 982 // rule is already available. 983 func UpsertIPsecEndpointPolicy(local, remote *net.IPNet, localTmpl, remoteTmpl net.IP, remoteNodeID uint16, dir IPSecDir, reqID int) error { 984 if err := ipSecReplacePolicyOut(local, remote, localTmpl, remoteTmpl, remoteNodeID, dir, reqID); err != nil { 985 if !os.IsExist(err) { 986 return fmt.Errorf("unable to replace templated policy out: %w", err) 987 } 988 } 989 return nil 990 } 991 992 // DeleteIPsecEndpoint deletes a endpoint associated with the remote IP address 993 func DeleteIPsecEndpoint(log *slog.Logger, nodeID uint16) error { 994 log = log.With(logfields.LogSubsys, subsystem) 995 return errors.Join(ipsecDeleteXfrmState(log, nodeID), ipsecDeleteXfrmPolicy(log, nodeID)) 996 } 997 998 func isXfrmPolicyCilium(policy netlink.XfrmPolicy) bool { 999 if policy.Mark == nil { 1000 // Check if its our fwd rule, we don't have a mark 1001 // on this rule so use priority. 1002 if policy.Dir == netlink.XFRM_DIR_FWD && 1003 policy.Priority == linux_defaults.IPsecFwdPriority { 1004 return true 1005 } 1006 return false 1007 } 1008 1009 if (policy.Mark.Value & linux_defaults.RouteMarkDecrypt) != 0 { 1010 return true 1011 } 1012 if (policy.Mark.Value & linux_defaults.RouteMarkEncrypt) != 0 { 1013 return true 1014 } 1015 return false 1016 } 1017 1018 func isXfrmStateCilium(state netlink.XfrmState) bool { 1019 if state.Mark == nil { 1020 return false 1021 } 1022 if (state.Mark.Value & linux_defaults.RouteMarkDecrypt) != 0 { 1023 return true 1024 } 1025 if (state.Mark.Value & linux_defaults.RouteMarkEncrypt) != 0 { 1026 return true 1027 } 1028 return false 1029 } 1030 1031 // DeleteXFRM remove any remaining XFRM policy or state from tables 1032 func DeleteXFRM(log *slog.Logger) error { 1033 return DeleteXFRMWithReqID(log, 0) 1034 } 1035 1036 // DeleteXFRMWithReqID remove any XFRM policy or state from tables which matches the reqID 1037 // If reqID is 0, it will remove all XFRM policy or state 1038 func DeleteXFRMWithReqID(log *slog.Logger, reqID int) error { 1039 log = log.With(logfields.LogSubsys, subsystem) 1040 1041 xfrmPolicyList, err := netlink.XfrmPolicyList(netlink.FAMILY_ALL) 1042 if err != nil { 1043 return err 1044 } 1045 1046 ee := resiliency.NewErrorSet("failed to delete XFRM policies", len(xfrmPolicyList)) 1047 policy: 1048 for _, p := range xfrmPolicyList { 1049 if !isXfrmPolicyCilium(p) { 1050 continue 1051 } 1052 1053 // check if there exists a template with req ID as the one we are looking for 1054 // if so, delete the policy. 1055 for _, tmpl := range p.Tmpls { 1056 if reqID == 0 || tmpl.Reqid == reqID { 1057 if err := netlink.XfrmPolicyDel(&p); err != nil { 1058 ee.Add(err) 1059 } 1060 continue policy 1061 } 1062 } 1063 } 1064 if err := ee.Error(); err != nil { 1065 return err 1066 } 1067 1068 xfrmStateList, err := xfrmStateCache.XfrmStateList() 1069 if err != nil { 1070 log.Warn("unable to fetch xfrm state list", logfields.Error, err) 1071 return err 1072 } 1073 ee = resiliency.NewErrorSet("failed to delete XFRM states", len(xfrmStateList)) 1074 for _, s := range xfrmStateList { 1075 if isXfrmStateCilium(s) && (reqID == 0 || s.Reqid == reqID) { 1076 if err := xfrmStateCache.XfrmStateDel(&s); err != nil { 1077 ee.Add(err) 1078 } 1079 } 1080 } 1081 1082 return ee.Error() 1083 } 1084 1085 func decodeIPSecKey(keyRaw string) (int, []byte, error) { 1086 // As we have released the v1.4.0 docs telling the users to write the 1087 // k8s secret with the prefix "0x" we have to remove it if it is present, 1088 // so we can decode the secret. 1089 if keyRaw == "\"\"" { 1090 return 0, nil, nil 1091 } 1092 keyTrimmed := strings.TrimPrefix(keyRaw, "0x") 1093 key, err := hex.DecodeString(keyTrimmed) 1094 return len(keyTrimmed), key, err 1095 } 1096 1097 // LoadIPSecKeysFile imports IPSec auth and crypt keys from a file. The format 1098 // is to put a key per line as follows, (auth-algo auth-key enc-algo enc-key) 1099 // Returns the authentication overhead in bytes, the key ID, and an error. 1100 func LoadIPSecKeysFile(log *slog.Logger, path string) (int, uint8, error) { 1101 log.Info("Loading IPsec keyfile", 1102 logfields.Path, path, 1103 logfields.LogSubsys, subsystem, 1104 ) 1105 1106 file, err := os.Open(path) 1107 if err != nil { 1108 return 0, 0, err 1109 } 1110 defer file.Close() 1111 return LoadIPSecKeys(log, file) 1112 } 1113 1114 func LoadIPSecKeys(log *slog.Logger, r io.Reader) (int, uint8, error) { 1115 log = log.With(logfields.LogSubsys, subsystem) 1116 var spi uint8 1117 var keyLen int 1118 1119 ipSecLock.Lock() 1120 defer ipSecLock.Unlock() 1121 1122 if err := encrypt.MapCreate(); err != nil { 1123 return 0, 0, fmt.Errorf("Encrypt map create failed: %w", err) 1124 } 1125 1126 scanner := bufio.NewScanner(r) 1127 scanner.Split(bufio.ScanLines) 1128 for scanner.Scan() { 1129 var ( 1130 oldSpi uint8 1131 aeadKey []byte 1132 authKey []byte 1133 esn bool 1134 err error 1135 offsetBase int 1136 ) 1137 1138 ipSecKey := &ipSecKey{ 1139 ReqID: DefaultReqID, 1140 } 1141 1142 // Scanning IPsec keys with one of the following formats: 1143 // 1. [spi] aead-algo aead-key icv-len 1144 // 2. [spi] auth-algo auth-key enc-algo enc-key [IP] 1145 s := strings.Split(scanner.Text(), " ") 1146 if len(s) < 3 { 1147 // Regardless of the format used, the IPsec secret should have at 1148 // least 3 fields separated by white spaces. 1149 return 0, 0, fmt.Errorf("missing IPSec key or invalid format") 1150 } 1151 1152 spi, offsetBase, esn, err = parseSPI(log, s[offsetSPI]) 1153 if err != nil { 1154 return 0, 0, fmt.Errorf("failed to parse SPI: %w", err) 1155 } 1156 1157 if len(s) > offsetBase+maxOffset+1 { 1158 return 0, 0, fmt.Errorf("invalid format: too many fields in the IPsec secret") 1159 } else if len(s) == offsetBase+offsetICV+1 { 1160 // We're in the first case, with "[spi] aead-algo aead-key icv-len". 1161 aeadName := s[offsetBase+offsetAeadAlgo] 1162 if !strings.HasPrefix(aeadName, "rfc") { 1163 return 0, 0, fmt.Errorf("invalid AEAD algorithm %q", aeadName) 1164 } 1165 1166 _, aeadKey, err = decodeIPSecKey(s[offsetBase+offsetAeadKey]) 1167 if err != nil { 1168 return 0, 0, fmt.Errorf("unable to decode AEAD key string %q", s[offsetBase+offsetAeadKey]) 1169 } 1170 1171 icvLen, err := strconv.Atoi(s[offsetICV+offsetBase]) 1172 if err != nil { 1173 return 0, 0, fmt.Errorf("ICV length is invalid or missing") 1174 } 1175 1176 if icvLen != 96 && icvLen != 128 && icvLen != 256 { 1177 return 0, 0, fmt.Errorf("only ICV lengths 96, 128, and 256 are accepted") 1178 } 1179 1180 ipSecKey.Aead = &netlink.XfrmStateAlgo{ 1181 Name: aeadName, 1182 Key: aeadKey, 1183 ICVLen: icvLen, 1184 } 1185 keyLen = icvLen / 8 1186 } else { 1187 // We're in the second case, with "[spi] auth-algo auth-key enc-algo enc-key [IP]". 1188 authAlgo := s[offsetBase+offsetAuthAlgo] 1189 keyLen, authKey, err = decodeIPSecKey(s[offsetBase+offsetAuthKey]) 1190 if err != nil { 1191 return 0, 0, fmt.Errorf("unable to decode authentication key string %q", s[offsetBase+offsetAuthKey]) 1192 } 1193 1194 encAlgo := s[offsetBase+offsetEncAlgo] 1195 _, encKey, err := decodeIPSecKey(s[offsetBase+offsetEncKey]) 1196 if err != nil { 1197 return 0, 0, fmt.Errorf("unable to decode encryption key string %q", s[offsetBase+offsetEncKey]) 1198 } 1199 1200 ipSecKey.Auth = &netlink.XfrmStateAlgo{ 1201 Name: authAlgo, 1202 Key: authKey, 1203 } 1204 ipSecKey.Crypt = &netlink.XfrmStateAlgo{ 1205 Name: encAlgo, 1206 Key: encKey, 1207 } 1208 } 1209 1210 ipSecKey.Spi = spi 1211 ipSecKey.ESN = esn 1212 1213 if len(s) == offsetBase+offsetIP+1 { 1214 // The IPsec secret has the optional IP address field at the end. 1215 log.Warn("IPsec secrets with an IP address as the last argument are deprecated and will be unsupported in v1.13.") 1216 if ipSecKeysGlobal[s[offsetBase+offsetIP]] != nil { 1217 oldSpi = ipSecKeysGlobal[s[offsetBase+offsetIP]].Spi 1218 } 1219 ipSecKeysGlobal[s[offsetBase+offsetIP]] = ipSecKey 1220 } else { 1221 if ipSecKeysGlobal[""] != nil { 1222 oldSpi = ipSecKeysGlobal[""].Spi 1223 } 1224 ipSecKeysGlobal[""] = ipSecKey 1225 } 1226 1227 ipSecKeysRemovalTime[oldSpi] = time.Now() 1228 ipSecCurrentKeySPI = spi 1229 } 1230 return keyLen, spi, nil 1231 } 1232 1233 func parseSPI(log *slog.Logger, spiStr string) (uint8, int, bool, error) { 1234 esn := false 1235 if spiStr[len(spiStr)-1] == '+' { 1236 esn = true 1237 spiStr = spiStr[:len(spiStr)-1] 1238 } 1239 spi, err := strconv.Atoi(spiStr) 1240 if err != nil { 1241 // If no version info is provided assume using key format without 1242 // versioning and assign SPI. 1243 log.Warn("IPsec secrets without an SPI as the first argument are deprecated and will be unsupported in v1.13.") 1244 return 1, -1, esn, nil 1245 } 1246 if spi > linux_defaults.IPsecMaxKeyVersion { 1247 return 0, 0, false, fmt.Errorf("encryption key space exhausted. ID must be nonzero and less than %d. Attempted %q", linux_defaults.IPsecMaxKeyVersion+1, spiStr) 1248 } 1249 if spi == 0 { 1250 return 0, 0, false, fmt.Errorf("zero is not a valid key ID. ID must be nonzero and less than %d. Attempted %q", linux_defaults.IPsecMaxKeyVersion+1, spiStr) 1251 } 1252 if !esn { 1253 log.Warn(fmt.Sprintf("global IPsec keys are deprecated and will be removed in v1.17. Use per-tunnel keys instead by adding a '+' sign after the SPI (%d+ in your case).", spi)) 1254 } 1255 return uint8(spi), 0, esn, nil 1256 } 1257 1258 func SetIPSecSPI(log *slog.Logger, spi uint8) error { 1259 log = log.With(logfields.LogSubsys, subsystem) 1260 if err := encrypt.MapUpdateContext(0, spi); err != nil { 1261 log.Warn("cilium_encrypt_state map updated failed", logfields.Error, err) 1262 return err 1263 } 1264 return nil 1265 } 1266 1267 // DeleteIPsecEncryptRoute removes nodes in main routing table by walking 1268 // routes and matching route protocol type. 1269 func DeleteIPsecEncryptRoute(log *slog.Logger) { 1270 log = log.With(logfields.LogSubsys, subsystem) 1271 filter := &netlink.Route{ 1272 Protocol: route.EncryptRouteProtocol, 1273 } 1274 1275 for _, family := range []int{netlink.FAMILY_V4, netlink.FAMILY_V6} { 1276 routes, err := netlink.RouteListFiltered(family, filter, netlink.RT_FILTER_PROTOCOL) 1277 if err != nil { 1278 log.Error("Unable to list ipsec encrypt routes", logfields.Error, err) 1279 return 1280 } 1281 1282 for _, rt := range routes { 1283 if err := netlink.RouteDel(&rt); err != nil { 1284 log.Warn("Unable to delete ipsec encrypt route", "route", rt.String(), logfields.Error, err) 1285 } 1286 } 1287 } 1288 } 1289 1290 func keyfileWatcher(log *slog.Logger, ctx context.Context, watcher *fswatcher.Watcher, keyfilePath string, nodeHandler datapath.NodeHandler, health cell.Health) error { 1291 for { 1292 select { 1293 case event := <-watcher.Events: 1294 if event.Op&(fsnotify.Create|fsnotify.Write) == 0 { 1295 continue 1296 } 1297 1298 _, spi, err := LoadIPSecKeysFile(log, keyfilePath) 1299 if err != nil { 1300 health.Degraded(fmt.Sprintf("Failed to load keyfile %q", keyfilePath), err) 1301 log.Error("Failed to load IPsec keyfile", logfields.Error, err) 1302 continue 1303 } 1304 1305 // Update the IPSec key identity in the local node. 1306 // This will set addrs.ipsecKeyIdentity in the node 1307 // package, and eventually trigger an update to 1308 // publish the updated information to k8s/kvstore. 1309 node.SetIPsecKeyIdentity(spi) 1310 1311 // AllNodeValidateImplementation will eventually call 1312 // nodeUpdate(), which is responsible for updating the 1313 // IPSec policies and states for all the different EPs 1314 // with ipsec.UpsertIPsecEndpoint() 1315 nodeHandler.AllNodeValidateImplementation() 1316 1317 // Push SPI update into BPF datapath now that XFRM state 1318 // is configured. 1319 if err := SetIPSecSPI(log, spi); err != nil { 1320 health.Degraded("Failed to set IPsec SPI", err) 1321 log.Error("Failed to set IPsec SPI", logfields.Error, err) 1322 continue 1323 } 1324 health.OK("Watching keyfiles") 1325 case err := <-watcher.Errors: 1326 log.Warn("Error encountered while watching file with fsnotify", 1327 logfields.Error, err, 1328 logfields.Path, keyfilePath, 1329 ) 1330 1331 case <-ctx.Done(): 1332 health.Stopped("Context done") 1333 watcher.Close() 1334 return nil 1335 } 1336 } 1337 } 1338 1339 func StartKeyfileWatcher(log *slog.Logger, group job.Group, keyfilePath string, nodeHandler datapath.NodeHandler) error { 1340 if !option.Config.EnableIPsecKeyWatcher { 1341 return nil 1342 } 1343 1344 watcher, err := fswatcher.New([]string{keyfilePath}) 1345 if err != nil { 1346 return err 1347 } 1348 1349 group.Add(job.OneShot("keyfile-watcher", func(ctx context.Context, health cell.Health) error { 1350 return keyfileWatcher(log, ctx, watcher, keyfilePath, nodeHandler, health) 1351 })) 1352 1353 return nil 1354 } 1355 1356 // ipSecSPICanBeReclaimed is used to test whether a given SPI can be reclaimed 1357 // or not (i.e. if it's not in use, and if not, if enough time has passed since 1358 // when it was replaced by a newer one). 1359 // 1360 // In addition to the SPI, this function takes also a reclaimTimestamp 1361 // parameter which represents the time at which we started reclaiming old keys. 1362 // This is needed as we need to test the same SPI multiple times (since for any 1363 // given SPI there are multiple policies and states associated with it), and we 1364 // don't want to get inconsistent results because we are calling time.Now() 1365 // directly in this function. 1366 func ipSecSPICanBeReclaimed(spi uint8, reclaimTimestamp time.Time) bool { 1367 // The SPI associated with the key currently in use should not be reclaimed 1368 if spi == ipSecCurrentKeySPI { 1369 return false 1370 } 1371 1372 // Otherwise retrieve the time at which the key for the given SPI was removed 1373 keyRemovalTime, ok := ipSecKeysRemovalTime[spi] 1374 if !ok { 1375 // If not found in the keyRemovalTime map, assume the key was 1376 // deleted just now. 1377 // In this way if the agent gets restarted before an old key is 1378 // removed we will always wait at least IPsecKeyRotationDuration time 1379 // before reclaiming it 1380 ipSecKeysRemovalTime[spi] = time.Now() 1381 1382 return false 1383 } 1384 1385 // If the key was deleted less than the IPSec key deletion delay 1386 // time ago, it should not be reclaimed 1387 if reclaimTimestamp.Sub(keyRemovalTime) < option.Config.IPsecKeyRotationDuration { 1388 return false 1389 } 1390 1391 return true 1392 } 1393 1394 func deleteStaleXfrmStates(reclaimTimestamp time.Time) error { 1395 xfrmStateList, err := xfrmStateCache.XfrmStateList() 1396 if err != nil { 1397 return err 1398 } 1399 1400 errs := resiliency.NewErrorSet("failed to delete stale xfrm states", len(xfrmStateList)) 1401 for _, s := range xfrmStateList { 1402 stateSPI := uint8(s.Spi) 1403 if !ipSecSPICanBeReclaimed(stateSPI, reclaimTimestamp) { 1404 continue 1405 } 1406 if err := xfrmStateCache.XfrmStateDel(&s); err != nil { 1407 errs.Add(fmt.Errorf("failed to delete stale xfrm state spi (%d): %w", stateSPI, err)) 1408 } 1409 } 1410 1411 return errs.Error() 1412 } 1413 1414 func deleteStaleXfrmPolicies(log *slog.Logger, reclaimTimestamp time.Time) error { 1415 scopedLog := log.With(logfields.SPI, ipSecCurrentKeySPI) 1416 1417 xfrmPolicyList, err := netlink.XfrmPolicyList(netlink.FAMILY_ALL) 1418 if err != nil { 1419 return err 1420 } 1421 1422 errs := resiliency.NewErrorSet("failed to delete stale xfrm policies", len(xfrmPolicyList)) 1423 for _, p := range xfrmPolicyList { 1424 policySPI := ipsec.GetSPIFromXfrmPolicy(&p) 1425 if !ipSecSPICanBeReclaimed(policySPI, reclaimTimestamp) { 1426 continue 1427 } 1428 1429 // Only OUT XFRM policies depend on the SPI 1430 if p.Dir != netlink.XFRM_DIR_OUT { 1431 continue 1432 } 1433 1434 if isDefaultDropPolicy(&p) { 1435 continue 1436 } 1437 1438 scopedLog.Info("Deleting stale XFRM policy", 1439 logfields.OldSPI, policySPI, 1440 logfields.SourceIP, p.Src, 1441 logfields.DestinationIP, p.Dst, 1442 logfields.TrafficDirection, getDirFromXfrmMark(p.Mark), 1443 logfields.NodeID, getNodeIDAsHexFromXfrmMark(p.Mark), 1444 ) 1445 if err := netlink.XfrmPolicyDel(&p); err != nil { 1446 errs.Add(fmt.Errorf("failed to delete stale xfrm policy spi (%d): %w", policySPI, err)) 1447 } 1448 } 1449 1450 return errs.Error() 1451 } 1452 1453 func isDefaultDropPolicy(p *netlink.XfrmPolicy) bool { 1454 return equalDefaultDropPolicy(defaultDropPolicyIPv4, p) || 1455 equalDefaultDropPolicy(defaultDropPolicyIPv6, p) 1456 } 1457 1458 func equalDefaultDropPolicy(defaultDropPolicy, p *netlink.XfrmPolicy) bool { 1459 return p.Priority == defaultDropPolicy.Priority && 1460 p.Action == defaultDropPolicy.Action && 1461 p.Dir == defaultDropPolicy.Dir && 1462 xfrmMarkEqual(p.Mark, defaultDropPolicy.Mark) && 1463 p.Src.String() == defaultDropPolicy.Src.String() && 1464 p.Dst.String() == defaultDropPolicy.Dst.String() 1465 } 1466 1467 type staleKeyReclaimer struct { 1468 log *slog.Logger 1469 } 1470 1471 func (skr staleKeyReclaimer) onTimer(ctx context.Context) error { 1472 ipSecLock.Lock() 1473 defer ipSecLock.Unlock() 1474 1475 // In case no IPSec key has been loaded yet, don't try to reclaim any 1476 // old key 1477 if ipSecCurrentKeySPI == 0 { 1478 return nil 1479 } 1480 1481 reclaimTimestamp := time.Now() 1482 1483 scopedLog := skr.log.With(logfields.SPI, ipSecCurrentKeySPI) 1484 if err := deleteStaleXfrmStates(reclaimTimestamp); err != nil { 1485 scopedLog.Warn("Failed to delete stale XFRM states", logfields.Error, err) 1486 return err 1487 } 1488 if err := deleteStaleXfrmPolicies(skr.log, reclaimTimestamp); err != nil { 1489 scopedLog.Warn("Failed to delete stale XFRM policies", logfields.Error, err) 1490 return err 1491 } 1492 1493 return nil 1494 }