github.com/cilium/cilium@v1.16.2/pkg/datapath/linux/ipsec/ipsec_linux.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  //go:build linux
     5  
     6  package ipsec
     7  
     8  import (
     9  	"bufio"
    10  	"bytes"
    11  	"context"
    12  	"crypto/sha256"
    13  	"crypto/sha512"
    14  	"encoding/hex"
    15  	"errors"
    16  	"fmt"
    17  	"io"
    18  	"log/slog"
    19  	"net"
    20  	"os"
    21  	"strconv"
    22  	"strings"
    23  	"sync"
    24  
    25  	"github.com/cilium/hive/cell"
    26  	"github.com/cilium/hive/job"
    27  	"github.com/fsnotify/fsnotify"
    28  	"github.com/prometheus/procfs"
    29  	"github.com/vishvananda/netlink"
    30  
    31  	"github.com/cilium/cilium/pkg/common/ipsec"
    32  	"github.com/cilium/cilium/pkg/datapath/linux/linux_defaults"
    33  	"github.com/cilium/cilium/pkg/datapath/linux/route"
    34  	datapath "github.com/cilium/cilium/pkg/datapath/types"
    35  	"github.com/cilium/cilium/pkg/fswatcher"
    36  	"github.com/cilium/cilium/pkg/lock"
    37  	"github.com/cilium/cilium/pkg/logging/logfields"
    38  	"github.com/cilium/cilium/pkg/maps/encrypt"
    39  	"github.com/cilium/cilium/pkg/node"
    40  	"github.com/cilium/cilium/pkg/option"
    41  	"github.com/cilium/cilium/pkg/resiliency"
    42  	"github.com/cilium/cilium/pkg/time"
    43  )
    44  
    45  type IPSecDir string
    46  
    47  const (
    48  	IPSecDirIn      IPSecDir = "IPSEC_IN"
    49  	IPSecDirOut     IPSecDir = "IPSEC_OUT"
    50  	IPSecDirBoth    IPSecDir = "IPSEC_BOTH"
    51  	IPSecDirOutNode IPSecDir = "IPSEC_OUT_NODE"
    52  
    53  	// Constants used to decode the IPsec secret in both formats:
    54  	// 1. [spi] aead-algo aead-key icv-len
    55  	// 2. [spi] auth-algo auth-key enc-algo enc-key [IP]
    56  	offsetSPI      = 0
    57  	offsetAeadAlgo = 1
    58  	offsetAeadKey  = 2
    59  	offsetICV      = 3
    60  	offsetAuthAlgo = 1
    61  	offsetAuthKey  = 2
    62  	offsetEncAlgo  = 3
    63  	offsetEncKey   = 4
    64  	offsetIP       = 5
    65  	maxOffset      = offsetIP
    66  
    67  	defaultDropPriority      = 100
    68  	oldXFRMOutPolicyPriority = 50
    69  
    70  	// DefaultReqID is the default reqid used for all IPSec rules.
    71  	DefaultReqID = 1
    72  
    73  	// EncryptedOverlayReqID is the reqid used for encrypting overlay traffic.
    74  	EncryptedOverlayReqID = 2
    75  )
    76  
    77  type dir string
    78  
    79  const (
    80  	dirUnspec  dir = "unspecified"
    81  	dirIngress dir = "ingress"
    82  	dirEgress  dir = "egress"
    83  )
    84  
    85  type ipSecKey struct {
    86  	Spi   uint8
    87  	ESN   bool
    88  	ReqID int
    89  	Auth  *netlink.XfrmStateAlgo
    90  	Crypt *netlink.XfrmStateAlgo
    91  	Aead  *netlink.XfrmStateAlgo
    92  }
    93  
    94  type oldXfrmStateKey struct {
    95  	Spi int
    96  	Dst [16]byte
    97  }
    98  
    99  var (
   100  	ipSecLock lock.RWMutex
   101  
   102  	// ipSecKeysGlobal can be accessed by multiple subsystems concurrently,
   103  	// so it should be accessed only through the getIPSecKeys and
   104  	// LoadIPSecKeys functions, which will ensure the proper lock is held
   105  	ipSecKeysGlobal = make(map[string]*ipSecKey)
   106  
   107  	// ipSecCurrentKeySPI is the SPI of the IPSec currently in use
   108  	ipSecCurrentKeySPI uint8
   109  
   110  	// ipSecKeysRemovalTime is used to track at which time a given key is
   111  	// replaced with a newer one, allowing to reclaim old keys only after
   112  	// enough time has passed since their replacement
   113  	ipSecKeysRemovalTime = make(map[uint8]time.Time)
   114  
   115  	wildcardIPv4   = net.ParseIP("0.0.0.0")
   116  	wildcardCIDRv4 = &net.IPNet{
   117  		IP:   wildcardIPv4,
   118  		Mask: net.IPv4Mask(0, 0, 0, 0),
   119  	}
   120  	wildcardIPv6   = net.ParseIP("0::0")
   121  	wildcardCIDRv6 = &net.IPNet{
   122  		IP:   wildcardIPv6,
   123  		Mask: net.CIDRMask(0, 128),
   124  	}
   125  
   126  	defaultDropMark = &netlink.XfrmMark{
   127  		Value: linux_defaults.RouteMarkEncrypt,
   128  		Mask:  linux_defaults.IPsecMarkBitMask,
   129  	}
   130  	defaultDropPolicyIPv4 = &netlink.XfrmPolicy{
   131  		Dir:      netlink.XFRM_DIR_OUT,
   132  		Src:      wildcardCIDRv4,
   133  		Dst:      wildcardCIDRv4,
   134  		Mark:     defaultDropMark,
   135  		Action:   netlink.XFRM_POLICY_BLOCK,
   136  		Priority: defaultDropPriority,
   137  	}
   138  	defaultDropPolicyIPv6 = &netlink.XfrmPolicy{
   139  		Dir:      netlink.XFRM_DIR_OUT,
   140  		Src:      wildcardCIDRv6,
   141  		Dst:      wildcardCIDRv6,
   142  		Mark:     defaultDropMark,
   143  		Action:   netlink.XFRM_POLICY_BLOCK,
   144  		Priority: defaultDropPriority,
   145  	}
   146  
   147  	// To attempt to remove any stale XFRM configs once at startup, after
   148  	// we've added the catch-all default-drop policy.
   149  	removeStaleIPv4XFRMOnce sync.Once
   150  	removeStaleIPv6XFRMOnce sync.Once
   151  
   152  	oldXFRMInMark *netlink.XfrmMark = &netlink.XfrmMark{
   153  		Value: linux_defaults.RouteMarkDecrypt,
   154  		Mask:  linux_defaults.IPsecMarkBitMask,
   155  	}
   156  	// xfrmStateCache is a cache of XFRM states to avoid querying each time.
   157  	// This is especially important for backgroundSync that is used to validate
   158  	// if the XFRM state is correct, without usually modyfing anything.
   159  	// The cache is invalidated whenever a new XFRM state is added/updated/removed,
   160  	// but also in case of TTL expiration.
   161  	// It provides XfrmStateAdd/Update/Del wrappers that ensure cache
   162  	// is correctly invalidate.
   163  	xfrmStateCache = NewXfrmStateListCache(time.Minute)
   164  )
   165  
   166  func getGlobalIPsecKey(ip net.IP) *ipSecKey {
   167  	ipSecLock.RLock()
   168  	defer ipSecLock.RUnlock()
   169  
   170  	key, scoped := ipSecKeysGlobal[ip.String()]
   171  	if !scoped {
   172  		key = ipSecKeysGlobal[""]
   173  	}
   174  	return key
   175  }
   176  
   177  // computeNodeIPsecKey computes per-node-pair IPsec keys from the global,
   178  // pre-shared key. The per-node-pair keys are computed with a SHA256 hash of
   179  // the global key, source node IP, destination node IP appended together.
   180  func computeNodeIPsecKey(globalKey, srcNodeIP, dstNodeIP, srcBootID, dstBootID []byte) []byte {
   181  	inputLen := len(globalKey) + len(srcNodeIP) + len(dstNodeIP) + len(srcBootID) + len(dstBootID)
   182  	input := make([]byte, 0, inputLen)
   183  	input = append(input, globalKey...)
   184  	input = append(input, srcNodeIP...)
   185  	input = append(input, dstNodeIP...)
   186  	input = append(input, srcBootID[:36]...)
   187  	input = append(input, dstBootID[:36]...)
   188  
   189  	var hash []byte
   190  	if len(globalKey) <= 32 {
   191  		h := sha256.Sum256(input)
   192  		hash = h[:]
   193  	} else {
   194  		h := sha512.Sum512(input)
   195  		hash = h[:]
   196  	}
   197  	return hash[:len(globalKey)]
   198  }
   199  
   200  // canonicalIP returns a canonical IPv4 address (4 bytes)
   201  // in case we were dealing with a v4 mapped V6 address.
   202  func canonicalIP(ip net.IP) net.IP {
   203  	if v4 := ip.To4(); v4 != nil {
   204  		return v4
   205  	}
   206  	return ip
   207  }
   208  
   209  // deriveNodeIPsecKey builds a per-node-pair ipSecKey object from the global
   210  // ipSecKey object.
   211  func deriveNodeIPsecKey(globalKey *ipSecKey, srcNodeIP, dstNodeIP net.IP, srcBootID, dstBootID string) *ipSecKey {
   212  	nodeKey := &ipSecKey{
   213  		Spi:   globalKey.Spi,
   214  		ReqID: globalKey.ReqID,
   215  		ESN:   globalKey.ESN,
   216  	}
   217  
   218  	srcNodeIP = canonicalIP(srcNodeIP)
   219  	dstNodeIP = canonicalIP(dstNodeIP)
   220  
   221  	if globalKey.Aead != nil {
   222  		nodeKey.Aead = &netlink.XfrmStateAlgo{
   223  			Name:   globalKey.Aead.Name,
   224  			Key:    computeNodeIPsecKey(globalKey.Aead.Key, srcNodeIP, dstNodeIP, []byte(srcBootID), []byte(dstBootID)),
   225  			ICVLen: globalKey.Aead.ICVLen,
   226  		}
   227  	} else {
   228  		nodeKey.Auth = &netlink.XfrmStateAlgo{
   229  			Name: globalKey.Auth.Name,
   230  			Key:  computeNodeIPsecKey(globalKey.Auth.Key, srcNodeIP, dstNodeIP, []byte(srcBootID), []byte(dstBootID)),
   231  		}
   232  
   233  		nodeKey.Crypt = &netlink.XfrmStateAlgo{
   234  			Name: globalKey.Crypt.Name,
   235  			Key:  computeNodeIPsecKey(globalKey.Crypt.Key, srcNodeIP, dstNodeIP, []byte(srcBootID), []byte(dstBootID)),
   236  		}
   237  	}
   238  
   239  	return nodeKey
   240  }
   241  
   242  // We want one IPsec key per node pair. For a pair of nodes A and B with IP
   243  // addresses a and b, and boot ids x and y respectively, we will therefore
   244  // install two different keys:
   245  // Node A               <> Node B
   246  // XFRM IN:  key(b+a+y+x)      XFRM IN:  key(a+b+x+y)
   247  // XFRM OUT: key(a+b+x+y)      XFRM OUT: key(b+a+y+x)
   248  // This is done such that, for each pair of nodes A, B, the key used for
   249  // decryption on A (XFRM IN) is the same key used for encryption on B (XFRM
   250  // OUT), and vice versa. And its ESN automatically resets on each node reboot.
   251  func getNodeIPsecKey(localNodeIP, remoteNodeIP net.IP, localBootID, remoteBootID string, dir netlink.Dir) *ipSecKey {
   252  	globalKey := getGlobalIPsecKey(localNodeIP)
   253  	if globalKey == nil {
   254  		return nil
   255  	}
   256  	if !globalKey.ESN {
   257  		return globalKey
   258  	}
   259  
   260  	if dir == netlink.XFRM_DIR_OUT {
   261  		return deriveNodeIPsecKey(globalKey, localNodeIP, remoteNodeIP, localBootID, remoteBootID)
   262  	}
   263  	return deriveNodeIPsecKey(globalKey, remoteNodeIP, localNodeIP, remoteBootID, localBootID)
   264  }
   265  
   266  func ipSecNewState(keys *ipSecKey) *netlink.XfrmState {
   267  	state := netlink.XfrmState{
   268  		Mode:  netlink.XFRM_MODE_TUNNEL,
   269  		Proto: netlink.XFRM_PROTO_ESP,
   270  		ESN:   keys.ESN,
   271  		Spi:   int(keys.Spi),
   272  		Reqid: keys.ReqID,
   273  	}
   274  	if keys.ESN {
   275  		state.ReplayWindow = 1024
   276  	}
   277  	if keys.Aead != nil {
   278  		state.Aead = keys.Aead
   279  	} else {
   280  		state.Crypt = keys.Crypt
   281  		state.Auth = keys.Auth
   282  	}
   283  	return &state
   284  }
   285  
   286  func ipSecNewPolicy() *netlink.XfrmPolicy {
   287  	policy := netlink.XfrmPolicy{}
   288  	return &policy
   289  }
   290  
   291  func ipSecAttachPolicyTempl(policy *netlink.XfrmPolicy, keys *ipSecKey, srcIP, dstIP net.IP, spi bool, optional int) {
   292  	tmpl := netlink.XfrmPolicyTmpl{
   293  		Proto:    netlink.XFRM_PROTO_ESP,
   294  		Mode:     netlink.XFRM_MODE_TUNNEL,
   295  		Reqid:    keys.ReqID,
   296  		Dst:      dstIP,
   297  		Src:      srcIP,
   298  		Optional: optional,
   299  	}
   300  
   301  	if spi {
   302  		tmpl.Spi = int(keys.Spi)
   303  	}
   304  
   305  	policy.Tmpls = append(policy.Tmpls, tmpl)
   306  }
   307  
   308  // xfrmStateReplace attempts to add a new XFRM state only if one doesn't
   309  // already exist. If it doesn't but some other XFRM state conflicts, then
   310  // we attempt to remove the conflicting state before trying to add again.
   311  func xfrmStateReplace(log *slog.Logger, new *netlink.XfrmState, remoteRebooted bool) error {
   312  	states, err := xfrmStateCache.XfrmStateList()
   313  	if err != nil {
   314  		return fmt.Errorf("Cannot get XFRM state: %w", err)
   315  	}
   316  
   317  	scopedLog := log.With(
   318  		logfields.SPI, new.Spi,
   319  		logfields.SourceIP, new.Src,
   320  		logfields.DestinationIP, new.Dst,
   321  		logfields.TrafficDirection, getDirFromXfrmMark(new.Mark),
   322  		logfields.NodeID, getNodeIDAsHexFromXfrmMark(new.Mark),
   323  	)
   324  
   325  	// Check if the XFRM state already exists
   326  	for _, s := range states {
   327  		if xfrmIPEqual(s.Src, new.Src) && xfrmIPEqual(s.Dst, new.Dst) &&
   328  			xfrmMarkEqual(s.Mark, new.Mark) && s.Spi == new.Spi {
   329  			if !xfrmKeyEqual(&s, new) {
   330  				// The states are the same, including the SPI, but the
   331  				// encryption key changed. This is expected on upgrade because
   332  				// we changed the way we compute the per-node-pair key.
   333  				scopedLog.Info("Removing XFRM state with old IPsec key")
   334  				xfrmStateCache.XfrmStateDel(&s)
   335  				break
   336  			}
   337  			if !xfrmMarkEqual(s.OutputMark, new.OutputMark) {
   338  				// If only the output-marks differ, then we should be able
   339  				// to simply update the XFRM state atomically.
   340  				return xfrmStateCache.XfrmStateUpdate(new)
   341  			}
   342  			if remoteRebooted && new.ESN {
   343  				// This should happen only when a node reboots when the boot ID
   344  				// is used to compute the key (i.e., if ESN is also enabled).
   345  				// We can safely perform a non-atomic swap of the XFRM state
   346  				// for both the IN and OUT directions because:
   347  				// - For the IN direction, we can't leak anything. At most
   348  				//   we'll drop a few encrypted packets while updating.
   349  				// - For the OUT direction, we also can't leak anything due to
   350  				//   having an existing XFRM policy which will match and drop
   351  				//   packets if the state is missing. At most we will drop a
   352  				//   few encrypted packets while updating.
   353  				scopedLog.Info("Non-atomically updating IPsec XFRM state due to remote boot ID change")
   354  				xfrmStateCache.XfrmStateDel(&s)
   355  				break
   356  			}
   357  			return nil
   358  		}
   359  	}
   360  
   361  	var (
   362  		oldXFRMOutMark = &netlink.XfrmMark{
   363  			Value: ipSecXfrmMarkSetSPI(linux_defaults.RouteMarkEncrypt, uint8(new.Spi)),
   364  			Mask:  linux_defaults.IPsecOldMarkMaskOut,
   365  		}
   366  		errs = resiliency.NewErrorSet("failed to delete old xfrm states", len(states))
   367  	)
   368  	for _, s := range states {
   369  		// This is either the XFRM OUT state or the XFRM IN state from a
   370  		// previous Cilium version. Because their marks match the new mark
   371  		// (e.g., 0xXXXX3e00/0xffffff00 ∈ 0x3e00/0xff00), the kernel considers
   372  		// the two states conflict and we won't be able to add the new ones
   373  		// until the old one is removed.
   374  		//
   375  		// Thus, we temporarily remove the old, conflicting XFRM state and
   376  		// re-add it in a defer. In between the removal of the old state and
   377  		// the addition of the new, we can have a packet drops due to the
   378  		// missing state. These drops should be limited to the specific node
   379  		// pair we are handling here and the window during which they can
   380  		// happen should be really small. This is also specific to the upgrade
   381  		// and can be removed in v1.16.
   382  		if s.Spi == new.Spi && xfrmIPEqual(s.Dst, new.Dst) {
   383  			var dir string
   384  			// The old XFRM IN state matches on 0.0.0.0 so it conflicts even
   385  			// though the source IP addresses of old and new are different.
   386  			// Thus, we don't need to compare source IP addresses for the IN
   387  			// states.
   388  			if xfrmIPEqual(s.Src, new.Src) && xfrmMarkEqual(s.Mark, oldXFRMOutMark) {
   389  				dir = "OUT"
   390  			} else if xfrmMarkEqual(s.Mark, oldXFRMInMark) {
   391  				dir = "IN"
   392  			} else {
   393  				continue
   394  			}
   395  
   396  			err, deferFn := xfrmTemporarilyRemoveState(scopedLog, s, dir)
   397  			if err != nil {
   398  				errs.Add(fmt.Errorf("Failed to remove old XFRM %s state %s: %w", dir, s.String(), err))
   399  			} else {
   400  				defer deferFn()
   401  			}
   402  		}
   403  	}
   404  	if err := errs.Error(); err != nil {
   405  		scopedLog.Error("Failed to clean up old XFRM state", logfields.Error, err)
   406  		return err
   407  	}
   408  
   409  	// It doesn't exist so let's attempt to add it.
   410  	firstAttemptErr := xfrmStateCache.XfrmStateAdd(new)
   411  	if !os.IsExist(firstAttemptErr) {
   412  		return firstAttemptErr
   413  	}
   414  	scopedLog.Error("Failed to add XFRM state due to conflicting state")
   415  
   416  	// An existing state conflicts with this one. We need to remove the
   417  	// existing one first.
   418  	deletedSomething, err := xfrmDeleteConflictingState(log, states, new)
   419  	if err != nil {
   420  		return err
   421  	}
   422  
   423  	// If no conflicting state was found and deleted, there's no point in
   424  	// attempting to add again.
   425  	if !deletedSomething {
   426  		return firstAttemptErr
   427  	}
   428  	return xfrmStateCache.XfrmStateAdd(new)
   429  }
   430  
   431  // Temporarily remove an XFRM state to allow the addition of another,
   432  // conflicting XFRM state. This function removes the conflicting state and
   433  // prepares a defer callback to re-add it with proper logging.
   434  func xfrmTemporarilyRemoveState(scopedLog *slog.Logger, state netlink.XfrmState, dir string) (error, func()) {
   435  	stats, err := procfs.NewXfrmStat()
   436  	errorCnt := 0
   437  	if err != nil {
   438  		scopedLog.Error("Error while getting XFRM stats before state removal", logfields.Error, err)
   439  	} else {
   440  		if dir == "IN" {
   441  			errorCnt = stats.XfrmInNoStates
   442  		} else {
   443  			errorCnt = stats.XfrmOutNoStates
   444  		}
   445  	}
   446  
   447  	start := time.Now()
   448  	if err := xfrmStateCache.XfrmStateDel(&state); err != nil {
   449  		return err, nil
   450  	}
   451  	return nil, func() {
   452  		if err := xfrmStateCache.XfrmStateAdd(&state); err != nil {
   453  			scopedLog.Error("Failed to re-add old XFRM state",
   454  				"directory", dir, logfields.Error, err)
   455  		}
   456  		elapsed := time.Since(start)
   457  
   458  		stats, err := procfs.NewXfrmStat()
   459  		if err != nil {
   460  			scopedLog.Error("Error while getting XFRM stats after state removal", logfields.Error, err)
   461  			errorCnt = 0
   462  		} else {
   463  			if dir == "IN" {
   464  				errorCnt = stats.XfrmInNoStates - errorCnt
   465  			} else {
   466  				errorCnt = stats.XfrmOutNoStates - errorCnt
   467  			}
   468  		}
   469  		scopedLog.Info("Temporarily removed old XFRM state",
   470  			"directory", dir, "packetsDropped", errorCnt, logfields.Duration, elapsed)
   471  	}
   472  }
   473  
   474  // Attempt to remove any XFRM state that conflicts with the state we just tried
   475  // to add. To find those conflicting states, we need to use the same logic that
   476  // the kernel used to reject our check with EEXIST. That logic is upstream in
   477  // __xfrm_state_lookup.
   478  func xfrmDeleteConflictingState(log *slog.Logger, states []netlink.XfrmState, new *netlink.XfrmState) (bool, error) {
   479  	var (
   480  		deletedSomething bool
   481  		errs             = resiliency.NewErrorSet("failed to delete conflicting XFRM states", len(states))
   482  	)
   483  	for _, s := range states {
   484  		if new.Spi == s.Spi && (new.Mark == nil) == (s.Mark == nil) &&
   485  			(new.Mark == nil || new.Mark.Value&new.Mark.Mask&s.Mark.Mask == s.Mark.Value) &&
   486  			xfrmIPEqual(new.Src, s.Src) && xfrmIPEqual(new.Dst, s.Dst) {
   487  			if err := xfrmStateCache.XfrmStateDel(&s); err != nil {
   488  				errs.Add(err)
   489  				continue
   490  			}
   491  			deletedSomething = true
   492  			log.Info("Removed a conflicting XFRM state",
   493  				logfields.SPI, s.Spi,
   494  				logfields.SourceIP, s.Src,
   495  				logfields.DestinationIP, s.Dst,
   496  				logfields.TrafficDirection, getDirFromXfrmMark(s.Mark),
   497  				logfields.NodeID, getNodeIDAsHexFromXfrmMark(s.Mark),
   498  			)
   499  		}
   500  	}
   501  	return deletedSomething, errs.Error()
   502  }
   503  
   504  // This function compares two IP addresses and returns true if they are equal.
   505  // This is unfortunately necessary because our netlink library returns nil IPv6
   506  // addresses as nil IPv4 addresses and net.IP.Equal rightfully considers those
   507  // are different.
   508  func xfrmIPEqual(ip1, ip2 net.IP) bool {
   509  	if ip1.IsUnspecified() && ip2.IsUnspecified() {
   510  		return true
   511  	}
   512  	return ip1.Equal(ip2)
   513  }
   514  
   515  // Returns true if two XFRM marks are identical. They should be either both nil
   516  // or have the same mark value and mask.
   517  func xfrmMarkEqual(mark1, mark2 *netlink.XfrmMark) bool {
   518  	if (mark1 == nil) != (mark2 == nil) {
   519  		return false
   520  	}
   521  	return mark1 == nil || (mark1.Value == mark2.Value && mark1.Mask == mark2.Mask)
   522  }
   523  
   524  // Returns true if the two XFRM states have the same encryption key.
   525  func xfrmKeyEqual(s1, s2 *netlink.XfrmState) bool {
   526  	if (s1.Aead == nil) != (s2.Aead == nil) ||
   527  		(s1.Crypt == nil) != (s2.Crypt == nil) ||
   528  		(s1.Auth == nil) != (s2.Auth == nil) {
   529  		return false
   530  	}
   531  	if s1.Aead != nil {
   532  		return bytes.Equal(s1.Aead.Key, s2.Aead.Key)
   533  	}
   534  	return bytes.Equal(s1.Crypt.Key, s2.Crypt.Key) &&
   535  		bytes.Equal(s1.Auth.Key, s2.Auth.Key)
   536  }
   537  
   538  func ipSecReplaceStateIn(log *slog.Logger, localIP, remoteIP net.IP, nodeID uint16, zeroMark bool, localBootID, remoteBootID string, remoteRebooted bool, reqID int) (uint8, error) {
   539  	key := getNodeIPsecKey(localIP, remoteIP, localBootID, remoteBootID, netlink.XFRM_DIR_IN)
   540  	if key == nil {
   541  		return 0, fmt.Errorf("IPSec key missing")
   542  	}
   543  	key.ReqID = reqID
   544  	state := ipSecNewState(key)
   545  	state.Src = remoteIP
   546  	state.Dst = localIP
   547  	state.Mark = generateDecryptMark(linux_defaults.RouteMarkDecrypt, nodeID)
   548  	if zeroMark {
   549  		state.OutputMark = &netlink.XfrmMark{
   550  			Value: 0,
   551  			Mask:  linux_defaults.OutputMarkMask,
   552  		}
   553  	} else if reqID == EncryptedOverlayReqID {
   554  		state.OutputMark = &netlink.XfrmMark{
   555  			Value: linux_defaults.RouteMarkDecryptedOverlay,
   556  			Mask:  linux_defaults.OutputMarkMask,
   557  		}
   558  	} else {
   559  		state.OutputMark = &netlink.XfrmMark{
   560  			Value: linux_defaults.RouteMarkDecrypt,
   561  			Mask:  linux_defaults.OutputMarkMask,
   562  		}
   563  	}
   564  	// We want to clear the node ID regardless of zeroMark parameter. That
   565  	// value is never needed after decryption.
   566  	state.OutputMark.Mask |= linux_defaults.IPsecMarkMaskNodeID
   567  
   568  	return key.Spi, xfrmStateReplace(log, state, remoteRebooted)
   569  }
   570  
   571  func ipSecReplaceStateOut(log *slog.Logger, localIP, remoteIP net.IP, nodeID uint16, localBootID, remoteBootID string, remoteRebooted bool, reqID int) (uint8, error) {
   572  	key := getNodeIPsecKey(localIP, remoteIP, localBootID, remoteBootID, netlink.XFRM_DIR_OUT)
   573  	if key == nil {
   574  		return 0, fmt.Errorf("IPSec key missing")
   575  	}
   576  	key.ReqID = reqID
   577  	state := ipSecNewState(key)
   578  	state.Src = localIP
   579  	state.Dst = remoteIP
   580  	state.Mark = generateEncryptMark(key.Spi, nodeID)
   581  	state.OutputMark = &netlink.XfrmMark{
   582  		Value: linux_defaults.RouteMarkEncrypt,
   583  		Mask:  linux_defaults.OutputMarkMask,
   584  	}
   585  	return key.Spi, xfrmStateReplace(log, state, remoteRebooted)
   586  }
   587  
   588  func _ipSecReplacePolicyInFwd(src, dst *net.IPNet, tmplSrc, tmplDst net.IP, proxyMark bool, dir netlink.Dir, reqID int) error {
   589  	optional := int(0)
   590  	// We can use the global IPsec key here because we are not going to
   591  	// actually use the secret itself.
   592  	key := getGlobalIPsecKey(dst.IP)
   593  	if key == nil {
   594  		return fmt.Errorf("IPSec key missing")
   595  	}
   596  	key.ReqID = reqID
   597  
   598  	wildcardIP := wildcardIPv4
   599  	wildcardCIDR := wildcardCIDRv4
   600  	if tmplDst.To4() == nil {
   601  		wildcardIP = wildcardIPv6
   602  		wildcardCIDR = wildcardCIDRv6
   603  	}
   604  
   605  	policy := ipSecNewPolicy()
   606  	policy.Dir = dir
   607  	if dir == netlink.XFRM_DIR_IN {
   608  		policy.Src = src
   609  		policy.Dst = dst
   610  		policy.Mark = &netlink.XfrmMark{
   611  			Mask: linux_defaults.IPsecMarkBitMask,
   612  		}
   613  		if proxyMark {
   614  			// We require a policy to match on packets going to the proxy which are
   615  			// therefore carrying the proxy mark. We however don't need a policy
   616  			// for the encrypted packets because there is already a state matching
   617  			// them.
   618  			policy.Mark.Value = linux_defaults.RouteMarkToProxy
   619  			// We must mark the IN policy for the proxy optional simply because it
   620  			// is lacking a corresponding state.
   621  			optional = 1
   622  			// We set the source tmpl address to 0/0 to explicit that it
   623  			// doesn't matter.
   624  			tmplSrc = wildcardIP
   625  		} else {
   626  			policy.Mark.Value = linux_defaults.RouteMarkDecrypt
   627  		}
   628  	}
   629  	// We always make forward rules optional. The only reason we have these
   630  	// at all is to appease the XFRM route hooks, we don't really care about
   631  	// policy because Cilium BPF programs do that.
   632  	if dir == netlink.XFRM_DIR_FWD {
   633  		optional = 1
   634  		policy.Priority = linux_defaults.IPsecFwdPriority
   635  		// In case of fwd policies, we should tell the kernel the tmpl src
   636  		// doesn't matter; we want all fwd packets to go through.
   637  		policy.Src = wildcardCIDR
   638  		policy.Dst = wildcardCIDR
   639  	}
   640  	ipSecAttachPolicyTempl(policy, key, tmplSrc, tmplDst, false, optional)
   641  	return netlink.XfrmPolicyUpdate(policy)
   642  }
   643  
   644  func ipSecReplacePolicyIn(src, dst *net.IPNet, tmplSrc, tmplDst net.IP, reqID int) error {
   645  	if err := _ipSecReplacePolicyInFwd(src, dst, tmplSrc, tmplDst, true, netlink.XFRM_DIR_IN, reqID); err != nil {
   646  		return err
   647  	}
   648  	return _ipSecReplacePolicyInFwd(src, dst, tmplSrc, tmplDst, false, netlink.XFRM_DIR_IN, reqID)
   649  }
   650  
   651  func IpSecReplacePolicyFwd(dst *net.IPNet, tmplDst net.IP, reqID int) error {
   652  	// The source CIDR and IP aren't used in the case of FWD policies.
   653  	return _ipSecReplacePolicyInFwd(nil, dst, net.IP{}, tmplDst, false, netlink.XFRM_DIR_FWD, reqID)
   654  }
   655  
   656  // Installs a catch-all policy for outgoing traffic that has the encryption
   657  // bit. The goal here is to catch any traffic that may passthrough our
   658  // encryption while we are replacing XFRM policies & states. Those operations
   659  // cannot always be performed atomically so we may have brief moments where
   660  // there is no XFRM policy to encrypt a subset of traffic. This policy ensures
   661  // we drop such traffic and don't let it flow in plain text.
   662  //
   663  // We do need to match on the mark because there is also traffic flowing
   664  // through XFRM that we don't want to encrypt (e.g., hostns traffic).
   665  func IPsecDefaultDropPolicy(log *slog.Logger, ipv6 bool) error {
   666  	log = log.With(logfields.LogSubsys, subsystem)
   667  
   668  	defaultDropPolicy := defaultDropPolicyIPv4
   669  	family := netlink.FAMILY_V4
   670  	if ipv6 {
   671  		defaultDropPolicy = defaultDropPolicyIPv6
   672  		family = netlink.FAMILY_V6
   673  	}
   674  
   675  	err := netlink.XfrmPolicyUpdate(defaultDropPolicy)
   676  
   677  	// We move the existing XFRM OUT policy to a lower priority to allow the
   678  	// new priorities to take precedence.
   679  	// This code can be removed in Cilium v1.15 to instead remove the old XFRM
   680  	// OUT policy and state.
   681  	removeStaleXFRMOnce := &removeStaleIPv4XFRMOnce
   682  	if ipv6 {
   683  		removeStaleXFRMOnce = &removeStaleIPv6XFRMOnce
   684  	}
   685  	removeStaleXFRMOnce.Do(func() {
   686  		deprioritizeOldOutPolicy(log, family)
   687  	})
   688  
   689  	return err
   690  }
   691  
   692  // Lowers the priority of the old XFRM OUT policy. We rely on the mark mask to
   693  // identify it. By lowering the priority, we will allow the new XFRM OUT
   694  // policies to take precedence. We cannot simply remove and replace the old
   695  // XFRM OUT configs because that would cause traffic interruptions on upgrades.
   696  func deprioritizeOldOutPolicy(log *slog.Logger, family int) {
   697  	policies, err := netlink.XfrmPolicyList(family)
   698  	if err != nil {
   699  		log.Error("Cannot get XFRM policies", logfields.Error, err)
   700  	}
   701  	for _, p := range policies {
   702  		if p.Dir == netlink.XFRM_DIR_OUT && p.Mark.Mask == linux_defaults.IPsecOldMarkMaskOut {
   703  			p.Priority = oldXFRMOutPolicyPriority
   704  			if err := netlink.XfrmPolicyUpdate(&p); err != nil {
   705  				log.Error("Failed to deprioritize old XFRM policy",
   706  					logfields.Error, err,
   707  					logfields.SourceCIDR, p.Src,
   708  					logfields.DestinationCIDR, p.Dst,
   709  					logfields.TrafficDirection, getDirFromXfrmMark(p.Mark),
   710  					logfields.NodeID, getNodeIDAsHexFromXfrmMark(p.Mark),
   711  				)
   712  			}
   713  		}
   714  	}
   715  }
   716  
   717  // ipSecXfrmMarkSetSPI takes a XfrmMark base value, an SPI, returns the mark
   718  // value with the SPI value encoded in it
   719  func ipSecXfrmMarkSetSPI(markValue uint32, spi uint8) uint32 {
   720  	return markValue | (uint32(spi) << linux_defaults.IPsecXFRMMarkSPIShift)
   721  }
   722  
   723  func getNodeIDAsHexFromXfrmMark(mark *netlink.XfrmMark) string {
   724  	return fmt.Sprintf("0x%x", ipsec.GetNodeIDFromXfrmMark(mark))
   725  }
   726  
   727  func getDirFromXfrmMark(mark *netlink.XfrmMark) dir {
   728  	switch {
   729  	case mark == nil:
   730  		return dirUnspec
   731  	case mark.Value&linux_defaults.RouteMarkDecrypt != 0:
   732  		return dirIngress
   733  	case mark.Value&linux_defaults.RouteMarkEncrypt != 0:
   734  		return dirEgress
   735  	}
   736  	return dirUnspec
   737  }
   738  
   739  func generateEncryptMark(spi uint8, nodeID uint16) *netlink.XfrmMark {
   740  	val := ipSecXfrmMarkSetSPI(linux_defaults.RouteMarkEncrypt, spi)
   741  	val |= uint32(nodeID) << 16
   742  	return &netlink.XfrmMark{
   743  		Value: val,
   744  		Mask:  linux_defaults.IPsecMarkMaskOut,
   745  	}
   746  }
   747  
   748  func generateDecryptMark(decryptBit uint32, nodeID uint16) *netlink.XfrmMark {
   749  	val := decryptBit | (uint32(nodeID) << 16)
   750  	return &netlink.XfrmMark{
   751  		Value: val,
   752  		Mask:  linux_defaults.IPsecMarkMaskIn,
   753  	}
   754  }
   755  
   756  func ipSecReplacePolicyOut(src, dst *net.IPNet, tmplSrc, tmplDst net.IP, nodeID uint16, dir IPSecDir, reqID int) error {
   757  	// TODO: Remove old policy pointing to target net
   758  
   759  	// We can use the global IPsec key here because we are not going to
   760  	// actually use the secret itself.
   761  	key := getGlobalIPsecKey(dst.IP)
   762  	if key == nil {
   763  		return fmt.Errorf("IPSec key missing")
   764  	}
   765  	key.ReqID = reqID
   766  
   767  	policy := ipSecNewPolicy()
   768  	if dir == IPSecDirOutNode {
   769  		policy.Src = wildcardCIDRv4
   770  	} else {
   771  		policy.Src = src
   772  	}
   773  	policy.Dst = dst
   774  	policy.Dir = netlink.XFRM_DIR_OUT
   775  	policy.Mark = generateEncryptMark(key.Spi, nodeID)
   776  	ipSecAttachPolicyTempl(policy, key, tmplSrc, tmplDst, true, 0)
   777  	return netlink.XfrmPolicyUpdate(policy)
   778  }
   779  
   780  // Returns true if the given mark matches on the node ID. This works because
   781  // the node ID match is always in the first 16 bits.
   782  func matchesOnNodeID(mark *netlink.XfrmMark) bool {
   783  	return mark != nil &&
   784  		mark.Mask&linux_defaults.IPsecMarkMaskNodeID == linux_defaults.IPsecMarkMaskNodeID
   785  }
   786  
   787  func ipsecDeleteXfrmState(log *slog.Logger, nodeID uint16) error {
   788  	scopedLog := log.With(
   789  		logfields.NodeID, nodeID,
   790  	)
   791  
   792  	xfrmStateList, err := xfrmStateCache.XfrmStateList()
   793  	if err != nil {
   794  		scopedLog.Warn("Failed to list XFRM states for deletion", logfields.Error, err)
   795  		return err
   796  	}
   797  
   798  	xfrmStatesToDelete := []netlink.XfrmState{}
   799  	oldXfrmInStates := map[oldXfrmStateKey]netlink.XfrmState{}
   800  	for _, s := range xfrmStateList {
   801  		if matchesOnNodeID(s.Mark) && ipsec.GetNodeIDFromXfrmMark(s.Mark) == nodeID {
   802  			xfrmStatesToDelete = append(xfrmStatesToDelete, s)
   803  		}
   804  		if xfrmMarkEqual(s.Mark, oldXFRMInMark) {
   805  			key := oldXfrmStateKey{
   806  				Spi: s.Spi,
   807  				Dst: [16]byte(s.Dst.To16()),
   808  			}
   809  			oldXfrmInStates[key] = s
   810  		}
   811  	}
   812  
   813  	errs := resiliency.NewErrorSet(fmt.Sprintf("failed to delete node (%d) xfrm states", nodeID), len(xfrmStateList))
   814  	for _, s := range xfrmStatesToDelete {
   815  		key := oldXfrmStateKey{
   816  			Spi: s.Spi,
   817  			Dst: [16]byte(s.Dst.To16()),
   818  		}
   819  		var oldXfrmInState *netlink.XfrmState = nil
   820  		old, ok := oldXfrmInStates[key]
   821  		if ok {
   822  			oldXfrmInState = &old
   823  		}
   824  		if err := safeDeleteXfrmState(log, &s, oldXfrmInState); err != nil {
   825  			errs.Add(fmt.Errorf("failed to delete xfrm state (%s): %w", s.String(), err))
   826  		}
   827  	}
   828  
   829  	return errs.Error()
   830  }
   831  
   832  // safeDeleteXfrmState deletes the given XFRM state. Specifically, if the
   833  // state is to catch ingress traffic marked with nodeID (0xXXXX0d00), we
   834  // temporarily remove the old XFRM state that matches 0xd00/0xf00. This is to
   835  // workaround a kernel issue that prevents us from deleting a specific XFRM
   836  // state (e.g. catching 0xXXXX0d00/0xffff0f00) when there is also a general
   837  // xfrm state (e.g. catching 0xd00/0xf00). When both XFRM states coexist,
   838  // kernel deletes the general XFRM state instead of the specific one, even if
   839  // the deleting request is for the specific one.
   840  func safeDeleteXfrmState(log *slog.Logger, state *netlink.XfrmState, oldState *netlink.XfrmState) (err error) {
   841  	if getDirFromXfrmMark(state.Mark) == dirIngress && ipsec.GetNodeIDFromXfrmMark(state.Mark) != 0 && oldState != nil {
   842  
   843  		errs := resiliency.NewErrorSet("failed to delete old xfrm states", 1)
   844  
   845  		scopedLog := log.With(
   846  			logfields.SPI, state.Spi,
   847  			logfields.SourceIP, state.Src,
   848  			logfields.DestinationIP, state.Dst,
   849  			logfields.TrafficDirection, getDirFromXfrmMark(state.Mark),
   850  			logfields.NodeID, getNodeIDAsHexFromXfrmMark(state.Mark),
   851  		)
   852  
   853  		err, deferFn := xfrmTemporarilyRemoveState(scopedLog, *oldState, string(dirIngress))
   854  		if err != nil {
   855  			errs.Add(fmt.Errorf("Failed to remove old XFRM %s state %s: %w", string(dirIngress), oldState.String(), err))
   856  		} else {
   857  			defer deferFn()
   858  		}
   859  		if err := errs.Error(); err != nil {
   860  			scopedLog.Error("Failed to clean up old XFRM state", logfields.Error, err)
   861  			return err
   862  		}
   863  	}
   864  
   865  	return xfrmStateCache.XfrmStateDel(state)
   866  }
   867  
   868  func ipsecDeleteXfrmPolicy(log *slog.Logger, nodeID uint16) error {
   869  	scopedLog := log.With(
   870  		logfields.NodeID, nodeID,
   871  	)
   872  
   873  	xfrmPolicyList, err := netlink.XfrmPolicyList(netlink.FAMILY_ALL)
   874  	if err != nil {
   875  		scopedLog.Warn("Failed to list XFRM policies for deletion", logfields.Error, err)
   876  		return fmt.Errorf("failed to list xfrm policies: %w", err)
   877  	}
   878  	errs := resiliency.NewErrorSet("failed to delete xfrm policies", len(xfrmPolicyList))
   879  	for _, p := range xfrmPolicyList {
   880  		if matchesOnNodeID(p.Mark) && ipsec.GetNodeIDFromXfrmMark(p.Mark) == nodeID {
   881  			if err := netlink.XfrmPolicyDel(&p); err != nil {
   882  				errs.Add(fmt.Errorf("unable to delete xfrm policy %s: %w", p.String(), err))
   883  			}
   884  		}
   885  	}
   886  	if err := errs.Error(); err != nil {
   887  		scopedLog.Warn("Failed to delete XFRM policy", logfields.Error, err)
   888  		return err
   889  	}
   890  
   891  	return nil
   892  }
   893  
   894  /* UpsertIPsecEndpoint updates the IPSec context for a new endpoint inserted in
   895   * the ipcache. Currently we support a global crypt/auth keyset that will encrypt
   896   * all traffic between endpoints. An IPSec context consists of two pieces a policy
   897   * and a state, the security policy database (SPD) and security association
   898   * database (SAD). These are implemented using the Linux kernels XFRM implementation.
   899   *
   900   * For all traffic that matches a policy, the policy tuple used is
   901   * (sip/mask, dip/mask, dev) with an optional mark field used in the Cilium implementation
   902   * to ensure only expected traffic is encrypted. The state hashtable is searched for
   903   * a matching state associated with that flow. The Linux kernel will do a series of
   904   * hash lookups to find the most specific state (xfrm_dst) possible. The hash keys searched are
   905   * the following, (daddr, saddr, reqid, encap_family), (daddr, wildcard, reqid, encap),
   906   * (mark, daddr, spi, proto, encap). Any "hits" in the hash table will subsequently
   907   * have the SPI checked to ensure it also matches. Encap is ignored in our case here
   908   * and can be used with UDP encap if wanted.
   909   *
   910   * The implications of the (inflexible!) hash key implementation is that in-order
   911   * to have a policy/state match we _must_ insert a state for each daddr. For Cilium
   912   * this translates to a state entry per node. We learn the nodes/endpoints by
   913   * listening to ipcache events. Finally, because IPSec is unidirectional a state
   914   * is needed for both ingress and egress. Denoted by the DIR on the xfrm cmd line
   915   * in the policy lookup. In the Cilium case, where we have IPSec between all
   916   * endpoints this results in two policy rules per node, one for ingress
   917   * and one for egress.
   918   *
   919   * For a concrete example consider two cluster nodes using transparent mode e.g.
   920   * without an IPSec tunnel IP. Cluster Node A has host_ip 10.156.0.1 with an
   921   * endpoint assigned to IP 10.156.2.2 and cluster Node B has host_ip 10.182.0.1
   922   * with an endpoint using IP 10.182.3.3. Then on Node A there will be a two policy
   923   * entries and a set of State entries,
   924   *
   925   * Policy1(src=10.182.0.0/16,dst=10.156.0.1/16,dir=in,tmpl(spi=#spi,reqid=#reqid))
   926   * Policy2(src=10.156.0.0/16,dst=10.182.0.1/16,dir=out,tmpl(spi=#spi,reqid=#reqid))
   927   * State1(src=*,dst=10.182.0.1,spi=#spi,reqid=#reqid,...)
   928   * State2(src=*,dst=10.156.0.1,spi=#spi,reqid=#reqid,...)
   929   *
   930   * Design Note: For newer kernels a BPF xfrm interface would greatly simplify the
   931   * state space. Basic idea would be to reference a state using any key generated
   932   * from BPF program allowing for a single state per security ctx.
   933   */
   934  func UpsertIPsecEndpoint(log *slog.Logger, local, remote *net.IPNet, outerLocal, outerRemote net.IP, remoteNodeID uint16, remoteBootID string, dir IPSecDir, outputMark, remoteRebooted bool, reqID int) (uint8, error) {
   935  	log = log.With(logfields.LogSubsys, subsystem)
   936  
   937  	var spi uint8
   938  	var err error
   939  
   940  	/* TODO: state reference ID is (dip,spi) which can be duplicated in the current global
   941  	 * mode. The duplication is on _all_ ingress states because dst_ip == host_ip in this
   942  	 * case and only a single spi entry is in use. Currently no check is done to avoid
   943  	 * attempting to add duplicate (dip,spi) states and we get 'file exist' error. These
   944  	 * errors are expected at the moment but perhaps it would be better to avoid calling
   945  	 * netlink API at all when we "know" an entry is a duplicate. To do this the xfer
   946  	 * state would need to be cached in the ipcache.
   947  	 */
   948  	if !outerLocal.Equal(outerRemote) {
   949  		localBootID := node.GetBootID()
   950  		if dir == IPSecDirIn || dir == IPSecDirBoth {
   951  			if spi, err = ipSecReplaceStateIn(log, outerLocal, outerRemote, remoteNodeID, outputMark, localBootID, remoteBootID, remoteRebooted, reqID); err != nil {
   952  				return 0, fmt.Errorf("unable to replace local state: %w", err)
   953  			}
   954  			if err = ipSecReplacePolicyIn(remote, local, outerRemote, outerLocal, reqID); err != nil {
   955  				if !os.IsExist(err) {
   956  					return 0, fmt.Errorf("unable to replace policy in: %w", err)
   957  				}
   958  			}
   959  			if err = IpSecReplacePolicyFwd(local, outerLocal, reqID); err != nil {
   960  				if !os.IsExist(err) {
   961  					return 0, fmt.Errorf("unable to replace policy fwd: %w", err)
   962  				}
   963  			}
   964  		}
   965  
   966  		if dir == IPSecDirOut || dir == IPSecDirOutNode || dir == IPSecDirBoth {
   967  			if spi, err = ipSecReplaceStateOut(log, outerLocal, outerRemote, remoteNodeID, localBootID, remoteBootID, remoteRebooted, reqID); err != nil {
   968  				return 0, fmt.Errorf("unable to replace remote state: %w", err)
   969  			}
   970  
   971  			if err = ipSecReplacePolicyOut(local, remote, outerLocal, outerRemote, remoteNodeID, dir, reqID); err != nil {
   972  				if !os.IsExist(err) {
   973  					return 0, fmt.Errorf("unable to replace policy out: %w", err)
   974  				}
   975  			}
   976  		}
   977  	}
   978  	return spi, nil
   979  }
   980  
   981  // UpsertIPsecEndpointPolicy adds a policy to the xfrm rules. Used to add a policy when the state
   982  // rule is already available.
   983  func UpsertIPsecEndpointPolicy(local, remote *net.IPNet, localTmpl, remoteTmpl net.IP, remoteNodeID uint16, dir IPSecDir, reqID int) error {
   984  	if err := ipSecReplacePolicyOut(local, remote, localTmpl, remoteTmpl, remoteNodeID, dir, reqID); err != nil {
   985  		if !os.IsExist(err) {
   986  			return fmt.Errorf("unable to replace templated policy out: %w", err)
   987  		}
   988  	}
   989  	return nil
   990  }
   991  
   992  // DeleteIPsecEndpoint deletes a endpoint associated with the remote IP address
   993  func DeleteIPsecEndpoint(log *slog.Logger, nodeID uint16) error {
   994  	log = log.With(logfields.LogSubsys, subsystem)
   995  	return errors.Join(ipsecDeleteXfrmState(log, nodeID), ipsecDeleteXfrmPolicy(log, nodeID))
   996  }
   997  
   998  func isXfrmPolicyCilium(policy netlink.XfrmPolicy) bool {
   999  	if policy.Mark == nil {
  1000  		// Check if its our fwd rule, we don't have a mark
  1001  		// on this rule so use priority.
  1002  		if policy.Dir == netlink.XFRM_DIR_FWD &&
  1003  			policy.Priority == linux_defaults.IPsecFwdPriority {
  1004  			return true
  1005  		}
  1006  		return false
  1007  	}
  1008  
  1009  	if (policy.Mark.Value & linux_defaults.RouteMarkDecrypt) != 0 {
  1010  		return true
  1011  	}
  1012  	if (policy.Mark.Value & linux_defaults.RouteMarkEncrypt) != 0 {
  1013  		return true
  1014  	}
  1015  	return false
  1016  }
  1017  
  1018  func isXfrmStateCilium(state netlink.XfrmState) bool {
  1019  	if state.Mark == nil {
  1020  		return false
  1021  	}
  1022  	if (state.Mark.Value & linux_defaults.RouteMarkDecrypt) != 0 {
  1023  		return true
  1024  	}
  1025  	if (state.Mark.Value & linux_defaults.RouteMarkEncrypt) != 0 {
  1026  		return true
  1027  	}
  1028  	return false
  1029  }
  1030  
  1031  // DeleteXFRM remove any remaining XFRM policy or state from tables
  1032  func DeleteXFRM(log *slog.Logger) error {
  1033  	return DeleteXFRMWithReqID(log, 0)
  1034  }
  1035  
  1036  // DeleteXFRMWithReqID remove any XFRM policy or state from tables which matches the reqID
  1037  // If reqID is 0, it will remove all XFRM policy or state
  1038  func DeleteXFRMWithReqID(log *slog.Logger, reqID int) error {
  1039  	log = log.With(logfields.LogSubsys, subsystem)
  1040  
  1041  	xfrmPolicyList, err := netlink.XfrmPolicyList(netlink.FAMILY_ALL)
  1042  	if err != nil {
  1043  		return err
  1044  	}
  1045  
  1046  	ee := resiliency.NewErrorSet("failed to delete XFRM policies", len(xfrmPolicyList))
  1047  policy:
  1048  	for _, p := range xfrmPolicyList {
  1049  		if !isXfrmPolicyCilium(p) {
  1050  			continue
  1051  		}
  1052  
  1053  		// check if there exists a template with req ID as the one we are looking for
  1054  		// if so, delete the policy.
  1055  		for _, tmpl := range p.Tmpls {
  1056  			if reqID == 0 || tmpl.Reqid == reqID {
  1057  				if err := netlink.XfrmPolicyDel(&p); err != nil {
  1058  					ee.Add(err)
  1059  				}
  1060  				continue policy
  1061  			}
  1062  		}
  1063  	}
  1064  	if err := ee.Error(); err != nil {
  1065  		return err
  1066  	}
  1067  
  1068  	xfrmStateList, err := xfrmStateCache.XfrmStateList()
  1069  	if err != nil {
  1070  		log.Warn("unable to fetch xfrm state list", logfields.Error, err)
  1071  		return err
  1072  	}
  1073  	ee = resiliency.NewErrorSet("failed to delete XFRM states", len(xfrmStateList))
  1074  	for _, s := range xfrmStateList {
  1075  		if isXfrmStateCilium(s) && (reqID == 0 || s.Reqid == reqID) {
  1076  			if err := xfrmStateCache.XfrmStateDel(&s); err != nil {
  1077  				ee.Add(err)
  1078  			}
  1079  		}
  1080  	}
  1081  
  1082  	return ee.Error()
  1083  }
  1084  
  1085  func decodeIPSecKey(keyRaw string) (int, []byte, error) {
  1086  	// As we have released the v1.4.0 docs telling the users to write the
  1087  	// k8s secret with the prefix "0x" we have to remove it if it is present,
  1088  	// so we can decode the secret.
  1089  	if keyRaw == "\"\"" {
  1090  		return 0, nil, nil
  1091  	}
  1092  	keyTrimmed := strings.TrimPrefix(keyRaw, "0x")
  1093  	key, err := hex.DecodeString(keyTrimmed)
  1094  	return len(keyTrimmed), key, err
  1095  }
  1096  
  1097  // LoadIPSecKeysFile imports IPSec auth and crypt keys from a file. The format
  1098  // is to put a key per line as follows, (auth-algo auth-key enc-algo enc-key)
  1099  // Returns the authentication overhead in bytes, the key ID, and an error.
  1100  func LoadIPSecKeysFile(log *slog.Logger, path string) (int, uint8, error) {
  1101  	log.Info("Loading IPsec keyfile",
  1102  		logfields.Path, path,
  1103  		logfields.LogSubsys, subsystem,
  1104  	)
  1105  
  1106  	file, err := os.Open(path)
  1107  	if err != nil {
  1108  		return 0, 0, err
  1109  	}
  1110  	defer file.Close()
  1111  	return LoadIPSecKeys(log, file)
  1112  }
  1113  
  1114  func LoadIPSecKeys(log *slog.Logger, r io.Reader) (int, uint8, error) {
  1115  	log = log.With(logfields.LogSubsys, subsystem)
  1116  	var spi uint8
  1117  	var keyLen int
  1118  
  1119  	ipSecLock.Lock()
  1120  	defer ipSecLock.Unlock()
  1121  
  1122  	if err := encrypt.MapCreate(); err != nil {
  1123  		return 0, 0, fmt.Errorf("Encrypt map create failed: %w", err)
  1124  	}
  1125  
  1126  	scanner := bufio.NewScanner(r)
  1127  	scanner.Split(bufio.ScanLines)
  1128  	for scanner.Scan() {
  1129  		var (
  1130  			oldSpi     uint8
  1131  			aeadKey    []byte
  1132  			authKey    []byte
  1133  			esn        bool
  1134  			err        error
  1135  			offsetBase int
  1136  		)
  1137  
  1138  		ipSecKey := &ipSecKey{
  1139  			ReqID: DefaultReqID,
  1140  		}
  1141  
  1142  		// Scanning IPsec keys with one of the following formats:
  1143  		// 1. [spi] aead-algo aead-key icv-len
  1144  		// 2. [spi] auth-algo auth-key enc-algo enc-key [IP]
  1145  		s := strings.Split(scanner.Text(), " ")
  1146  		if len(s) < 3 {
  1147  			// Regardless of the format used, the IPsec secret should have at
  1148  			// least 3 fields separated by white spaces.
  1149  			return 0, 0, fmt.Errorf("missing IPSec key or invalid format")
  1150  		}
  1151  
  1152  		spi, offsetBase, esn, err = parseSPI(log, s[offsetSPI])
  1153  		if err != nil {
  1154  			return 0, 0, fmt.Errorf("failed to parse SPI: %w", err)
  1155  		}
  1156  
  1157  		if len(s) > offsetBase+maxOffset+1 {
  1158  			return 0, 0, fmt.Errorf("invalid format: too many fields in the IPsec secret")
  1159  		} else if len(s) == offsetBase+offsetICV+1 {
  1160  			// We're in the first case, with "[spi] aead-algo aead-key icv-len".
  1161  			aeadName := s[offsetBase+offsetAeadAlgo]
  1162  			if !strings.HasPrefix(aeadName, "rfc") {
  1163  				return 0, 0, fmt.Errorf("invalid AEAD algorithm %q", aeadName)
  1164  			}
  1165  
  1166  			_, aeadKey, err = decodeIPSecKey(s[offsetBase+offsetAeadKey])
  1167  			if err != nil {
  1168  				return 0, 0, fmt.Errorf("unable to decode AEAD key string %q", s[offsetBase+offsetAeadKey])
  1169  			}
  1170  
  1171  			icvLen, err := strconv.Atoi(s[offsetICV+offsetBase])
  1172  			if err != nil {
  1173  				return 0, 0, fmt.Errorf("ICV length is invalid or missing")
  1174  			}
  1175  
  1176  			if icvLen != 96 && icvLen != 128 && icvLen != 256 {
  1177  				return 0, 0, fmt.Errorf("only ICV lengths 96, 128, and 256 are accepted")
  1178  			}
  1179  
  1180  			ipSecKey.Aead = &netlink.XfrmStateAlgo{
  1181  				Name:   aeadName,
  1182  				Key:    aeadKey,
  1183  				ICVLen: icvLen,
  1184  			}
  1185  			keyLen = icvLen / 8
  1186  		} else {
  1187  			// We're in the second case, with "[spi] auth-algo auth-key enc-algo enc-key [IP]".
  1188  			authAlgo := s[offsetBase+offsetAuthAlgo]
  1189  			keyLen, authKey, err = decodeIPSecKey(s[offsetBase+offsetAuthKey])
  1190  			if err != nil {
  1191  				return 0, 0, fmt.Errorf("unable to decode authentication key string %q", s[offsetBase+offsetAuthKey])
  1192  			}
  1193  
  1194  			encAlgo := s[offsetBase+offsetEncAlgo]
  1195  			_, encKey, err := decodeIPSecKey(s[offsetBase+offsetEncKey])
  1196  			if err != nil {
  1197  				return 0, 0, fmt.Errorf("unable to decode encryption key string %q", s[offsetBase+offsetEncKey])
  1198  			}
  1199  
  1200  			ipSecKey.Auth = &netlink.XfrmStateAlgo{
  1201  				Name: authAlgo,
  1202  				Key:  authKey,
  1203  			}
  1204  			ipSecKey.Crypt = &netlink.XfrmStateAlgo{
  1205  				Name: encAlgo,
  1206  				Key:  encKey,
  1207  			}
  1208  		}
  1209  
  1210  		ipSecKey.Spi = spi
  1211  		ipSecKey.ESN = esn
  1212  
  1213  		if len(s) == offsetBase+offsetIP+1 {
  1214  			// The IPsec secret has the optional IP address field at the end.
  1215  			log.Warn("IPsec secrets with an IP address as the last argument are deprecated and will be unsupported in v1.13.")
  1216  			if ipSecKeysGlobal[s[offsetBase+offsetIP]] != nil {
  1217  				oldSpi = ipSecKeysGlobal[s[offsetBase+offsetIP]].Spi
  1218  			}
  1219  			ipSecKeysGlobal[s[offsetBase+offsetIP]] = ipSecKey
  1220  		} else {
  1221  			if ipSecKeysGlobal[""] != nil {
  1222  				oldSpi = ipSecKeysGlobal[""].Spi
  1223  			}
  1224  			ipSecKeysGlobal[""] = ipSecKey
  1225  		}
  1226  
  1227  		ipSecKeysRemovalTime[oldSpi] = time.Now()
  1228  		ipSecCurrentKeySPI = spi
  1229  	}
  1230  	return keyLen, spi, nil
  1231  }
  1232  
  1233  func parseSPI(log *slog.Logger, spiStr string) (uint8, int, bool, error) {
  1234  	esn := false
  1235  	if spiStr[len(spiStr)-1] == '+' {
  1236  		esn = true
  1237  		spiStr = spiStr[:len(spiStr)-1]
  1238  	}
  1239  	spi, err := strconv.Atoi(spiStr)
  1240  	if err != nil {
  1241  		// If no version info is provided assume using key format without
  1242  		// versioning and assign SPI.
  1243  		log.Warn("IPsec secrets without an SPI as the first argument are deprecated and will be unsupported in v1.13.")
  1244  		return 1, -1, esn, nil
  1245  	}
  1246  	if spi > linux_defaults.IPsecMaxKeyVersion {
  1247  		return 0, 0, false, fmt.Errorf("encryption key space exhausted. ID must be nonzero and less than %d. Attempted %q", linux_defaults.IPsecMaxKeyVersion+1, spiStr)
  1248  	}
  1249  	if spi == 0 {
  1250  		return 0, 0, false, fmt.Errorf("zero is not a valid key ID. ID must be nonzero and less than %d. Attempted %q", linux_defaults.IPsecMaxKeyVersion+1, spiStr)
  1251  	}
  1252  	if !esn {
  1253  		log.Warn(fmt.Sprintf("global IPsec keys are deprecated and will be removed in v1.17. Use per-tunnel keys instead by adding a '+' sign after the SPI (%d+ in your case).", spi))
  1254  	}
  1255  	return uint8(spi), 0, esn, nil
  1256  }
  1257  
  1258  func SetIPSecSPI(log *slog.Logger, spi uint8) error {
  1259  	log = log.With(logfields.LogSubsys, subsystem)
  1260  	if err := encrypt.MapUpdateContext(0, spi); err != nil {
  1261  		log.Warn("cilium_encrypt_state map updated failed", logfields.Error, err)
  1262  		return err
  1263  	}
  1264  	return nil
  1265  }
  1266  
  1267  // DeleteIPsecEncryptRoute removes nodes in main routing table by walking
  1268  // routes and matching route protocol type.
  1269  func DeleteIPsecEncryptRoute(log *slog.Logger) {
  1270  	log = log.With(logfields.LogSubsys, subsystem)
  1271  	filter := &netlink.Route{
  1272  		Protocol: route.EncryptRouteProtocol,
  1273  	}
  1274  
  1275  	for _, family := range []int{netlink.FAMILY_V4, netlink.FAMILY_V6} {
  1276  		routes, err := netlink.RouteListFiltered(family, filter, netlink.RT_FILTER_PROTOCOL)
  1277  		if err != nil {
  1278  			log.Error("Unable to list ipsec encrypt routes", logfields.Error, err)
  1279  			return
  1280  		}
  1281  
  1282  		for _, rt := range routes {
  1283  			if err := netlink.RouteDel(&rt); err != nil {
  1284  				log.Warn("Unable to delete ipsec encrypt route", "route", rt.String(), logfields.Error, err)
  1285  			}
  1286  		}
  1287  	}
  1288  }
  1289  
  1290  func keyfileWatcher(log *slog.Logger, ctx context.Context, watcher *fswatcher.Watcher, keyfilePath string, nodeHandler datapath.NodeHandler, health cell.Health) error {
  1291  	for {
  1292  		select {
  1293  		case event := <-watcher.Events:
  1294  			if event.Op&(fsnotify.Create|fsnotify.Write) == 0 {
  1295  				continue
  1296  			}
  1297  
  1298  			_, spi, err := LoadIPSecKeysFile(log, keyfilePath)
  1299  			if err != nil {
  1300  				health.Degraded(fmt.Sprintf("Failed to load keyfile %q", keyfilePath), err)
  1301  				log.Error("Failed to load IPsec keyfile", logfields.Error, err)
  1302  				continue
  1303  			}
  1304  
  1305  			// Update the IPSec key identity in the local node.
  1306  			// This will set addrs.ipsecKeyIdentity in the node
  1307  			// package, and eventually trigger an update to
  1308  			// publish the updated information to k8s/kvstore.
  1309  			node.SetIPsecKeyIdentity(spi)
  1310  
  1311  			// AllNodeValidateImplementation will eventually call
  1312  			// nodeUpdate(), which is responsible for updating the
  1313  			// IPSec policies and states for all the different EPs
  1314  			// with ipsec.UpsertIPsecEndpoint()
  1315  			nodeHandler.AllNodeValidateImplementation()
  1316  
  1317  			// Push SPI update into BPF datapath now that XFRM state
  1318  			// is configured.
  1319  			if err := SetIPSecSPI(log, spi); err != nil {
  1320  				health.Degraded("Failed to set IPsec SPI", err)
  1321  				log.Error("Failed to set IPsec SPI", logfields.Error, err)
  1322  				continue
  1323  			}
  1324  			health.OK("Watching keyfiles")
  1325  		case err := <-watcher.Errors:
  1326  			log.Warn("Error encountered while watching file with fsnotify",
  1327  				logfields.Error, err,
  1328  				logfields.Path, keyfilePath,
  1329  			)
  1330  
  1331  		case <-ctx.Done():
  1332  			health.Stopped("Context done")
  1333  			watcher.Close()
  1334  			return nil
  1335  		}
  1336  	}
  1337  }
  1338  
  1339  func StartKeyfileWatcher(log *slog.Logger, group job.Group, keyfilePath string, nodeHandler datapath.NodeHandler) error {
  1340  	if !option.Config.EnableIPsecKeyWatcher {
  1341  		return nil
  1342  	}
  1343  
  1344  	watcher, err := fswatcher.New([]string{keyfilePath})
  1345  	if err != nil {
  1346  		return err
  1347  	}
  1348  
  1349  	group.Add(job.OneShot("keyfile-watcher", func(ctx context.Context, health cell.Health) error {
  1350  		return keyfileWatcher(log, ctx, watcher, keyfilePath, nodeHandler, health)
  1351  	}))
  1352  
  1353  	return nil
  1354  }
  1355  
  1356  // ipSecSPICanBeReclaimed is used to test whether a given SPI can be reclaimed
  1357  // or not (i.e. if it's not in use, and if not, if enough time has passed since
  1358  // when it was replaced by a newer one).
  1359  //
  1360  // In addition to the SPI, this function takes also a reclaimTimestamp
  1361  // parameter which represents the time at which we started reclaiming old keys.
  1362  // This is needed as we need to test the same SPI multiple times (since for any
  1363  // given SPI there are multiple policies and states associated with it), and we
  1364  // don't want to get inconsistent results because we are calling time.Now()
  1365  // directly in this function.
  1366  func ipSecSPICanBeReclaimed(spi uint8, reclaimTimestamp time.Time) bool {
  1367  	// The SPI associated with the key currently in use should not be reclaimed
  1368  	if spi == ipSecCurrentKeySPI {
  1369  		return false
  1370  	}
  1371  
  1372  	// Otherwise retrieve the time at which the key for the given SPI was removed
  1373  	keyRemovalTime, ok := ipSecKeysRemovalTime[spi]
  1374  	if !ok {
  1375  		// If not found in the keyRemovalTime map, assume the key was
  1376  		// deleted just now.
  1377  		// In this way if the agent gets restarted before an old key is
  1378  		// removed we will always wait at least IPsecKeyRotationDuration time
  1379  		// before reclaiming it
  1380  		ipSecKeysRemovalTime[spi] = time.Now()
  1381  
  1382  		return false
  1383  	}
  1384  
  1385  	// If the key was deleted less than the IPSec key deletion delay
  1386  	// time ago, it should not be reclaimed
  1387  	if reclaimTimestamp.Sub(keyRemovalTime) < option.Config.IPsecKeyRotationDuration {
  1388  		return false
  1389  	}
  1390  
  1391  	return true
  1392  }
  1393  
  1394  func deleteStaleXfrmStates(reclaimTimestamp time.Time) error {
  1395  	xfrmStateList, err := xfrmStateCache.XfrmStateList()
  1396  	if err != nil {
  1397  		return err
  1398  	}
  1399  
  1400  	errs := resiliency.NewErrorSet("failed to delete stale xfrm states", len(xfrmStateList))
  1401  	for _, s := range xfrmStateList {
  1402  		stateSPI := uint8(s.Spi)
  1403  		if !ipSecSPICanBeReclaimed(stateSPI, reclaimTimestamp) {
  1404  			continue
  1405  		}
  1406  		if err := xfrmStateCache.XfrmStateDel(&s); err != nil {
  1407  			errs.Add(fmt.Errorf("failed to delete stale xfrm state spi (%d): %w", stateSPI, err))
  1408  		}
  1409  	}
  1410  
  1411  	return errs.Error()
  1412  }
  1413  
  1414  func deleteStaleXfrmPolicies(log *slog.Logger, reclaimTimestamp time.Time) error {
  1415  	scopedLog := log.With(logfields.SPI, ipSecCurrentKeySPI)
  1416  
  1417  	xfrmPolicyList, err := netlink.XfrmPolicyList(netlink.FAMILY_ALL)
  1418  	if err != nil {
  1419  		return err
  1420  	}
  1421  
  1422  	errs := resiliency.NewErrorSet("failed to delete stale xfrm policies", len(xfrmPolicyList))
  1423  	for _, p := range xfrmPolicyList {
  1424  		policySPI := ipsec.GetSPIFromXfrmPolicy(&p)
  1425  		if !ipSecSPICanBeReclaimed(policySPI, reclaimTimestamp) {
  1426  			continue
  1427  		}
  1428  
  1429  		// Only OUT XFRM policies depend on the SPI
  1430  		if p.Dir != netlink.XFRM_DIR_OUT {
  1431  			continue
  1432  		}
  1433  
  1434  		if isDefaultDropPolicy(&p) {
  1435  			continue
  1436  		}
  1437  
  1438  		scopedLog.Info("Deleting stale XFRM policy",
  1439  			logfields.OldSPI, policySPI,
  1440  			logfields.SourceIP, p.Src,
  1441  			logfields.DestinationIP, p.Dst,
  1442  			logfields.TrafficDirection, getDirFromXfrmMark(p.Mark),
  1443  			logfields.NodeID, getNodeIDAsHexFromXfrmMark(p.Mark),
  1444  		)
  1445  		if err := netlink.XfrmPolicyDel(&p); err != nil {
  1446  			errs.Add(fmt.Errorf("failed to delete stale xfrm policy spi (%d): %w", policySPI, err))
  1447  		}
  1448  	}
  1449  
  1450  	return errs.Error()
  1451  }
  1452  
  1453  func isDefaultDropPolicy(p *netlink.XfrmPolicy) bool {
  1454  	return equalDefaultDropPolicy(defaultDropPolicyIPv4, p) ||
  1455  		equalDefaultDropPolicy(defaultDropPolicyIPv6, p)
  1456  }
  1457  
  1458  func equalDefaultDropPolicy(defaultDropPolicy, p *netlink.XfrmPolicy) bool {
  1459  	return p.Priority == defaultDropPolicy.Priority &&
  1460  		p.Action == defaultDropPolicy.Action &&
  1461  		p.Dir == defaultDropPolicy.Dir &&
  1462  		xfrmMarkEqual(p.Mark, defaultDropPolicy.Mark) &&
  1463  		p.Src.String() == defaultDropPolicy.Src.String() &&
  1464  		p.Dst.String() == defaultDropPolicy.Dst.String()
  1465  }
  1466  
  1467  type staleKeyReclaimer struct {
  1468  	log *slog.Logger
  1469  }
  1470  
  1471  func (skr staleKeyReclaimer) onTimer(ctx context.Context) error {
  1472  	ipSecLock.Lock()
  1473  	defer ipSecLock.Unlock()
  1474  
  1475  	// In case no IPSec key has been loaded yet, don't try to reclaim any
  1476  	// old key
  1477  	if ipSecCurrentKeySPI == 0 {
  1478  		return nil
  1479  	}
  1480  
  1481  	reclaimTimestamp := time.Now()
  1482  
  1483  	scopedLog := skr.log.With(logfields.SPI, ipSecCurrentKeySPI)
  1484  	if err := deleteStaleXfrmStates(reclaimTimestamp); err != nil {
  1485  		scopedLog.Warn("Failed to delete stale XFRM states", logfields.Error, err)
  1486  		return err
  1487  	}
  1488  	if err := deleteStaleXfrmPolicies(skr.log, reclaimTimestamp); err != nil {
  1489  		scopedLog.Warn("Failed to delete stale XFRM policies", logfields.Error, err)
  1490  		return err
  1491  	}
  1492  
  1493  	return nil
  1494  }