github.com/moby/docker@v26.1.3+incompatible/libnetwork/drivers/overlay/encryption.go (about) 1 //go:build linux 2 3 package overlay 4 5 import ( 6 "bytes" 7 "context" 8 "encoding/binary" 9 "encoding/hex" 10 "fmt" 11 "hash/fnv" 12 "net" 13 "strconv" 14 "sync" 15 "syscall" 16 17 "github.com/containerd/log" 18 "github.com/docker/docker/libnetwork/drivers/overlay/overlayutils" 19 "github.com/docker/docker/libnetwork/iptables" 20 "github.com/docker/docker/libnetwork/ns" 21 "github.com/docker/docker/libnetwork/types" 22 "github.com/vishvananda/netlink" 23 ) 24 25 /* 26 Encrypted overlay networks use IPsec in transport mode to encrypt and 27 authenticate the VXLAN UDP datagrams. This driver implements a bespoke control 28 plane which negotiates the security parameters for each peer-to-peer tunnel. 29 30 IPsec Terminology 31 32 - ESP: IPSec Encapsulating Security Payload 33 - SPI: Security Parameter Index 34 - ICV: Integrity Check Value 35 - SA: Security Association https://en.wikipedia.org/wiki/IPsec#Security_association 36 37 38 Developer documentation for Linux IPsec is rather sparse online. The following 39 slide deck provides a decent overview. 40 https://libreswan.org/wiki/images/e/e0/Netdev-0x12-ipsec-flow.pdf 41 42 The Linux IPsec stack is part of XFRM, the netlink packet transformation 43 interface. 44 https://man7.org/linux/man-pages/man8/ip-xfrm.8.html 45 */ 46 47 const ( 48 // Value used to mark outgoing packets which should have our IPsec 49 // processing applied. It is also used as a label to identify XFRM 50 // states (Security Associations) and policies (Security Policies) 51 // programmed by us so we know which ones we can clean up without 52 // disrupting other VPN connections on the system. 53 mark = 0xD0C4E3 54 55 pktExpansion = 26 // SPI(4) + SeqN(4) + IV(8) + PadLength(1) + NextHeader(1) + ICV(8) 56 ) 57 58 const ( 59 forward = iota + 1 60 reverse 61 bidir 62 ) 63 64 // Mark value for matching packets which should have our IPsec security policy 65 // applied. 66 var spMark = netlink.XfrmMark{Value: mark, Mask: 0xffffffff} 67 68 type key struct { 69 value []byte 70 tag uint32 71 } 72 73 func (k *key) String() string { 74 if k != nil { 75 return fmt.Sprintf("(key: %s, tag: 0x%x)", hex.EncodeToString(k.value)[0:5], k.tag) 76 } 77 return "" 78 } 79 80 // Security Parameter Indices for the IPsec flows between local node and a 81 // remote peer, which identify the Security Associations (XFRM states) to be 82 // applied when encrypting and decrypting packets. 83 type spi struct { 84 forward int 85 reverse int 86 } 87 88 func (s *spi) String() string { 89 return fmt.Sprintf("SPI(FWD: 0x%x, REV: 0x%x)", uint32(s.forward), uint32(s.reverse)) 90 } 91 92 type encrMap struct { 93 nodes map[string][]*spi 94 sync.Mutex 95 } 96 97 func (e *encrMap) String() string { 98 e.Lock() 99 defer e.Unlock() 100 b := new(bytes.Buffer) 101 for k, v := range e.nodes { 102 b.WriteString("\n") 103 b.WriteString(k) 104 b.WriteString(":") 105 b.WriteString("[") 106 for _, s := range v { 107 b.WriteString(s.String()) 108 b.WriteString(",") 109 } 110 b.WriteString("]") 111 } 112 return b.String() 113 } 114 115 func (d *driver) checkEncryption(nid string, rIP net.IP, isLocal, add bool) error { 116 log.G(context.TODO()).Debugf("checkEncryption(%.7s, %v, %t)", nid, rIP, isLocal) 117 118 n := d.network(nid) 119 if n == nil || !n.secure { 120 return nil 121 } 122 123 if len(d.keys) == 0 { 124 return types.ForbiddenErrorf("encryption key is not present") 125 } 126 127 lIP := d.bindAddress 128 aIP := d.advertiseAddress 129 nodes := map[string]net.IP{} 130 131 switch { 132 case isLocal: 133 if err := d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool { 134 if !aIP.Equal(pEntry.vtep) { 135 nodes[pEntry.vtep.String()] = pEntry.vtep 136 } 137 return false 138 }); err != nil { 139 log.G(context.TODO()).Warnf("Failed to retrieve list of participating nodes in overlay network %.5s: %v", nid, err) 140 } 141 default: 142 if len(d.network(nid).endpoints) > 0 { 143 nodes[rIP.String()] = rIP 144 } 145 } 146 147 log.G(context.TODO()).Debugf("List of nodes: %s", nodes) 148 149 if add { 150 for _, rIP := range nodes { 151 if err := setupEncryption(lIP, aIP, rIP, d.secMap, d.keys); err != nil { 152 log.G(context.TODO()).Warnf("Failed to program network encryption between %s and %s: %v", lIP, rIP, err) 153 } 154 } 155 } else { 156 if len(nodes) == 0 { 157 if err := removeEncryption(lIP, rIP, d.secMap); err != nil { 158 log.G(context.TODO()).Warnf("Failed to remove network encryption between %s and %s: %v", lIP, rIP, err) 159 } 160 } 161 } 162 163 return nil 164 } 165 166 // setupEncryption programs the encryption parameters for secure communication 167 // between the local node and a remote node. 168 func setupEncryption(localIP, advIP, remoteIP net.IP, em *encrMap, keys []*key) error { 169 log.G(context.TODO()).Debugf("Programming encryption between %s and %s", localIP, remoteIP) 170 rIPs := remoteIP.String() 171 172 indices := make([]*spi, 0, len(keys)) 173 174 for i, k := range keys { 175 spis := &spi{buildSPI(advIP, remoteIP, k.tag), buildSPI(remoteIP, advIP, k.tag)} 176 dir := reverse 177 if i == 0 { 178 dir = bidir 179 } 180 fSA, rSA, err := programSA(localIP, remoteIP, spis, k, dir, true) 181 if err != nil { 182 log.G(context.TODO()).Warn(err) 183 } 184 indices = append(indices, spis) 185 if i != 0 { 186 continue 187 } 188 err = programSP(fSA, rSA, true) 189 if err != nil { 190 log.G(context.TODO()).Warn(err) 191 } 192 } 193 194 em.Lock() 195 em.nodes[rIPs] = indices 196 em.Unlock() 197 198 return nil 199 } 200 201 func removeEncryption(localIP, remoteIP net.IP, em *encrMap) error { 202 em.Lock() 203 indices, ok := em.nodes[remoteIP.String()] 204 em.Unlock() 205 if !ok { 206 return nil 207 } 208 for i, idxs := range indices { 209 dir := reverse 210 if i == 0 { 211 dir = bidir 212 } 213 fSA, rSA, err := programSA(localIP, remoteIP, idxs, nil, dir, false) 214 if err != nil { 215 log.G(context.TODO()).Warn(err) 216 } 217 if i != 0 { 218 continue 219 } 220 err = programSP(fSA, rSA, false) 221 if err != nil { 222 log.G(context.TODO()).Warn(err) 223 } 224 } 225 return nil 226 } 227 228 func (d *driver) transportIPTable() (*iptables.IPTable, error) { 229 v6, err := d.isIPv6Transport() 230 if err != nil { 231 return nil, err 232 } 233 version := iptables.IPv4 234 if v6 { 235 version = iptables.IPv6 236 } 237 return iptables.GetIptable(version), nil 238 } 239 240 func (d *driver) programMangle(vni uint32, add bool) error { 241 var ( 242 m = strconv.FormatUint(mark, 10) 243 chain = "OUTPUT" 244 rule = append(matchVXLAN(overlayutils.VXLANUDPPort(), vni), "-j", "MARK", "--set-mark", m) 245 a = iptables.Append 246 action = "install" 247 ) 248 249 iptable, err := d.transportIPTable() 250 if err != nil { 251 // Fail closed if unsure. Better safe than cleartext. 252 return err 253 } 254 255 if !add { 256 a = iptables.Delete 257 action = "remove" 258 } 259 260 if err := iptable.ProgramRule(iptables.Mangle, chain, a, rule); err != nil { 261 return fmt.Errorf("could not %s mangle rule: %w", action, err) 262 } 263 264 return nil 265 } 266 267 func (d *driver) programInput(vni uint32, add bool) error { 268 var ( 269 plainVxlan = matchVXLAN(overlayutils.VXLANUDPPort(), vni) 270 chain = "INPUT" 271 msg = "add" 272 ) 273 274 rule := func(policy, jump string) []string { 275 args := append([]string{"-m", "policy", "--dir", "in", "--pol", policy}, plainVxlan...) 276 return append(args, "-j", jump) 277 } 278 279 iptable, err := d.transportIPTable() 280 if err != nil { 281 // Fail closed if unsure. Better safe than cleartext. 282 return err 283 } 284 285 if !add { 286 msg = "remove" 287 } 288 289 action := func(a iptables.Action) iptables.Action { 290 if !add { 291 return iptables.Delete 292 } 293 return a 294 } 295 296 // Drop incoming VXLAN datagrams for the VNI which were received in cleartext. 297 // Insert at the top of the chain so the packets are dropped even if an 298 // administrator-configured rule exists which would otherwise unconditionally 299 // accept incoming VXLAN traffic. 300 if err := iptable.ProgramRule(iptables.Filter, chain, action(iptables.Insert), rule("none", "DROP")); err != nil { 301 return fmt.Errorf("could not %s input drop rule: %w", msg, err) 302 } 303 304 return nil 305 } 306 307 func programSA(localIP, remoteIP net.IP, spi *spi, k *key, dir int, add bool) (fSA *netlink.XfrmState, rSA *netlink.XfrmState, err error) { 308 var ( 309 action = "Removing" 310 xfrmProgram = ns.NlHandle().XfrmStateDel 311 ) 312 313 if add { 314 action = "Adding" 315 xfrmProgram = ns.NlHandle().XfrmStateAdd 316 } 317 318 if dir&reverse > 0 { 319 rSA = &netlink.XfrmState{ 320 Src: remoteIP, 321 Dst: localIP, 322 Proto: netlink.XFRM_PROTO_ESP, 323 Spi: spi.reverse, 324 Mode: netlink.XFRM_MODE_TRANSPORT, 325 Reqid: mark, 326 } 327 if add { 328 rSA.Aead = buildAeadAlgo(k, spi.reverse) 329 } 330 331 exists, err := saExists(rSA) 332 if err != nil { 333 exists = !add 334 } 335 336 if add != exists { 337 log.G(context.TODO()).Debugf("%s: rSA{%s}", action, rSA) 338 if err := xfrmProgram(rSA); err != nil { 339 log.G(context.TODO()).Warnf("Failed %s rSA{%s}: %v", action, rSA, err) 340 } 341 } 342 } 343 344 if dir&forward > 0 { 345 fSA = &netlink.XfrmState{ 346 Src: localIP, 347 Dst: remoteIP, 348 Proto: netlink.XFRM_PROTO_ESP, 349 Spi: spi.forward, 350 Mode: netlink.XFRM_MODE_TRANSPORT, 351 Reqid: mark, 352 } 353 if add { 354 fSA.Aead = buildAeadAlgo(k, spi.forward) 355 } 356 357 exists, err := saExists(fSA) 358 if err != nil { 359 exists = !add 360 } 361 362 if add != exists { 363 log.G(context.TODO()).Debugf("%s fSA{%s}", action, fSA) 364 if err := xfrmProgram(fSA); err != nil { 365 log.G(context.TODO()).Warnf("Failed %s fSA{%s}: %v.", action, fSA, err) 366 } 367 } 368 } 369 370 return 371 } 372 373 // getMinimalIP returns the address in its shortest form 374 // If ip contains an IPv4-mapped IPv6 address, the 4-octet form of the IPv4 address will be returned. 375 // Otherwise ip is returned unchanged. 376 func getMinimalIP(ip net.IP) net.IP { 377 if ip != nil && ip.To4() != nil { 378 return ip.To4() 379 } 380 return ip 381 } 382 383 func programSP(fSA *netlink.XfrmState, rSA *netlink.XfrmState, add bool) error { 384 action := "Removing" 385 xfrmProgram := ns.NlHandle().XfrmPolicyDel 386 if add { 387 action = "Adding" 388 xfrmProgram = ns.NlHandle().XfrmPolicyAdd 389 } 390 391 // Create a congruent cidr 392 s := getMinimalIP(fSA.Src) 393 d := getMinimalIP(fSA.Dst) 394 fullMask := net.CIDRMask(8*len(s), 8*len(s)) 395 396 fPol := &netlink.XfrmPolicy{ 397 Src: &net.IPNet{IP: s, Mask: fullMask}, 398 Dst: &net.IPNet{IP: d, Mask: fullMask}, 399 Dir: netlink.XFRM_DIR_OUT, 400 Proto: syscall.IPPROTO_UDP, 401 DstPort: int(overlayutils.VXLANUDPPort()), 402 Mark: &spMark, 403 Tmpls: []netlink.XfrmPolicyTmpl{ 404 { 405 Src: fSA.Src, 406 Dst: fSA.Dst, 407 Proto: netlink.XFRM_PROTO_ESP, 408 Mode: netlink.XFRM_MODE_TRANSPORT, 409 Spi: fSA.Spi, 410 Reqid: mark, 411 }, 412 }, 413 } 414 415 exists, err := spExists(fPol) 416 if err != nil { 417 exists = !add 418 } 419 420 if add != exists { 421 log.G(context.TODO()).Debugf("%s fSP{%s}", action, fPol) 422 if err := xfrmProgram(fPol); err != nil { 423 log.G(context.TODO()).Warnf("%s fSP{%s}: %v", action, fPol, err) 424 } 425 } 426 427 return nil 428 } 429 430 func saExists(sa *netlink.XfrmState) (bool, error) { 431 _, err := ns.NlHandle().XfrmStateGet(sa) 432 switch err { 433 case nil: 434 return true, nil 435 case syscall.ESRCH: 436 return false, nil 437 default: 438 err = fmt.Errorf("Error while checking for SA existence: %v", err) 439 log.G(context.TODO()).Warn(err) 440 return false, err 441 } 442 } 443 444 func spExists(sp *netlink.XfrmPolicy) (bool, error) { 445 _, err := ns.NlHandle().XfrmPolicyGet(sp) 446 switch err { 447 case nil: 448 return true, nil 449 case syscall.ENOENT: 450 return false, nil 451 default: 452 err = fmt.Errorf("Error while checking for SP existence: %v", err) 453 log.G(context.TODO()).Warn(err) 454 return false, err 455 } 456 } 457 458 func buildSPI(src, dst net.IP, st uint32) int { 459 b := make([]byte, 4) 460 binary.BigEndian.PutUint32(b, st) 461 h := fnv.New32a() 462 h.Write(src) 463 h.Write(b) 464 h.Write(dst) 465 return int(binary.BigEndian.Uint32(h.Sum(nil))) 466 } 467 468 func buildAeadAlgo(k *key, s int) *netlink.XfrmStateAlgo { 469 salt := make([]byte, 4) 470 binary.BigEndian.PutUint32(salt, uint32(s)) 471 return &netlink.XfrmStateAlgo{ 472 Name: "rfc4106(gcm(aes))", 473 Key: append(k.value, salt...), 474 ICVLen: 64, 475 } 476 } 477 478 func (d *driver) secMapWalk(f func(string, []*spi) ([]*spi, bool)) error { 479 d.secMap.Lock() 480 for node, indices := range d.secMap.nodes { 481 idxs, stop := f(node, indices) 482 if idxs != nil { 483 d.secMap.nodes[node] = idxs 484 } 485 if stop { 486 break 487 } 488 } 489 d.secMap.Unlock() 490 return nil 491 } 492 493 func (d *driver) setKeys(keys []*key) error { 494 // Remove any stale policy, state 495 clearEncryptionStates() 496 // Accept the encryption keys and clear any stale encryption map 497 d.Lock() 498 d.keys = keys 499 d.secMap = &encrMap{nodes: map[string][]*spi{}} 500 d.Unlock() 501 log.G(context.TODO()).Debugf("Initial encryption keys: %v", keys) 502 return nil 503 } 504 505 // updateKeys allows to add a new key and/or change the primary key and/or prune an existing key 506 // The primary key is the key used in transmission and will go in first position in the list. 507 func (d *driver) updateKeys(newKey, primary, pruneKey *key) error { 508 log.G(context.TODO()).Debugf("Updating Keys. New: %v, Primary: %v, Pruned: %v", newKey, primary, pruneKey) 509 510 log.G(context.TODO()).Debugf("Current: %v", d.keys) 511 512 var ( 513 newIdx = -1 514 priIdx = -1 515 delIdx = -1 516 lIP = d.bindAddress 517 aIP = d.advertiseAddress 518 ) 519 520 d.Lock() 521 defer d.Unlock() 522 523 // add new 524 if newKey != nil { 525 d.keys = append(d.keys, newKey) 526 newIdx += len(d.keys) 527 } 528 for i, k := range d.keys { 529 if primary != nil && k.tag == primary.tag { 530 priIdx = i 531 } 532 if pruneKey != nil && k.tag == pruneKey.tag { 533 delIdx = i 534 } 535 } 536 537 if (newKey != nil && newIdx == -1) || 538 (primary != nil && priIdx == -1) || 539 (pruneKey != nil && delIdx == -1) { 540 return types.InvalidParameterErrorf("cannot find proper key indices while processing key update:"+ 541 "(newIdx,priIdx,delIdx):(%d, %d, %d)", newIdx, priIdx, delIdx) 542 } 543 544 if priIdx != -1 && priIdx == delIdx { 545 return types.InvalidParameterErrorf("attempting to both make a key (index %d) primary and delete it", priIdx) 546 } 547 548 d.secMapWalk(func(rIPs string, spis []*spi) ([]*spi, bool) { 549 rIP := net.ParseIP(rIPs) 550 return updateNodeKey(lIP, aIP, rIP, spis, d.keys, newIdx, priIdx, delIdx), false 551 }) 552 553 // swap primary 554 if priIdx != -1 { 555 d.keys[0], d.keys[priIdx] = d.keys[priIdx], d.keys[0] 556 } 557 // prune 558 if delIdx != -1 { 559 if delIdx == 0 { 560 delIdx = priIdx 561 } 562 d.keys = append(d.keys[:delIdx], d.keys[delIdx+1:]...) 563 } 564 565 log.G(context.TODO()).Debugf("Updated: %v", d.keys) 566 567 return nil 568 } 569 570 /******************************************************** 571 * Steady state: rSA0, rSA1, rSA2, fSA1, fSP1 572 * Rotation --> -rSA0, +rSA3, +fSA2, +fSP2/-fSP1, -fSA1 573 * Steady state: rSA1, rSA2, rSA3, fSA2, fSP2 574 *********************************************************/ 575 576 // Spis and keys are sorted in such away the one in position 0 is the primary 577 func updateNodeKey(lIP, aIP, rIP net.IP, idxs []*spi, curKeys []*key, newIdx, priIdx, delIdx int) []*spi { 578 log.G(context.TODO()).Debugf("Updating keys for node: %s (%d,%d,%d)", rIP, newIdx, priIdx, delIdx) 579 580 spis := idxs 581 log.G(context.TODO()).Debugf("Current: %v", spis) 582 583 // add new 584 if newIdx != -1 { 585 spis = append(spis, &spi{ 586 forward: buildSPI(aIP, rIP, curKeys[newIdx].tag), 587 reverse: buildSPI(rIP, aIP, curKeys[newIdx].tag), 588 }) 589 } 590 591 if delIdx != -1 { 592 // -rSA0 593 programSA(lIP, rIP, spis[delIdx], nil, reverse, false) 594 } 595 596 if newIdx > -1 { 597 // +rSA2 598 programSA(lIP, rIP, spis[newIdx], curKeys[newIdx], reverse, true) 599 } 600 601 if priIdx > 0 { 602 // +fSA2 603 fSA2, _, _ := programSA(lIP, rIP, spis[priIdx], curKeys[priIdx], forward, true) 604 605 // +fSP2, -fSP1 606 s := getMinimalIP(fSA2.Src) 607 d := getMinimalIP(fSA2.Dst) 608 fullMask := net.CIDRMask(8*len(s), 8*len(s)) 609 610 fSP1 := &netlink.XfrmPolicy{ 611 Src: &net.IPNet{IP: s, Mask: fullMask}, 612 Dst: &net.IPNet{IP: d, Mask: fullMask}, 613 Dir: netlink.XFRM_DIR_OUT, 614 Proto: syscall.IPPROTO_UDP, 615 DstPort: int(overlayutils.VXLANUDPPort()), 616 Mark: &spMark, 617 Tmpls: []netlink.XfrmPolicyTmpl{ 618 { 619 Src: fSA2.Src, 620 Dst: fSA2.Dst, 621 Proto: netlink.XFRM_PROTO_ESP, 622 Mode: netlink.XFRM_MODE_TRANSPORT, 623 Spi: fSA2.Spi, 624 Reqid: mark, 625 }, 626 }, 627 } 628 log.G(context.TODO()).Debugf("Updating fSP{%s}", fSP1) 629 if err := ns.NlHandle().XfrmPolicyUpdate(fSP1); err != nil { 630 log.G(context.TODO()).Warnf("Failed to update fSP{%s}: %v", fSP1, err) 631 } 632 633 // -fSA1 634 programSA(lIP, rIP, spis[0], nil, forward, false) 635 } 636 637 // swap 638 if priIdx > 0 { 639 swp := spis[0] 640 spis[0] = spis[priIdx] 641 spis[priIdx] = swp 642 } 643 // prune 644 if delIdx != -1 { 645 if delIdx == 0 { 646 delIdx = priIdx 647 } 648 spis = append(spis[:delIdx], spis[delIdx+1:]...) 649 } 650 651 log.G(context.TODO()).Debugf("Updated: %v", spis) 652 653 return spis 654 } 655 656 func (n *network) maxMTU() int { 657 mtu := 1500 658 if n.mtu != 0 { 659 mtu = n.mtu 660 } 661 mtu -= vxlanEncap 662 if n.secure { 663 // In case of encryption account for the 664 // esp packet expansion and padding 665 mtu -= pktExpansion 666 mtu -= (mtu % 4) 667 } 668 return mtu 669 } 670 671 func clearEncryptionStates() { 672 nlh := ns.NlHandle() 673 spList, err := nlh.XfrmPolicyList(netlink.FAMILY_ALL) 674 if err != nil { 675 log.G(context.TODO()).Warnf("Failed to retrieve SP list for cleanup: %v", err) 676 } 677 saList, err := nlh.XfrmStateList(netlink.FAMILY_ALL) 678 if err != nil { 679 log.G(context.TODO()).Warnf("Failed to retrieve SA list for cleanup: %v", err) 680 } 681 for _, sp := range spList { 682 sp := sp 683 if sp.Mark != nil && sp.Mark.Value == spMark.Value { 684 if err := nlh.XfrmPolicyDel(&sp); err != nil { 685 log.G(context.TODO()).Warnf("Failed to delete stale SP %s: %v", sp, err) 686 continue 687 } 688 log.G(context.TODO()).Debugf("Removed stale SP: %s", sp) 689 } 690 } 691 for _, sa := range saList { 692 sa := sa 693 if sa.Reqid == mark { 694 if err := nlh.XfrmStateDel(&sa); err != nil { 695 log.G(context.TODO()).Warnf("Failed to delete stale SA %s: %v", sa, err) 696 continue 697 } 698 log.G(context.TODO()).Debugf("Removed stale SA: %s", sa) 699 } 700 } 701 }