github.com/moby/docker@v26.1.3+incompatible/libnetwork/osl/namespace_linux.go (about) 1 package osl 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "net" 8 "os" 9 "path/filepath" 10 "runtime" 11 "strconv" 12 "strings" 13 "sync" 14 "syscall" 15 "time" 16 17 "github.com/containerd/log" 18 "github.com/docker/docker/internal/unshare" 19 "github.com/docker/docker/libnetwork/ns" 20 "github.com/docker/docker/libnetwork/osl/kernel" 21 "github.com/docker/docker/libnetwork/types" 22 "github.com/vishvananda/netlink" 23 "github.com/vishvananda/netlink/nl" 24 "github.com/vishvananda/netns" 25 "golang.org/x/sys/unix" 26 ) 27 28 const defaultPrefix = "/var/run/docker" 29 30 func init() { 31 // Lock main() to the initial thread to exclude the goroutines spawned 32 // by func (*Namespace) InvokeFunc() or func setIPv6() below from 33 // being scheduled onto that thread. Changes to the network namespace of 34 // the initial thread alter /proc/self/ns/net, which would break any 35 // code which (incorrectly) assumes that the file is the network 36 // namespace for the thread it is currently executing on. 37 runtime.LockOSThread() 38 } 39 40 var ( 41 once sync.Once 42 garbagePathMap = make(map[string]bool) 43 gpmLock sync.Mutex 44 gpmWg sync.WaitGroup 45 gpmCleanupPeriod = 60 * time.Second 46 gpmChan = make(chan chan struct{}) 47 netnsBasePath = filepath.Join(defaultPrefix, "netns") 48 ) 49 50 // SetBasePath sets the base url prefix for the ns path 51 func SetBasePath(path string) { 52 netnsBasePath = filepath.Join(path, "netns") 53 } 54 55 func basePath() string { 56 return netnsBasePath 57 } 58 59 func createBasePath() { 60 err := os.MkdirAll(basePath(), 0o755) 61 if err != nil { 62 panic("Could not create net namespace path directory") 63 } 64 65 // Start the garbage collection go routine 66 go removeUnusedPaths() 67 } 68 69 func removeUnusedPaths() { 70 gpmLock.Lock() 71 period := gpmCleanupPeriod 72 gpmLock.Unlock() 73 74 ticker := time.NewTicker(period) 75 for { 76 var ( 77 gc chan struct{} 78 gcOk bool 79 ) 80 81 select { 82 case <-ticker.C: 83 case gc, gcOk = <-gpmChan: 84 } 85 86 gpmLock.Lock() 87 pathList := make([]string, 0, len(garbagePathMap)) 88 for path := range garbagePathMap { 89 pathList = append(pathList, path) 90 } 91 garbagePathMap = make(map[string]bool) 92 gpmWg.Add(1) 93 gpmLock.Unlock() 94 95 for _, path := range pathList { 96 os.Remove(path) 97 } 98 99 gpmWg.Done() 100 if gcOk { 101 close(gc) 102 } 103 } 104 } 105 106 func addToGarbagePaths(path string) { 107 gpmLock.Lock() 108 garbagePathMap[path] = true 109 gpmLock.Unlock() 110 } 111 112 func removeFromGarbagePaths(path string) { 113 gpmLock.Lock() 114 delete(garbagePathMap, path) 115 gpmLock.Unlock() 116 } 117 118 // GC triggers garbage collection of namespace path right away 119 // and waits for it. 120 func GC() { 121 gpmLock.Lock() 122 if len(garbagePathMap) == 0 { 123 // No need for GC if map is empty 124 gpmLock.Unlock() 125 return 126 } 127 gpmLock.Unlock() 128 129 // if content exists in the garbage paths 130 // we can trigger GC to run, providing a 131 // channel to be notified on completion 132 waitGC := make(chan struct{}) 133 gpmChan <- waitGC 134 // wait for GC completion 135 <-waitGC 136 } 137 138 // GenerateKey generates a sandbox key based on the passed 139 // container id. 140 func GenerateKey(containerID string) string { 141 maxLen := 12 142 // Read sandbox key from host for overlay 143 if strings.HasPrefix(containerID, "-") { 144 var ( 145 index int 146 indexStr string 147 tmpkey string 148 ) 149 dir, err := os.ReadDir(basePath()) 150 if err != nil { 151 return "" 152 } 153 154 for _, v := range dir { 155 id := v.Name() 156 if strings.HasSuffix(id, containerID[:maxLen-1]) { 157 indexStr = strings.TrimSuffix(id, containerID[:maxLen-1]) 158 tmpindex, err := strconv.Atoi(indexStr) 159 if err != nil { 160 return "" 161 } 162 if tmpindex > index { 163 index = tmpindex 164 tmpkey = id 165 } 166 } 167 } 168 containerID = tmpkey 169 if containerID == "" { 170 return "" 171 } 172 } 173 174 if len(containerID) < maxLen { 175 maxLen = len(containerID) 176 } 177 178 return basePath() + "/" + containerID[:maxLen] 179 } 180 181 // NewSandbox provides a new Namespace instance created in an os specific way 182 // provided a key which uniquely identifies the sandbox. 183 func NewSandbox(key string, osCreate, isRestore bool) (*Namespace, error) { 184 if !isRestore { 185 err := createNetworkNamespace(key, osCreate) 186 if err != nil { 187 return nil, err 188 } 189 } else { 190 once.Do(createBasePath) 191 } 192 193 n := &Namespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)} 194 195 sboxNs, err := netns.GetFromPath(n.path) 196 if err != nil { 197 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 198 } 199 defer sboxNs.Close() 200 201 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 202 if err != nil { 203 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 204 } 205 206 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 207 if err != nil { 208 log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 209 } 210 211 if err = n.loopbackUp(); err != nil { 212 n.nlHandle.Close() 213 return nil, err 214 } 215 216 return n, nil 217 } 218 219 func mountNetworkNamespace(basePath string, lnPath string) error { 220 err := syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "") 221 if err != nil { 222 return fmt.Errorf("bind-mount %s -> %s: %w", basePath, lnPath, err) 223 } 224 return nil 225 } 226 227 // GetSandboxForExternalKey returns sandbox object for the supplied path 228 func GetSandboxForExternalKey(basePath string, key string) (*Namespace, error) { 229 if err := createNamespaceFile(key); err != nil { 230 return nil, err 231 } 232 233 if err := mountNetworkNamespace(basePath, key); err != nil { 234 return nil, err 235 } 236 n := &Namespace{path: key, nextIfIndex: make(map[string]int)} 237 238 sboxNs, err := netns.GetFromPath(n.path) 239 if err != nil { 240 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 241 } 242 defer sboxNs.Close() 243 244 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 245 if err != nil { 246 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 247 } 248 249 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 250 if err != nil { 251 log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 252 } 253 254 if err = n.loopbackUp(); err != nil { 255 n.nlHandle.Close() 256 return nil, err 257 } 258 259 return n, nil 260 } 261 262 func createNetworkNamespace(path string, osCreate bool) error { 263 if err := createNamespaceFile(path); err != nil { 264 return err 265 } 266 267 do := func() error { 268 return mountNetworkNamespace(fmt.Sprintf("/proc/self/task/%d/ns/net", unix.Gettid()), path) 269 } 270 if osCreate { 271 return unshare.Go(unix.CLONE_NEWNET, do, nil) 272 } 273 return do() 274 } 275 276 func unmountNamespaceFile(path string) { 277 if _, err := os.Stat(path); err != nil { 278 // ignore when we cannot stat the path 279 return 280 } 281 if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) { 282 log.G(context.TODO()).WithError(err).Error("Error unmounting namespace file") 283 } 284 } 285 286 func createNamespaceFile(path string) error { 287 once.Do(createBasePath) 288 // Remove it from garbage collection list if present 289 removeFromGarbagePaths(path) 290 291 // If the path is there unmount it first 292 unmountNamespaceFile(path) 293 294 // wait for garbage collection to complete if it is in progress 295 // before trying to create the file. 296 // 297 // TODO(aker): This garbage-collection was for a kernel bug in kernels 3.18-4.0.1: is this still needed on current kernels (and on kernel 3.10)? see https://github.com/moby/moby/pull/46315/commits/c0a6beba8e61d4019e1806d5241ba22007072ca2#r1331327103 298 gpmWg.Wait() 299 300 f, err := os.Create(path) 301 if err != nil { 302 return err 303 } 304 _ = f.Close() 305 return nil 306 } 307 308 // Namespace represents a network sandbox. It represents a Linux network 309 // namespace, and moves an interface into it when called on method AddInterface 310 // or sets the gateway etc. It holds a list of Interfaces, routes etc., and more 311 // can be added dynamically. 312 type Namespace struct { 313 path string 314 iFaces []*Interface 315 gw net.IP 316 gwv6 net.IP 317 staticRoutes []*types.StaticRoute 318 neighbors []*neigh 319 nextIfIndex map[string]int 320 isDefault bool 321 ipv6LoEnabledOnce sync.Once 322 ipv6LoEnabledCached bool 323 nlHandle *netlink.Handle 324 mu sync.Mutex 325 } 326 327 // Interfaces returns the collection of Interface previously added with the AddInterface 328 // method. Note that this doesn't include network interfaces added in any 329 // other way (such as the default loopback interface which is automatically 330 // created on creation of a sandbox). 331 func (n *Namespace) Interfaces() []*Interface { 332 ifaces := make([]*Interface, len(n.iFaces)) 333 copy(ifaces, n.iFaces) 334 return ifaces 335 } 336 337 func (n *Namespace) loopbackUp() error { 338 iface, err := n.nlHandle.LinkByName("lo") 339 if err != nil { 340 return err 341 } 342 return n.nlHandle.LinkSetUp(iface) 343 } 344 345 // GetLoopbackIfaceName returns the name of the loopback interface 346 func (n *Namespace) GetLoopbackIfaceName() string { 347 return "lo" 348 } 349 350 // AddAliasIP adds the passed IP address to the named interface 351 func (n *Namespace) AddAliasIP(ifName string, ip *net.IPNet) error { 352 iface, err := n.nlHandle.LinkByName(ifName) 353 if err != nil { 354 return err 355 } 356 return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip}) 357 } 358 359 // RemoveAliasIP removes the passed IP address from the named interface 360 func (n *Namespace) RemoveAliasIP(ifName string, ip *net.IPNet) error { 361 iface, err := n.nlHandle.LinkByName(ifName) 362 if err != nil { 363 return err 364 } 365 return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip}) 366 } 367 368 // DisableARPForVIP disables ARP replies and requests for VIP addresses 369 // on a particular interface. 370 func (n *Namespace) DisableARPForVIP(srcName string) (Err error) { 371 dstName := "" 372 for _, i := range n.Interfaces() { 373 if i.SrcName() == srcName { 374 dstName = i.DstName() 375 break 376 } 377 } 378 if dstName == "" { 379 return fmt.Errorf("failed to find interface %s in sandbox", srcName) 380 } 381 382 err := n.InvokeFunc(func() { 383 path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore") 384 if err := os.WriteFile(path, []byte{'1', '\n'}, 0o644); err != nil { 385 Err = fmt.Errorf("Failed to set %s to 1: %v", path, err) 386 return 387 } 388 path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce") 389 if err := os.WriteFile(path, []byte{'2', '\n'}, 0o644); err != nil { 390 Err = fmt.Errorf("Failed to set %s to 2: %v", path, err) 391 return 392 } 393 }) 394 if err != nil { 395 return err 396 } 397 return 398 } 399 400 // InvokeFunc invoke a function in the network namespace. 401 func (n *Namespace) InvokeFunc(f func()) error { 402 path := n.nsPath() 403 newNS, err := netns.GetFromPath(path) 404 if err != nil { 405 return fmt.Errorf("failed get network namespace %q: %w", path, err) 406 } 407 defer newNS.Close() 408 409 done := make(chan error, 1) 410 go func() { 411 runtime.LockOSThread() 412 // InvokeFunc() could have been called from a goroutine with 413 // tampered thread state, e.g. from another InvokeFunc() 414 // callback. The outer goroutine's thread state cannot be 415 // trusted. 416 origNS, err := netns.Get() 417 if err != nil { 418 runtime.UnlockOSThread() 419 done <- fmt.Errorf("failed to get original network namespace: %w", err) 420 return 421 } 422 defer origNS.Close() 423 424 if err := netns.Set(newNS); err != nil { 425 runtime.UnlockOSThread() 426 done <- err 427 return 428 } 429 defer func() { 430 close(done) 431 if err := netns.Set(origNS); err != nil { 432 log.G(context.TODO()).WithError(err).Warn("failed to restore thread's network namespace") 433 // Recover from the error by leaving this goroutine locked to 434 // the thread. The runtime will terminate the thread and replace 435 // it with a clean one when this goroutine returns. 436 } else { 437 runtime.UnlockOSThread() 438 } 439 }() 440 f() 441 }() 442 return <-done 443 } 444 445 func (n *Namespace) nsPath() string { 446 n.mu.Lock() 447 defer n.mu.Unlock() 448 449 return n.path 450 } 451 452 // Key returns the path where the network namespace is mounted. 453 func (n *Namespace) Key() string { 454 return n.path 455 } 456 457 // Destroy destroys the sandbox. 458 func (n *Namespace) Destroy() error { 459 if n.nlHandle != nil { 460 n.nlHandle.Close() 461 } 462 // Assuming no running process is executing in this network namespace, 463 // unmounting is sufficient to destroy it. 464 if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil { 465 return err 466 } 467 468 // Stash it into the garbage collection list 469 addToGarbagePaths(n.path) 470 return nil 471 } 472 473 // Restore restores the network namespace. 474 func (n *Namespace) Restore(interfaces map[Iface][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error { 475 // restore interfaces 476 for iface, opts := range interfaces { 477 i, err := newInterface(n, iface.SrcName, iface.DstPrefix, opts...) 478 if err != nil { 479 return err 480 } 481 if n.isDefault { 482 i.dstName = i.srcName 483 } else { 484 links, err := n.nlHandle.LinkList() 485 if err != nil { 486 return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path) 487 } 488 // due to the docker network connect/disconnect, so the dstName should 489 // restore from the namespace 490 for _, link := range links { 491 ifaceName := link.Attrs().Name 492 if i.dstName == "vxlan" && strings.HasPrefix(ifaceName, "vxlan") { 493 i.dstName = ifaceName 494 break 495 } 496 // find the interface name by ip 497 if i.address != nil { 498 addresses, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4) 499 if err != nil { 500 return err 501 } 502 for _, addr := range addresses { 503 if addr.IPNet.String() == i.address.String() { 504 i.dstName = ifaceName 505 break 506 } 507 } 508 if i.dstName == ifaceName { 509 break 510 } 511 } 512 // This is to find the interface name of the pair in overlay sandbox 513 if i.master != "" && i.dstName == "veth" && strings.HasPrefix(ifaceName, "veth") { 514 i.dstName = ifaceName 515 } 516 } 517 518 var index int 519 if idx := strings.TrimPrefix(i.dstName, iface.DstPrefix); idx != "" { 520 index, err = strconv.Atoi(idx) 521 if err != nil { 522 return fmt.Errorf("failed to restore interface in network namespace %q: invalid dstName for interface: %s: %v", n.path, i.dstName, err) 523 } 524 } 525 index++ 526 n.mu.Lock() 527 if index > n.nextIfIndex[iface.DstPrefix] { 528 n.nextIfIndex[iface.DstPrefix] = index 529 } 530 n.iFaces = append(n.iFaces, i) 531 n.mu.Unlock() 532 } 533 } 534 535 // restore routes and gateways 536 n.mu.Lock() 537 n.staticRoutes = append(n.staticRoutes, routes...) 538 if len(gw) > 0 { 539 n.gw = gw 540 } 541 if len(gw6) > 0 { 542 n.gwv6 = gw6 543 } 544 n.mu.Unlock() 545 return nil 546 } 547 548 // IPv6LoEnabled returns true if the loopback interface had an IPv6 address when 549 // last checked. It's always checked on the first call, and by RefreshIPv6LoEnabled. 550 // ('::1' is assigned by the kernel if IPv6 is enabled.) 551 func (n *Namespace) IPv6LoEnabled() bool { 552 n.ipv6LoEnabledOnce.Do(func() { 553 n.RefreshIPv6LoEnabled() 554 }) 555 n.mu.Lock() 556 defer n.mu.Unlock() 557 return n.ipv6LoEnabledCached 558 } 559 560 // RefreshIPv6LoEnabled refreshes the cached result returned by IPv6LoEnabled. 561 func (n *Namespace) RefreshIPv6LoEnabled() { 562 n.mu.Lock() 563 defer n.mu.Unlock() 564 565 // If anything goes wrong, assume no-IPv6. 566 n.ipv6LoEnabledCached = false 567 iface, err := n.nlHandle.LinkByName("lo") 568 if err != nil { 569 log.G(context.TODO()).WithError(err).Warn("Unable to find 'lo' to determine IPv6 support") 570 return 571 } 572 addrs, err := n.nlHandle.AddrList(iface, nl.FAMILY_V6) 573 if err != nil { 574 log.G(context.TODO()).WithError(err).Warn("Unable to get 'lo' addresses to determine IPv6 support") 575 return 576 } 577 n.ipv6LoEnabledCached = len(addrs) > 0 578 } 579 580 // ApplyOSTweaks applies operating system specific knobs on the sandbox. 581 func (n *Namespace) ApplyOSTweaks(types []SandboxType) { 582 for _, t := range types { 583 switch t { 584 case SandboxTypeLoadBalancer, SandboxTypeIngress: 585 kernel.ApplyOSTweaks(map[string]*kernel.OSValue{ 586 // disables any special handling on port reuse of existing IPVS connection table entries 587 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32 588 "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil}, 589 // expires connection from the IPVS connection table when the backend is not available 590 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133 591 "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil}, 592 // expires persistent connections to destination servers with weights set to 0 593 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151 594 "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil}, 595 }) 596 } 597 } 598 } 599 600 func setIPv6(nspath, iface string, enable bool) error { 601 errCh := make(chan error, 1) 602 go func() { 603 defer close(errCh) 604 605 namespace, err := netns.GetFromPath(nspath) 606 if err != nil { 607 errCh <- fmt.Errorf("failed get network namespace %q: %w", nspath, err) 608 return 609 } 610 defer namespace.Close() 611 612 runtime.LockOSThread() 613 614 origNS, err := netns.Get() 615 if err != nil { 616 runtime.UnlockOSThread() 617 errCh <- fmt.Errorf("failed to get current network namespace: %w", err) 618 return 619 } 620 defer origNS.Close() 621 622 if err = netns.Set(namespace); err != nil { 623 runtime.UnlockOSThread() 624 errCh <- fmt.Errorf("setting into container netns %q failed: %w", nspath, err) 625 return 626 } 627 defer func() { 628 if err := netns.Set(origNS); err != nil { 629 log.G(context.TODO()).WithError(err).Error("libnetwork: restoring thread network namespace failed") 630 // The error is only fatal for the current thread. Keep this 631 // goroutine locked to the thread to make the runtime replace it 632 // with a clean thread once this goroutine returns. 633 } else { 634 runtime.UnlockOSThread() 635 } 636 }() 637 638 var ( 639 action = "disable" 640 value = byte('1') 641 path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", iface) 642 ) 643 644 if enable { 645 action = "enable" 646 value = '0' 647 } 648 649 if curVal, err := os.ReadFile(path); err != nil { 650 if os.IsNotExist(err) { 651 if enable { 652 log.G(context.TODO()).WithError(err).Warn("Cannot enable IPv6 on container interface. Has IPv6 been disabled in this node's kernel?") 653 } else { 654 log.G(context.TODO()).WithError(err).Debug("Not disabling IPv6 on container interface. Has IPv6 been disabled in this node's kernel?") 655 } 656 return 657 } 658 errCh <- err 659 return 660 } else if len(curVal) > 0 && curVal[0] == value { 661 // Nothing to do, the setting is already correct. 662 return 663 } 664 665 if err = os.WriteFile(path, []byte{value, '\n'}, 0o644); err != nil || os.Getenv("DOCKER_TEST_RO_DISABLE_IPV6") != "" { 666 logger := log.G(context.TODO()).WithFields(log.Fields{ 667 "error": err, 668 "interface": iface, 669 }) 670 if enable { 671 // The user asked for IPv6 on the interface, and we can't give it to them. 672 // But, in line with the IsNotExist case above, just log. 673 logger.Warn("Cannot enable IPv6 on container interface, continuing.") 674 } else if os.Getenv("DOCKER_ALLOW_IPV6_ON_IPV4_INTERFACE") == "1" { 675 // TODO(robmry) - remove this escape hatch for https://github.com/moby/moby/issues/47751 676 // If the "/proc" file exists but isn't writable, we can't disable IPv6, which is 677 // https://github.com/moby/moby/security/advisories/GHSA-x84c-p2g9-rqv9 ... so, 678 // the user is required to override the error (or configure IPv6, or disable IPv6 679 // by default in the OS, or make the "/proc" file writable). Once it's possible 680 // to enable IPv6 without having to configure IPAM etc, the env var should be 681 // removed. Then the user will have to explicitly enable IPv6 if it can't be 682 // disabled on the interface. 683 logger.Info("Cannot disable IPv6 on container interface but DOCKER_ALLOW_IPV6_ON_IPV4_INTERFACE=1, continuing.") 684 } else { 685 logger.Error("Cannot disable IPv6 on container interface. Set env var DOCKER_ALLOW_IPV6_ON_IPV4_INTERFACE=1 to ignore.") 686 errCh <- fmt.Errorf( 687 "failed to %s IPv6 on container's interface %s, set env var DOCKER_ALLOW_IPV6_ON_IPV4_INTERFACE=1 to ignore this error", 688 action, iface) 689 } 690 return 691 } 692 }() 693 return <-errCh 694 }