github.com/rawahars/moby@v24.0.4+incompatible/libnetwork/osl/namespace_linux.go (about) 1 package osl 2 3 import ( 4 "errors" 5 "fmt" 6 "net" 7 "os" 8 "path/filepath" 9 "runtime" 10 "strconv" 11 "strings" 12 "sync" 13 "syscall" 14 "time" 15 16 "github.com/docker/docker/internal/unshare" 17 "github.com/docker/docker/libnetwork/ns" 18 "github.com/docker/docker/libnetwork/osl/kernel" 19 "github.com/docker/docker/libnetwork/types" 20 "github.com/sirupsen/logrus" 21 "github.com/vishvananda/netlink" 22 "github.com/vishvananda/netns" 23 "golang.org/x/sys/unix" 24 ) 25 26 const defaultPrefix = "/var/run/docker" 27 28 func init() { 29 // Lock main() to the initial thread to exclude the goroutines spawned 30 // by func (*networkNamespace) InvokeFunc() or func setIPv6() below from 31 // being scheduled onto that thread. Changes to the network namespace of 32 // the initial thread alter /proc/self/ns/net, which would break any 33 // code which (incorrectly) assumes that that file is the network 34 // namespace for the thread it is currently executing on. 35 runtime.LockOSThread() 36 } 37 38 var ( 39 once sync.Once 40 garbagePathMap = make(map[string]bool) 41 gpmLock sync.Mutex 42 gpmWg sync.WaitGroup 43 gpmCleanupPeriod = 60 * time.Second 44 gpmChan = make(chan chan struct{}) 45 prefix = defaultPrefix 46 ) 47 48 // The networkNamespace type is the linux implementation of the Sandbox 49 // interface. It represents a linux network namespace, and moves an interface 50 // into it when called on method AddInterface or sets the gateway etc. 51 type networkNamespace struct { 52 path string 53 iFaces []*nwIface 54 gw net.IP 55 gwv6 net.IP 56 staticRoutes []*types.StaticRoute 57 neighbors []*neigh 58 nextIfIndex map[string]int 59 isDefault bool 60 nlHandle *netlink.Handle 61 loV6Enabled bool 62 sync.Mutex 63 } 64 65 // SetBasePath sets the base url prefix for the ns path 66 func SetBasePath(path string) { 67 prefix = path 68 } 69 70 func basePath() string { 71 return filepath.Join(prefix, "netns") 72 } 73 74 func createBasePath() { 75 err := os.MkdirAll(basePath(), 0755) 76 if err != nil { 77 panic("Could not create net namespace path directory") 78 } 79 80 // Start the garbage collection go routine 81 go removeUnusedPaths() 82 } 83 84 func removeUnusedPaths() { 85 gpmLock.Lock() 86 period := gpmCleanupPeriod 87 gpmLock.Unlock() 88 89 ticker := time.NewTicker(period) 90 for { 91 var ( 92 gc chan struct{} 93 gcOk bool 94 ) 95 96 select { 97 case <-ticker.C: 98 case gc, gcOk = <-gpmChan: 99 } 100 101 gpmLock.Lock() 102 pathList := make([]string, 0, len(garbagePathMap)) 103 for path := range garbagePathMap { 104 pathList = append(pathList, path) 105 } 106 garbagePathMap = make(map[string]bool) 107 gpmWg.Add(1) 108 gpmLock.Unlock() 109 110 for _, path := range pathList { 111 os.Remove(path) 112 } 113 114 gpmWg.Done() 115 if gcOk { 116 close(gc) 117 } 118 } 119 } 120 121 func addToGarbagePaths(path string) { 122 gpmLock.Lock() 123 garbagePathMap[path] = true 124 gpmLock.Unlock() 125 } 126 127 func removeFromGarbagePaths(path string) { 128 gpmLock.Lock() 129 delete(garbagePathMap, path) 130 gpmLock.Unlock() 131 } 132 133 // GC triggers garbage collection of namespace path right away 134 // and waits for it. 135 func GC() { 136 gpmLock.Lock() 137 if len(garbagePathMap) == 0 { 138 // No need for GC if map is empty 139 gpmLock.Unlock() 140 return 141 } 142 gpmLock.Unlock() 143 144 // if content exists in the garbage paths 145 // we can trigger GC to run, providing a 146 // channel to be notified on completion 147 waitGC := make(chan struct{}) 148 gpmChan <- waitGC 149 // wait for GC completion 150 <-waitGC 151 } 152 153 // GenerateKey generates a sandbox key based on the passed 154 // container id. 155 func GenerateKey(containerID string) string { 156 maxLen := 12 157 // Read sandbox key from host for overlay 158 if strings.HasPrefix(containerID, "-") { 159 var ( 160 index int 161 indexStr string 162 tmpkey string 163 ) 164 dir, err := os.ReadDir(basePath()) 165 if err != nil { 166 return "" 167 } 168 169 for _, v := range dir { 170 id := v.Name() 171 if strings.HasSuffix(id, containerID[:maxLen-1]) { 172 indexStr = strings.TrimSuffix(id, containerID[:maxLen-1]) 173 tmpindex, err := strconv.Atoi(indexStr) 174 if err != nil { 175 return "" 176 } 177 if tmpindex > index { 178 index = tmpindex 179 tmpkey = id 180 } 181 } 182 } 183 containerID = tmpkey 184 if containerID == "" { 185 return "" 186 } 187 } 188 189 if len(containerID) < maxLen { 190 maxLen = len(containerID) 191 } 192 193 return basePath() + "/" + containerID[:maxLen] 194 } 195 196 // NewSandbox provides a new sandbox instance created in an os specific way 197 // provided a key which uniquely identifies the sandbox 198 func NewSandbox(key string, osCreate, isRestore bool) (Sandbox, error) { 199 if !isRestore { 200 err := createNetworkNamespace(key, osCreate) 201 if err != nil { 202 return nil, err 203 } 204 } else { 205 once.Do(createBasePath) 206 } 207 208 n := &networkNamespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)} 209 210 sboxNs, err := netns.GetFromPath(n.path) 211 if err != nil { 212 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 213 } 214 defer sboxNs.Close() 215 216 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 217 if err != nil { 218 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 219 } 220 221 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 222 if err != nil { 223 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 224 } 225 // In live-restore mode, IPV6 entries are getting cleaned up due to below code 226 // We should retain IPV6 configurations in live-restore mode when Docker Daemon 227 // comes back. It should work as it is on other cases 228 // As starting point, disable IPv6 on all interfaces 229 if !isRestore && !n.isDefault { 230 err = setIPv6(n.path, "all", false) 231 if err != nil { 232 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 233 } 234 } 235 236 if err = n.loopbackUp(); err != nil { 237 n.nlHandle.Close() 238 return nil, err 239 } 240 241 return n, nil 242 } 243 244 func (n *networkNamespace) InterfaceOptions() IfaceOptionSetter { 245 return n 246 } 247 248 func (n *networkNamespace) NeighborOptions() NeighborOptionSetter { 249 return n 250 } 251 252 func mountNetworkNamespace(basePath string, lnPath string) error { 253 return syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "") 254 } 255 256 // GetSandboxForExternalKey returns sandbox object for the supplied path 257 func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) { 258 if err := createNamespaceFile(key); err != nil { 259 return nil, err 260 } 261 262 if err := mountNetworkNamespace(basePath, key); err != nil { 263 return nil, err 264 } 265 n := &networkNamespace{path: key, nextIfIndex: make(map[string]int)} 266 267 sboxNs, err := netns.GetFromPath(n.path) 268 if err != nil { 269 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 270 } 271 defer sboxNs.Close() 272 273 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 274 if err != nil { 275 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 276 } 277 278 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 279 if err != nil { 280 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 281 } 282 283 // As starting point, disable IPv6 on all interfaces 284 err = setIPv6(n.path, "all", false) 285 if err != nil { 286 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 287 } 288 289 if err = n.loopbackUp(); err != nil { 290 n.nlHandle.Close() 291 return nil, err 292 } 293 294 return n, nil 295 } 296 297 func createNetworkNamespace(path string, osCreate bool) error { 298 if err := createNamespaceFile(path); err != nil { 299 return err 300 } 301 302 do := func() error { 303 return mountNetworkNamespace(fmt.Sprintf("/proc/self/task/%d/ns/net", unix.Gettid()), path) 304 } 305 if osCreate { 306 return unshare.Go(unix.CLONE_NEWNET, do, nil) 307 } 308 return do() 309 } 310 311 func unmountNamespaceFile(path string) { 312 if _, err := os.Stat(path); err == nil { 313 if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) { 314 logrus.WithError(err).Error("Error unmounting namespace file") 315 } 316 } 317 } 318 319 func createNamespaceFile(path string) (err error) { 320 var f *os.File 321 322 once.Do(createBasePath) 323 // Remove it from garbage collection list if present 324 removeFromGarbagePaths(path) 325 326 // If the path is there unmount it first 327 unmountNamespaceFile(path) 328 329 // wait for garbage collection to complete if it is in progress 330 // before trying to create the file. 331 gpmWg.Wait() 332 333 if f, err = os.Create(path); err == nil { 334 f.Close() 335 } 336 337 return err 338 } 339 340 func (n *networkNamespace) loopbackUp() error { 341 iface, err := n.nlHandle.LinkByName("lo") 342 if err != nil { 343 return err 344 } 345 return n.nlHandle.LinkSetUp(iface) 346 } 347 348 func (n *networkNamespace) GetLoopbackIfaceName() string { 349 return "lo" 350 } 351 352 func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error { 353 iface, err := n.nlHandle.LinkByName(ifName) 354 if err != nil { 355 return err 356 } 357 return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip}) 358 } 359 360 func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error { 361 iface, err := n.nlHandle.LinkByName(ifName) 362 if err != nil { 363 return err 364 } 365 return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip}) 366 } 367 368 func (n *networkNamespace) DisableARPForVIP(srcName string) (Err error) { 369 dstName := "" 370 for _, i := range n.Interfaces() { 371 if i.SrcName() == srcName { 372 dstName = i.DstName() 373 break 374 } 375 } 376 if dstName == "" { 377 return fmt.Errorf("failed to find interface %s in sandbox", srcName) 378 } 379 380 err := n.InvokeFunc(func() { 381 path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore") 382 if err := os.WriteFile(path, []byte{'1', '\n'}, 0644); err != nil { 383 Err = fmt.Errorf("Failed to set %s to 1: %v", path, err) 384 return 385 } 386 path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce") 387 if err := os.WriteFile(path, []byte{'2', '\n'}, 0644); err != nil { 388 Err = fmt.Errorf("Failed to set %s to 2: %v", path, err) 389 return 390 } 391 }) 392 if err != nil { 393 return err 394 } 395 return 396 } 397 398 func (n *networkNamespace) InvokeFunc(f func()) error { 399 path := n.nsPath() 400 newNS, err := netns.GetFromPath(path) 401 if err != nil { 402 return fmt.Errorf("failed get network namespace %q: %w", path, err) 403 } 404 defer newNS.Close() 405 406 done := make(chan error, 1) 407 go func() { 408 runtime.LockOSThread() 409 // InvokeFunc() could have been called from a goroutine with 410 // tampered thread state, e.g. from another InvokeFunc() 411 // callback. The outer goroutine's thread state cannot be 412 // trusted. 413 origNS, err := netns.Get() 414 if err != nil { 415 runtime.UnlockOSThread() 416 done <- fmt.Errorf("failed to get original network namespace: %w", err) 417 return 418 } 419 defer origNS.Close() 420 421 if err := netns.Set(newNS); err != nil { 422 runtime.UnlockOSThread() 423 done <- err 424 return 425 } 426 defer func() { 427 close(done) 428 if err := netns.Set(origNS); err != nil { 429 logrus.WithError(err).Warn("failed to restore thread's network namespace") 430 // Recover from the error by leaving this goroutine locked to 431 // the thread. The runtime will terminate the thread and replace 432 // it with a clean one when this goroutine returns. 433 } else { 434 runtime.UnlockOSThread() 435 } 436 }() 437 f() 438 }() 439 return <-done 440 } 441 442 func (n *networkNamespace) nsPath() string { 443 n.Lock() 444 defer n.Unlock() 445 446 return n.path 447 } 448 449 func (n *networkNamespace) Info() Info { 450 return n 451 } 452 453 func (n *networkNamespace) Key() string { 454 return n.path 455 } 456 457 func (n *networkNamespace) Destroy() error { 458 if n.nlHandle != nil { 459 n.nlHandle.Close() 460 } 461 // Assuming no running process is executing in this network namespace, 462 // unmounting is sufficient to destroy it. 463 if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil { 464 return err 465 } 466 467 // Stash it into the garbage collection list 468 addToGarbagePaths(n.path) 469 return nil 470 } 471 472 // Restore restore the network namespace 473 func (n *networkNamespace) Restore(ifsopt map[Iface][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error { 474 // restore interfaces 475 for name, opts := range ifsopt { 476 i := &nwIface{srcName: name.SrcName, dstName: name.DstPrefix, ns: n} 477 i.processInterfaceOptions(opts...) 478 if i.master != "" { 479 i.dstMaster = n.findDst(i.master, true) 480 if i.dstMaster == "" { 481 return fmt.Errorf("could not find an appropriate master %q for %q", 482 i.master, i.srcName) 483 } 484 } 485 if n.isDefault { 486 i.dstName = i.srcName 487 } else { 488 links, err := n.nlHandle.LinkList() 489 if err != nil { 490 return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path) 491 } 492 // due to the docker network connect/disconnect, so the dstName should 493 // restore from the namespace 494 for _, link := range links { 495 addrs, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4) 496 if err != nil { 497 return err 498 } 499 ifaceName := link.Attrs().Name 500 if strings.HasPrefix(ifaceName, "vxlan") { 501 if i.dstName == "vxlan" { 502 i.dstName = ifaceName 503 break 504 } 505 } 506 // find the interface name by ip 507 if i.address != nil { 508 for _, addr := range addrs { 509 if addr.IPNet.String() == i.address.String() { 510 i.dstName = ifaceName 511 break 512 } 513 continue 514 } 515 if i.dstName == ifaceName { 516 break 517 } 518 } 519 // This is to find the interface name of the pair in overlay sandbox 520 if strings.HasPrefix(ifaceName, "veth") { 521 if i.master != "" && i.dstName == "veth" { 522 i.dstName = ifaceName 523 } 524 } 525 } 526 527 var index int 528 indexStr := strings.TrimPrefix(i.dstName, name.DstPrefix) 529 if indexStr != "" { 530 index, err = strconv.Atoi(indexStr) 531 if err != nil { 532 return err 533 } 534 } 535 index++ 536 n.Lock() 537 if index > n.nextIfIndex[name.DstPrefix] { 538 n.nextIfIndex[name.DstPrefix] = index 539 } 540 n.iFaces = append(n.iFaces, i) 541 n.Unlock() 542 } 543 } 544 545 // restore routes 546 for _, r := range routes { 547 n.Lock() 548 n.staticRoutes = append(n.staticRoutes, r) 549 n.Unlock() 550 } 551 552 // restore gateway 553 if len(gw) > 0 { 554 n.Lock() 555 n.gw = gw 556 n.Unlock() 557 } 558 559 if len(gw6) > 0 { 560 n.Lock() 561 n.gwv6 = gw6 562 n.Unlock() 563 } 564 565 return nil 566 } 567 568 // Checks whether IPv6 needs to be enabled/disabled on the loopback interface 569 func (n *networkNamespace) checkLoV6() { 570 var ( 571 enable = false 572 action = "disable" 573 ) 574 575 n.Lock() 576 for _, iface := range n.iFaces { 577 if iface.AddressIPv6() != nil { 578 enable = true 579 action = "enable" 580 break 581 } 582 } 583 n.Unlock() 584 585 if n.loV6Enabled == enable { 586 return 587 } 588 589 if err := setIPv6(n.path, "lo", enable); err != nil { 590 logrus.Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err) 591 } 592 593 n.loV6Enabled = enable 594 } 595 596 func setIPv6(nspath, iface string, enable bool) error { 597 errCh := make(chan error, 1) 598 go func() { 599 defer close(errCh) 600 601 namespace, err := netns.GetFromPath(nspath) 602 if err != nil { 603 errCh <- fmt.Errorf("failed get network namespace %q: %w", nspath, err) 604 return 605 } 606 defer namespace.Close() 607 608 runtime.LockOSThread() 609 610 origNS, err := netns.Get() 611 if err != nil { 612 runtime.UnlockOSThread() 613 errCh <- fmt.Errorf("failed to get current network namespace: %w", err) 614 return 615 } 616 defer origNS.Close() 617 618 if err = netns.Set(namespace); err != nil { 619 runtime.UnlockOSThread() 620 errCh <- fmt.Errorf("setting into container netns %q failed: %w", nspath, err) 621 return 622 } 623 defer func() { 624 if err := netns.Set(origNS); err != nil { 625 logrus.WithError(err).Error("libnetwork: restoring thread network namespace failed") 626 // The error is only fatal for the current thread. Keep this 627 // goroutine locked to the thread to make the runtime replace it 628 // with a clean thread once this goroutine returns. 629 } else { 630 runtime.UnlockOSThread() 631 } 632 }() 633 634 var ( 635 action = "disable" 636 value = byte('1') 637 path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", iface) 638 ) 639 640 if enable { 641 action = "enable" 642 value = '0' 643 } 644 645 if _, err := os.Stat(path); err != nil { 646 if os.IsNotExist(err) { 647 logrus.WithError(err).Warn("Cannot configure IPv6 forwarding on container interface. Has IPv6 been disabled in this node's kernel?") 648 return 649 } 650 errCh <- err 651 return 652 } 653 654 if err = os.WriteFile(path, []byte{value, '\n'}, 0o644); err != nil { 655 errCh <- fmt.Errorf("failed to %s IPv6 forwarding for container's interface %s: %w", action, iface, err) 656 return 657 } 658 }() 659 return <-errCh 660 } 661 662 // ApplyOSTweaks applies linux configs on the sandbox 663 func (n *networkNamespace) ApplyOSTweaks(types []SandboxType) { 664 for _, t := range types { 665 switch t { 666 case SandboxTypeLoadBalancer, SandboxTypeIngress: 667 kernel.ApplyOSTweaks(map[string]*kernel.OSValue{ 668 // disables any special handling on port reuse of existing IPVS connection table entries 669 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32 670 "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil}, 671 // expires connection from the IPVS connection table when the backend is not available 672 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133 673 "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil}, 674 // expires persistent connections to destination servers with weights set to 0 675 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151 676 "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil}, 677 }) 678 } 679 } 680 }