github.com/Heebron/moby@v0.0.0-20221111184709-6eab4f55faf7/libnetwork/osl/namespace_linux.go (about) 1 package osl 2 3 import ( 4 "errors" 5 "fmt" 6 "net" 7 "os" 8 "os/exec" 9 "path/filepath" 10 "runtime" 11 "strconv" 12 "strings" 13 "sync" 14 "syscall" 15 "time" 16 17 "github.com/docker/docker/libnetwork/ns" 18 "github.com/docker/docker/libnetwork/osl/kernel" 19 "github.com/docker/docker/libnetwork/types" 20 "github.com/docker/docker/pkg/reexec" 21 "github.com/sirupsen/logrus" 22 "github.com/vishvananda/netlink" 23 "github.com/vishvananda/netns" 24 "golang.org/x/sys/unix" 25 ) 26 27 const defaultPrefix = "/var/run/docker" 28 29 func init() { 30 reexec.Register("set-ipv6", reexecSetIPv6) 31 32 // Lock main() to the initial thread to exclude the goroutines spawned 33 // by func (*networkNamespace) InvokeFunc() from being scheduled onto 34 // that thread. Changes to the network namespace of the initial thread 35 // alter /proc/self/ns/net, which would break any code which 36 // (incorrectly) assumes that that file is a handle to the network 37 // namespace for the thread it is currently executing on. 38 runtime.LockOSThread() 39 } 40 41 var ( 42 once sync.Once 43 garbagePathMap = make(map[string]bool) 44 gpmLock sync.Mutex 45 gpmWg sync.WaitGroup 46 gpmCleanupPeriod = 60 * time.Second 47 gpmChan = make(chan chan struct{}) 48 prefix = defaultPrefix 49 ) 50 51 // The networkNamespace type is the linux implementation of the Sandbox 52 // interface. It represents a linux network namespace, and moves an interface 53 // into it when called on method AddInterface or sets the gateway etc. 54 type networkNamespace struct { 55 path string 56 iFaces []*nwIface 57 gw net.IP 58 gwv6 net.IP 59 staticRoutes []*types.StaticRoute 60 neighbors []*neigh 61 nextIfIndex map[string]int 62 isDefault bool 63 nlHandle *netlink.Handle 64 loV6Enabled bool 65 sync.Mutex 66 } 67 68 // SetBasePath sets the base url prefix for the ns path 69 func SetBasePath(path string) { 70 prefix = path 71 } 72 73 func init() { 74 reexec.Register("netns-create", reexecCreateNamespace) 75 } 76 77 func basePath() string { 78 return filepath.Join(prefix, "netns") 79 } 80 81 func createBasePath() { 82 err := os.MkdirAll(basePath(), 0755) 83 if err != nil { 84 panic("Could not create net namespace path directory") 85 } 86 87 // Start the garbage collection go routine 88 go removeUnusedPaths() 89 } 90 91 func removeUnusedPaths() { 92 gpmLock.Lock() 93 period := gpmCleanupPeriod 94 gpmLock.Unlock() 95 96 ticker := time.NewTicker(period) 97 for { 98 var ( 99 gc chan struct{} 100 gcOk bool 101 ) 102 103 select { 104 case <-ticker.C: 105 case gc, gcOk = <-gpmChan: 106 } 107 108 gpmLock.Lock() 109 pathList := make([]string, 0, len(garbagePathMap)) 110 for path := range garbagePathMap { 111 pathList = append(pathList, path) 112 } 113 garbagePathMap = make(map[string]bool) 114 gpmWg.Add(1) 115 gpmLock.Unlock() 116 117 for _, path := range pathList { 118 os.Remove(path) 119 } 120 121 gpmWg.Done() 122 if gcOk { 123 close(gc) 124 } 125 } 126 } 127 128 func addToGarbagePaths(path string) { 129 gpmLock.Lock() 130 garbagePathMap[path] = true 131 gpmLock.Unlock() 132 } 133 134 func removeFromGarbagePaths(path string) { 135 gpmLock.Lock() 136 delete(garbagePathMap, path) 137 gpmLock.Unlock() 138 } 139 140 // GC triggers garbage collection of namespace path right away 141 // and waits for it. 142 func GC() { 143 gpmLock.Lock() 144 if len(garbagePathMap) == 0 { 145 // No need for GC if map is empty 146 gpmLock.Unlock() 147 return 148 } 149 gpmLock.Unlock() 150 151 // if content exists in the garbage paths 152 // we can trigger GC to run, providing a 153 // channel to be notified on completion 154 waitGC := make(chan struct{}) 155 gpmChan <- waitGC 156 // wait for GC completion 157 <-waitGC 158 } 159 160 // GenerateKey generates a sandbox key based on the passed 161 // container id. 162 func GenerateKey(containerID string) string { 163 maxLen := 12 164 // Read sandbox key from host for overlay 165 if strings.HasPrefix(containerID, "-") { 166 var ( 167 index int 168 indexStr string 169 tmpkey string 170 ) 171 dir, err := os.ReadDir(basePath()) 172 if err != nil { 173 return "" 174 } 175 176 for _, v := range dir { 177 id := v.Name() 178 if strings.HasSuffix(id, containerID[:maxLen-1]) { 179 indexStr = strings.TrimSuffix(id, containerID[:maxLen-1]) 180 tmpindex, err := strconv.Atoi(indexStr) 181 if err != nil { 182 return "" 183 } 184 if tmpindex > index { 185 index = tmpindex 186 tmpkey = id 187 } 188 } 189 } 190 containerID = tmpkey 191 if containerID == "" { 192 return "" 193 } 194 } 195 196 if len(containerID) < maxLen { 197 maxLen = len(containerID) 198 } 199 200 return basePath() + "/" + containerID[:maxLen] 201 } 202 203 // NewSandbox provides a new sandbox instance created in an os specific way 204 // provided a key which uniquely identifies the sandbox 205 func NewSandbox(key string, osCreate, isRestore bool) (Sandbox, error) { 206 if !isRestore { 207 err := createNetworkNamespace(key, osCreate) 208 if err != nil { 209 return nil, err 210 } 211 } else { 212 once.Do(createBasePath) 213 } 214 215 n := &networkNamespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)} 216 217 sboxNs, err := netns.GetFromPath(n.path) 218 if err != nil { 219 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 220 } 221 defer sboxNs.Close() 222 223 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 224 if err != nil { 225 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 226 } 227 228 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 229 if err != nil { 230 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 231 } 232 // In live-restore mode, IPV6 entries are getting cleaned up due to below code 233 // We should retain IPV6 configurations in live-restore mode when Docker Daemon 234 // comes back. It should work as it is on other cases 235 // As starting point, disable IPv6 on all interfaces 236 if !isRestore && !n.isDefault { 237 err = setIPv6(n.path, "all", false) 238 if err != nil { 239 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 240 } 241 } 242 243 if err = n.loopbackUp(); err != nil { 244 n.nlHandle.Close() 245 return nil, err 246 } 247 248 return n, nil 249 } 250 251 func (n *networkNamespace) InterfaceOptions() IfaceOptionSetter { 252 return n 253 } 254 255 func (n *networkNamespace) NeighborOptions() NeighborOptionSetter { 256 return n 257 } 258 259 func mountNetworkNamespace(basePath string, lnPath string) error { 260 return syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "") 261 } 262 263 // GetSandboxForExternalKey returns sandbox object for the supplied path 264 func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) { 265 if err := createNamespaceFile(key); err != nil { 266 return nil, err 267 } 268 269 if err := mountNetworkNamespace(basePath, key); err != nil { 270 return nil, err 271 } 272 n := &networkNamespace{path: key, nextIfIndex: make(map[string]int)} 273 274 sboxNs, err := netns.GetFromPath(n.path) 275 if err != nil { 276 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 277 } 278 defer sboxNs.Close() 279 280 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 281 if err != nil { 282 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 283 } 284 285 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 286 if err != nil { 287 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 288 } 289 290 // As starting point, disable IPv6 on all interfaces 291 err = setIPv6(n.path, "all", false) 292 if err != nil { 293 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 294 } 295 296 if err = n.loopbackUp(); err != nil { 297 n.nlHandle.Close() 298 return nil, err 299 } 300 301 return n, nil 302 } 303 304 func reexecCreateNamespace() { 305 if len(os.Args) < 2 { 306 logrus.Fatal("no namespace path provided") 307 } 308 if err := mountNetworkNamespace("/proc/self/ns/net", os.Args[1]); err != nil { 309 logrus.Fatal(err) 310 } 311 } 312 313 func createNetworkNamespace(path string, osCreate bool) error { 314 if err := createNamespaceFile(path); err != nil { 315 return err 316 } 317 318 cmd := &exec.Cmd{ 319 Path: reexec.Self(), 320 Args: append([]string{"netns-create"}, path), 321 Stdout: os.Stdout, 322 Stderr: os.Stderr, 323 } 324 if osCreate { 325 cmd.SysProcAttr = &syscall.SysProcAttr{} 326 cmd.SysProcAttr.Cloneflags = syscall.CLONE_NEWNET 327 } 328 if err := cmd.Run(); err != nil { 329 return fmt.Errorf("namespace creation reexec command failed: %v", err) 330 } 331 332 return nil 333 } 334 335 func unmountNamespaceFile(path string) { 336 if _, err := os.Stat(path); err == nil { 337 if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) { 338 logrus.WithError(err).Error("Error unmounting namespace file") 339 } 340 } 341 } 342 343 func createNamespaceFile(path string) (err error) { 344 var f *os.File 345 346 once.Do(createBasePath) 347 // Remove it from garbage collection list if present 348 removeFromGarbagePaths(path) 349 350 // If the path is there unmount it first 351 unmountNamespaceFile(path) 352 353 // wait for garbage collection to complete if it is in progress 354 // before trying to create the file. 355 gpmWg.Wait() 356 357 if f, err = os.Create(path); err == nil { 358 f.Close() 359 } 360 361 return err 362 } 363 364 func (n *networkNamespace) loopbackUp() error { 365 iface, err := n.nlHandle.LinkByName("lo") 366 if err != nil { 367 return err 368 } 369 return n.nlHandle.LinkSetUp(iface) 370 } 371 372 func (n *networkNamespace) GetLoopbackIfaceName() string { 373 return "lo" 374 } 375 376 func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error { 377 iface, err := n.nlHandle.LinkByName(ifName) 378 if err != nil { 379 return err 380 } 381 return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip}) 382 } 383 384 func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error { 385 iface, err := n.nlHandle.LinkByName(ifName) 386 if err != nil { 387 return err 388 } 389 return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip}) 390 } 391 392 func (n *networkNamespace) DisableARPForVIP(srcName string) (Err error) { 393 dstName := "" 394 for _, i := range n.Interfaces() { 395 if i.SrcName() == srcName { 396 dstName = i.DstName() 397 break 398 } 399 } 400 if dstName == "" { 401 return fmt.Errorf("failed to find interface %s in sandbox", srcName) 402 } 403 404 err := n.InvokeFunc(func() { 405 path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore") 406 if err := os.WriteFile(path, []byte{'1', '\n'}, 0644); err != nil { 407 Err = fmt.Errorf("Failed to set %s to 1: %v", path, err) 408 return 409 } 410 path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce") 411 if err := os.WriteFile(path, []byte{'2', '\n'}, 0644); err != nil { 412 Err = fmt.Errorf("Failed to set %s to 2: %v", path, err) 413 return 414 } 415 }) 416 if err != nil { 417 return err 418 } 419 return 420 } 421 422 func (n *networkNamespace) InvokeFunc(f func()) error { 423 origNS, err := netns.Get() 424 if err != nil { 425 return fmt.Errorf("failed to get original network namespace: %w", err) 426 } 427 defer origNS.Close() 428 429 path := n.nsPath() 430 newNS, err := netns.GetFromPath(path) 431 if err != nil { 432 return fmt.Errorf("failed get network namespace %q: %w", path, err) 433 } 434 defer newNS.Close() 435 436 done := make(chan error, 1) 437 go func() { 438 runtime.LockOSThread() 439 if err := netns.Set(newNS); err != nil { 440 runtime.UnlockOSThread() 441 done <- err 442 return 443 } 444 defer func() { 445 close(done) 446 if err := netns.Set(origNS); err != nil { 447 logrus.WithError(err).Warn("failed to restore thread's network namespace") 448 // Recover from the error by leaving this goroutine locked to 449 // the thread. The runtime will terminate the thread and replace 450 // it with a clean one when this goroutine returns. 451 } else { 452 runtime.UnlockOSThread() 453 } 454 }() 455 f() 456 }() 457 return <-done 458 } 459 460 func (n *networkNamespace) nsPath() string { 461 n.Lock() 462 defer n.Unlock() 463 464 return n.path 465 } 466 467 func (n *networkNamespace) Info() Info { 468 return n 469 } 470 471 func (n *networkNamespace) Key() string { 472 return n.path 473 } 474 475 func (n *networkNamespace) Destroy() error { 476 if n.nlHandle != nil { 477 n.nlHandle.Close() 478 } 479 // Assuming no running process is executing in this network namespace, 480 // unmounting is sufficient to destroy it. 481 if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil { 482 return err 483 } 484 485 // Stash it into the garbage collection list 486 addToGarbagePaths(n.path) 487 return nil 488 } 489 490 // Restore restore the network namespace 491 func (n *networkNamespace) Restore(ifsopt map[string][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error { 492 // restore interfaces 493 for name, opts := range ifsopt { 494 if !strings.Contains(name, "+") { 495 return fmt.Errorf("wrong iface name in restore osl sandbox interface: %s", name) 496 } 497 seps := strings.Split(name, "+") 498 srcName := seps[0] 499 dstPrefix := seps[1] 500 i := &nwIface{srcName: srcName, dstName: dstPrefix, ns: n} 501 i.processInterfaceOptions(opts...) 502 if i.master != "" { 503 i.dstMaster = n.findDst(i.master, true) 504 if i.dstMaster == "" { 505 return fmt.Errorf("could not find an appropriate master %q for %q", 506 i.master, i.srcName) 507 } 508 } 509 if n.isDefault { 510 i.dstName = i.srcName 511 } else { 512 links, err := n.nlHandle.LinkList() 513 if err != nil { 514 return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path) 515 } 516 // due to the docker network connect/disconnect, so the dstName should 517 // restore from the namespace 518 for _, link := range links { 519 addrs, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4) 520 if err != nil { 521 return err 522 } 523 ifaceName := link.Attrs().Name 524 if strings.HasPrefix(ifaceName, "vxlan") { 525 if i.dstName == "vxlan" { 526 i.dstName = ifaceName 527 break 528 } 529 } 530 // find the interface name by ip 531 if i.address != nil { 532 for _, addr := range addrs { 533 if addr.IPNet.String() == i.address.String() { 534 i.dstName = ifaceName 535 break 536 } 537 continue 538 } 539 if i.dstName == ifaceName { 540 break 541 } 542 } 543 // This is to find the interface name of the pair in overlay sandbox 544 if strings.HasPrefix(ifaceName, "veth") { 545 if i.master != "" && i.dstName == "veth" { 546 i.dstName = ifaceName 547 } 548 } 549 } 550 551 var index int 552 indexStr := strings.TrimPrefix(i.dstName, dstPrefix) 553 if indexStr != "" { 554 index, err = strconv.Atoi(indexStr) 555 if err != nil { 556 return err 557 } 558 } 559 index++ 560 n.Lock() 561 if index > n.nextIfIndex[dstPrefix] { 562 n.nextIfIndex[dstPrefix] = index 563 } 564 n.iFaces = append(n.iFaces, i) 565 n.Unlock() 566 } 567 } 568 569 // restore routes 570 for _, r := range routes { 571 n.Lock() 572 n.staticRoutes = append(n.staticRoutes, r) 573 n.Unlock() 574 } 575 576 // restore gateway 577 if len(gw) > 0 { 578 n.Lock() 579 n.gw = gw 580 n.Unlock() 581 } 582 583 if len(gw6) > 0 { 584 n.Lock() 585 n.gwv6 = gw6 586 n.Unlock() 587 } 588 589 return nil 590 } 591 592 // Checks whether IPv6 needs to be enabled/disabled on the loopback interface 593 func (n *networkNamespace) checkLoV6() { 594 var ( 595 enable = false 596 action = "disable" 597 ) 598 599 n.Lock() 600 for _, iface := range n.iFaces { 601 if iface.AddressIPv6() != nil { 602 enable = true 603 action = "enable" 604 break 605 } 606 } 607 n.Unlock() 608 609 if n.loV6Enabled == enable { 610 return 611 } 612 613 if err := setIPv6(n.path, "lo", enable); err != nil { 614 logrus.Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err) 615 } 616 617 n.loV6Enabled = enable 618 } 619 620 func reexecSetIPv6() { 621 runtime.LockOSThread() 622 defer runtime.UnlockOSThread() 623 624 if len(os.Args) < 3 { 625 logrus.Errorf("invalid number of arguments for %s", os.Args[0]) 626 os.Exit(1) 627 } 628 629 ns, err := netns.GetFromPath(os.Args[1]) 630 if err != nil { 631 logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err) 632 os.Exit(2) 633 } 634 defer ns.Close() 635 636 if err = netns.Set(ns); err != nil { 637 logrus.Errorf("setting into container netns %q failed: %v", os.Args[1], err) 638 os.Exit(3) 639 } 640 641 var ( 642 action = "disable" 643 value = byte('1') 644 path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", os.Args[2]) 645 ) 646 647 if os.Args[3] == "true" { 648 action = "enable" 649 value = byte('0') 650 } 651 652 if _, err := os.Stat(path); err != nil { 653 if os.IsNotExist(err) { 654 logrus.Warnf("file does not exist: %s : %v Has IPv6 been disabled in this node's kernel?", path, err) 655 os.Exit(0) 656 } 657 logrus.Errorf("failed to stat %s : %v", path, err) 658 os.Exit(5) 659 } 660 661 if err = os.WriteFile(path, []byte{value, '\n'}, 0644); err != nil { 662 logrus.Errorf("failed to %s IPv6 forwarding for container's interface %s: %v", action, os.Args[2], err) 663 os.Exit(4) 664 } 665 666 os.Exit(0) 667 } 668 669 func setIPv6(path, iface string, enable bool) error { 670 cmd := &exec.Cmd{ 671 Path: reexec.Self(), 672 Args: append([]string{"set-ipv6"}, path, iface, strconv.FormatBool(enable)), 673 Stdout: os.Stdout, 674 Stderr: os.Stderr, 675 } 676 if err := cmd.Run(); err != nil { 677 return fmt.Errorf("reexec to set IPv6 failed: %v", err) 678 } 679 return nil 680 } 681 682 // ApplyOSTweaks applies linux configs on the sandbox 683 func (n *networkNamespace) ApplyOSTweaks(types []SandboxType) { 684 for _, t := range types { 685 switch t { 686 case SandboxTypeLoadBalancer, SandboxTypeIngress: 687 kernel.ApplyOSTweaks(map[string]*kernel.OSValue{ 688 // disables any special handling on port reuse of existing IPVS connection table entries 689 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32 690 "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil}, 691 // expires connection from the IPVS connection table when the backend is not available 692 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133 693 "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil}, 694 // expires persistent connections to destination servers with weights set to 0 695 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151 696 "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil}, 697 }) 698 } 699 } 700 }