// Source: github.com/docker/engine@v22.0.0-20211208180946-d456264580cf+incompatible/libnetwork/osl/namespace_linux.go

package osl

import (
	"errors"
	"fmt"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"time"

	"github.com/docker/docker/libnetwork/ns"
	"github.com/docker/docker/libnetwork/osl/kernel"
	"github.com/docker/docker/libnetwork/types"
	"github.com/docker/docker/pkg/reexec"
	"github.com/sirupsen/logrus"
	"github.com/vishvananda/netlink"
	"github.com/vishvananda/netns"
	"golang.org/x/sys/unix"
)

// defaultPrefix is the initial value of prefix, the directory under which
// basePath places the "netns" directory.
const defaultPrefix = "/var/run/docker"

// init registers reexecSetIPv6 under the re-exec command name "set-ipv6".
func init() {
	reexec.Register("set-ipv6", reexecSetIPv6)
}

// Package-level state.
//
//   - once runs createBasePath a single time (see NewSandbox and
//     createNamespaceFile).
//   - garbagePathMap holds namespace file paths waiting to be removed by
//     removeUnusedPaths; gpmLock guards it.
//   - gpmWg is held (Add/Done) by removeUnusedPaths while it deletes a batch
//     of paths; createNamespaceFile waits on it before creating a file.
//   - gpmCleanupPeriod is the ticker interval of removeUnusedPaths; it is
//     read once, under gpmLock, when that goroutine starts.
//   - gpmChan carries a completion channel from GC to removeUnusedPaths,
//     which closes it once the triggered pass is done.
//   - prefix is the base directory used by basePath; SetBasePath replaces it.
//   - loadBalancerConfig is the set of sysctl values handed to
//     kernel.ApplyOSTweaks for load-balancer sandboxes.
var (
	once               sync.Once
	garbagePathMap     = make(map[string]bool)
	gpmLock            sync.Mutex
	gpmWg              sync.WaitGroup
	gpmCleanupPeriod   = 60 * time.Second
	gpmChan            = make(chan chan struct{})
	prefix             = defaultPrefix
	loadBalancerConfig = map[string]*kernel.OSValue{
		// disables any special handling on port reuse of existing IPVS connection table entries
		// more info: https://github.com/torvalds/linux/blob/master/Documentation/networking/ipvs-sysctl.txt#L25:1
		"net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil},
		// expires connection from the IPVS connection table when the backend is not available
		// more info: https://github.com/torvalds/linux/blob/master/Documentation/networking/ipvs-sysctl.txt#L126:1
		"net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil},
		// expires persistent connections to destination servers with weights set to 0
		// more info: https://github.com/torvalds/linux/blob/master/Documentation/networking/ipvs-sysctl.txt#L144:1
		"net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil},
	}
)

// The networkNamespace type is the linux implementation of the Sandbox
// interface. It represents a linux network namespace, and moves an interface
// into it when called on method AddInterface or sets the gateway etc.
//
// The embedded sync.Mutex is taken by the methods in this file around
// accesses to path (nsPath), iFaces, nextIfIndex, staticRoutes, gw and gwv6.
type networkNamespace struct {
	path         string // namespace file path; also returned by Key
	iFaces       []*nwIface
	gw           net.IP // gateway recorded by Restore
	gwv6         net.IP // IPv6 gateway recorded by Restore
	staticRoutes []*types.StaticRoute
	neighbors    []*neigh
	nextIfIndex  map[string]int // keyed by destination-name prefix; updated by Restore
	isDefault    bool           // set to !osCreate by NewSandbox
	nlHandle     *netlink.Handle
	loV6Enabled  bool // last state applied to "lo" by checkLoV6
	sync.Mutex
}

// SetBasePath sets the base url prefix for the ns path
func SetBasePath(path string) {
	prefix = path
}

// init registers reexecCreateNamespace under the re-exec command name
// "netns-create".
func init() {
	reexec.Register("netns-create", reexecCreateNamespace)
}

// basePath returns the "netns" directory under the current prefix.
func basePath() string {
	return filepath.Join(prefix, "netns")
}

// createBasePath creates the basePath directory (and any missing parents)
// and starts the removeUnusedPaths goroutine. It panics if the directory
// cannot be created. Callers in this file invoke it through once.Do.
func createBasePath() {
	err := os.MkdirAll(basePath(), 0755)
	if err != nil {
		panic("Could not create net namespace path directory")
	}

	// Start the garbage collection go routine
	go removeUnusedPaths()
}

// removeUnusedPaths loops forever, removing the files listed in
// garbagePathMap. A pass starts either on a ticker tick (every
// gpmCleanupPeriod, sampled once at start-up) or when GC sends a channel on
// gpmChan; in the latter case that channel is closed after the pass.
func removeUnusedPaths() {
	gpmLock.Lock()
	period := gpmCleanupPeriod
	gpmLock.Unlock()

	ticker := time.NewTicker(period)
	for {
		var (
			gc   chan struct{}
			gcOk bool
		)

		// gcOk stays false for a ticker-triggered pass, so nothing is closed
		// at the end of the iteration in that case.
		select {
		case <-ticker.C:
		case gc, gcOk = <-gpmChan:
		}

		// Snapshot and reset the map under the lock. gpmWg.Add happens before
		// the lock is released so createNamespaceFile's gpmWg.Wait observes
		// the pass that is about to delete files.
		gpmLock.Lock()
		pathList := make([]string, 0, len(garbagePathMap))
		for path := range garbagePathMap {
			pathList = append(pathList, path)
		}
		garbagePathMap = make(map[string]bool)
		gpmWg.Add(1)
		gpmLock.Unlock()

		// Removal errors are ignored.
		for _, path := range pathList {
			os.Remove(path)
		}

		gpmWg.Done()
		if gcOk {
			close(gc)
		}
	}
}

// addToGarbagePaths marks path for removal by removeUnusedPaths.
func addToGarbagePaths(path string) {
	gpmLock.Lock()
	garbagePathMap[path] = true
	gpmLock.Unlock()
}

// removeFromGarbagePaths drops path from the pending-removal set, if present.
func removeFromGarbagePaths(path string) {
	gpmLock.Lock()
	delete(garbagePathMap, path)
	gpmLock.Unlock()
}

// GC triggers garbage collection of namespace path right away
// and waits for it.
145 func GC() { 146 gpmLock.Lock() 147 if len(garbagePathMap) == 0 { 148 // No need for GC if map is empty 149 gpmLock.Unlock() 150 return 151 } 152 gpmLock.Unlock() 153 154 // if content exists in the garbage paths 155 // we can trigger GC to run, providing a 156 // channel to be notified on completion 157 waitGC := make(chan struct{}) 158 gpmChan <- waitGC 159 // wait for GC completion 160 <-waitGC 161 } 162 163 // GenerateKey generates a sandbox key based on the passed 164 // container id. 165 func GenerateKey(containerID string) string { 166 maxLen := 12 167 // Read sandbox key from host for overlay 168 if strings.HasPrefix(containerID, "-") { 169 var ( 170 index int 171 indexStr string 172 tmpkey string 173 ) 174 dir, err := os.ReadDir(basePath()) 175 if err != nil { 176 return "" 177 } 178 179 for _, v := range dir { 180 id := v.Name() 181 if strings.HasSuffix(id, containerID[:maxLen-1]) { 182 indexStr = strings.TrimSuffix(id, containerID[:maxLen-1]) 183 tmpindex, err := strconv.Atoi(indexStr) 184 if err != nil { 185 return "" 186 } 187 if tmpindex > index { 188 index = tmpindex 189 tmpkey = id 190 } 191 192 } 193 } 194 containerID = tmpkey 195 if containerID == "" { 196 return "" 197 } 198 } 199 200 if len(containerID) < maxLen { 201 maxLen = len(containerID) 202 } 203 204 return basePath() + "/" + containerID[:maxLen] 205 } 206 207 // NewSandbox provides a new sandbox instance created in an os specific way 208 // provided a key which uniquely identifies the sandbox 209 func NewSandbox(key string, osCreate, isRestore bool) (Sandbox, error) { 210 if !isRestore { 211 err := createNetworkNamespace(key, osCreate) 212 if err != nil { 213 return nil, err 214 } 215 } else { 216 once.Do(createBasePath) 217 } 218 219 n := &networkNamespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)} 220 221 sboxNs, err := netns.GetFromPath(n.path) 222 if err != nil { 223 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 224 } 225 defer 
sboxNs.Close() 226 227 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 228 if err != nil { 229 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 230 } 231 232 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 233 if err != nil { 234 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 235 } 236 // In live-restore mode, IPV6 entries are getting cleaned up due to below code 237 // We should retain IPV6 configurations in live-restore mode when Docker Daemon 238 // comes back. It should work as it is on other cases 239 // As starting point, disable IPv6 on all interfaces 240 if !isRestore && !n.isDefault { 241 err = setIPv6(n.path, "all", false) 242 if err != nil { 243 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 244 } 245 } 246 247 if err = n.loopbackUp(); err != nil { 248 n.nlHandle.Delete() 249 return nil, err 250 } 251 252 return n, nil 253 } 254 255 func (n *networkNamespace) InterfaceOptions() IfaceOptionSetter { 256 return n 257 } 258 259 func (n *networkNamespace) NeighborOptions() NeighborOptionSetter { 260 return n 261 } 262 263 func mountNetworkNamespace(basePath string, lnPath string) error { 264 return syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "") 265 } 266 267 // GetSandboxForExternalKey returns sandbox object for the supplied path 268 func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) { 269 if err := createNamespaceFile(key); err != nil { 270 return nil, err 271 } 272 273 if err := mountNetworkNamespace(basePath, key); err != nil { 274 return nil, err 275 } 276 n := &networkNamespace{path: key, nextIfIndex: make(map[string]int)} 277 278 sboxNs, err := netns.GetFromPath(n.path) 279 if err != nil { 280 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 281 } 282 defer sboxNs.Close() 283 284 n.nlHandle, err = netlink.NewHandleAt(sboxNs, 
syscall.NETLINK_ROUTE) 285 if err != nil { 286 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 287 } 288 289 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 290 if err != nil { 291 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 292 } 293 294 // As starting point, disable IPv6 on all interfaces 295 err = setIPv6(n.path, "all", false) 296 if err != nil { 297 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 298 } 299 300 if err = n.loopbackUp(); err != nil { 301 n.nlHandle.Delete() 302 return nil, err 303 } 304 305 return n, nil 306 } 307 308 func reexecCreateNamespace() { 309 if len(os.Args) < 2 { 310 logrus.Fatal("no namespace path provided") 311 } 312 if err := mountNetworkNamespace("/proc/self/ns/net", os.Args[1]); err != nil { 313 logrus.Fatal(err) 314 } 315 } 316 317 func createNetworkNamespace(path string, osCreate bool) error { 318 if err := createNamespaceFile(path); err != nil { 319 return err 320 } 321 322 cmd := &exec.Cmd{ 323 Path: reexec.Self(), 324 Args: append([]string{"netns-create"}, path), 325 Stdout: os.Stdout, 326 Stderr: os.Stderr, 327 } 328 if osCreate { 329 cmd.SysProcAttr = &syscall.SysProcAttr{} 330 cmd.SysProcAttr.Cloneflags = syscall.CLONE_NEWNET 331 } 332 if err := cmd.Run(); err != nil { 333 return fmt.Errorf("namespace creation reexec command failed: %v", err) 334 } 335 336 return nil 337 } 338 339 func unmountNamespaceFile(path string) { 340 if _, err := os.Stat(path); err == nil { 341 if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) { 342 logrus.WithError(err).Error("Error unmounting namespace file") 343 } 344 } 345 } 346 347 func createNamespaceFile(path string) (err error) { 348 var f *os.File 349 350 once.Do(createBasePath) 351 // Remove it from garbage collection list if present 352 removeFromGarbagePaths(path) 353 354 // If the path is there unmount it first 355 
unmountNamespaceFile(path) 356 357 // wait for garbage collection to complete if it is in progress 358 // before trying to create the file. 359 gpmWg.Wait() 360 361 if f, err = os.Create(path); err == nil { 362 f.Close() 363 } 364 365 return err 366 } 367 368 func (n *networkNamespace) loopbackUp() error { 369 iface, err := n.nlHandle.LinkByName("lo") 370 if err != nil { 371 return err 372 } 373 return n.nlHandle.LinkSetUp(iface) 374 } 375 376 func (n *networkNamespace) GetLoopbackIfaceName() string { 377 return "lo" 378 } 379 380 func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error { 381 iface, err := n.nlHandle.LinkByName(ifName) 382 if err != nil { 383 return err 384 } 385 return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip}) 386 } 387 388 func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error { 389 iface, err := n.nlHandle.LinkByName(ifName) 390 if err != nil { 391 return err 392 } 393 return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip}) 394 } 395 396 func (n *networkNamespace) DisableARPForVIP(srcName string) (Err error) { 397 dstName := "" 398 for _, i := range n.Interfaces() { 399 if i.SrcName() == srcName { 400 dstName = i.DstName() 401 break 402 } 403 } 404 if dstName == "" { 405 return fmt.Errorf("failed to find interface %s in sandbox", srcName) 406 } 407 408 err := n.InvokeFunc(func() { 409 path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore") 410 if err := os.WriteFile(path, []byte{'1', '\n'}, 0644); err != nil { 411 Err = fmt.Errorf("Failed to set %s to 1: %v", path, err) 412 return 413 } 414 path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce") 415 if err := os.WriteFile(path, []byte{'2', '\n'}, 0644); err != nil { 416 Err = fmt.Errorf("Failed to set %s to 2: %v", path, err) 417 return 418 } 419 }) 420 if err != nil { 421 return err 422 } 423 return 424 } 425 426 func (n *networkNamespace) InvokeFunc(f func()) error { 427 return nsInvoke(n.nsPath(), func(nsFD 
int) error { return nil }, func(callerFD int) error { 428 f() 429 return nil 430 }) 431 } 432 433 // InitOSContext initializes OS context while configuring network resources 434 func InitOSContext() func() { 435 runtime.LockOSThread() 436 if err := ns.SetNamespace(); err != nil { 437 logrus.Error(err) 438 } 439 return runtime.UnlockOSThread 440 } 441 442 func nsInvoke(path string, prefunc func(nsFD int) error, postfunc func(callerFD int) error) error { 443 defer InitOSContext()() 444 445 newNs, err := netns.GetFromPath(path) 446 if err != nil { 447 return fmt.Errorf("failed get network namespace %q: %v", path, err) 448 } 449 defer newNs.Close() 450 451 // Invoked before the namespace switch happens but after the namespace file 452 // handle is obtained. 453 if err := prefunc(int(newNs)); err != nil { 454 return fmt.Errorf("failed in prefunc: %v", err) 455 } 456 457 if err = netns.Set(newNs); err != nil { 458 return err 459 } 460 defer ns.SetNamespace() 461 462 // Invoked after the namespace switch. 463 return postfunc(ns.ParseHandlerInt()) 464 } 465 466 func (n *networkNamespace) nsPath() string { 467 n.Lock() 468 defer n.Unlock() 469 470 return n.path 471 } 472 473 func (n *networkNamespace) Info() Info { 474 return n 475 } 476 477 func (n *networkNamespace) Key() string { 478 return n.path 479 } 480 481 func (n *networkNamespace) Destroy() error { 482 if n.nlHandle != nil { 483 n.nlHandle.Delete() 484 } 485 // Assuming no running process is executing in this network namespace, 486 // unmounting is sufficient to destroy it. 
487 if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil { 488 return err 489 } 490 491 // Stash it into the garbage collection list 492 addToGarbagePaths(n.path) 493 return nil 494 } 495 496 // Restore restore the network namespace 497 func (n *networkNamespace) Restore(ifsopt map[string][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error { 498 // restore interfaces 499 for name, opts := range ifsopt { 500 if !strings.Contains(name, "+") { 501 return fmt.Errorf("wrong iface name in restore osl sandbox interface: %s", name) 502 } 503 seps := strings.Split(name, "+") 504 srcName := seps[0] 505 dstPrefix := seps[1] 506 i := &nwIface{srcName: srcName, dstName: dstPrefix, ns: n} 507 i.processInterfaceOptions(opts...) 508 if i.master != "" { 509 i.dstMaster = n.findDst(i.master, true) 510 if i.dstMaster == "" { 511 return fmt.Errorf("could not find an appropriate master %q for %q", 512 i.master, i.srcName) 513 } 514 } 515 if n.isDefault { 516 i.dstName = i.srcName 517 } else { 518 links, err := n.nlHandle.LinkList() 519 if err != nil { 520 return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path) 521 } 522 // due to the docker network connect/disconnect, so the dstName should 523 // restore from the namespace 524 for _, link := range links { 525 addrs, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4) 526 if err != nil { 527 return err 528 } 529 ifaceName := link.Attrs().Name 530 if strings.HasPrefix(ifaceName, "vxlan") { 531 if i.dstName == "vxlan" { 532 i.dstName = ifaceName 533 break 534 } 535 } 536 // find the interface name by ip 537 if i.address != nil { 538 for _, addr := range addrs { 539 if addr.IPNet.String() == i.address.String() { 540 i.dstName = ifaceName 541 break 542 } 543 continue 544 } 545 if i.dstName == ifaceName { 546 break 547 } 548 } 549 // This is to find the interface name of the pair in overlay sandbox 550 if strings.HasPrefix(ifaceName, "veth") { 551 if i.master != 
"" && i.dstName == "veth" { 552 i.dstName = ifaceName 553 } 554 } 555 } 556 557 var index int 558 indexStr := strings.TrimPrefix(i.dstName, dstPrefix) 559 if indexStr != "" { 560 index, err = strconv.Atoi(indexStr) 561 if err != nil { 562 return err 563 } 564 } 565 index++ 566 n.Lock() 567 if index > n.nextIfIndex[dstPrefix] { 568 n.nextIfIndex[dstPrefix] = index 569 } 570 n.iFaces = append(n.iFaces, i) 571 n.Unlock() 572 } 573 } 574 575 // restore routes 576 for _, r := range routes { 577 n.Lock() 578 n.staticRoutes = append(n.staticRoutes, r) 579 n.Unlock() 580 } 581 582 // restore gateway 583 if len(gw) > 0 { 584 n.Lock() 585 n.gw = gw 586 n.Unlock() 587 } 588 589 if len(gw6) > 0 { 590 n.Lock() 591 n.gwv6 = gw6 592 n.Unlock() 593 } 594 595 return nil 596 } 597 598 // Checks whether IPv6 needs to be enabled/disabled on the loopback interface 599 func (n *networkNamespace) checkLoV6() { 600 var ( 601 enable = false 602 action = "disable" 603 ) 604 605 n.Lock() 606 for _, iface := range n.iFaces { 607 if iface.AddressIPv6() != nil { 608 enable = true 609 action = "enable" 610 break 611 } 612 } 613 n.Unlock() 614 615 if n.loV6Enabled == enable { 616 return 617 } 618 619 if err := setIPv6(n.path, "lo", enable); err != nil { 620 logrus.Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err) 621 } 622 623 n.loV6Enabled = enable 624 } 625 626 func reexecSetIPv6() { 627 runtime.LockOSThread() 628 defer runtime.UnlockOSThread() 629 630 if len(os.Args) < 3 { 631 logrus.Errorf("invalid number of arguments for %s", os.Args[0]) 632 os.Exit(1) 633 } 634 635 ns, err := netns.GetFromPath(os.Args[1]) 636 if err != nil { 637 logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err) 638 os.Exit(2) 639 } 640 defer ns.Close() 641 642 if err = netns.Set(ns); err != nil { 643 logrus.Errorf("setting into container netns %q failed: %v", os.Args[1], err) 644 os.Exit(3) 645 } 646 647 var ( 648 action = "disable" 649 value = byte('1') 
650 path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", os.Args[2]) 651 ) 652 653 if os.Args[3] == "true" { 654 action = "enable" 655 value = byte('0') 656 } 657 658 if _, err := os.Stat(path); err != nil { 659 if os.IsNotExist(err) { 660 logrus.Warnf("file does not exist: %s : %v Has IPv6 been disabled in this node's kernel?", path, err) 661 os.Exit(0) 662 } 663 logrus.Errorf("failed to stat %s : %v", path, err) 664 os.Exit(5) 665 } 666 667 if err = os.WriteFile(path, []byte{value, '\n'}, 0644); err != nil { 668 logrus.Errorf("failed to %s IPv6 forwarding for container's interface %s: %v", action, os.Args[2], err) 669 os.Exit(4) 670 } 671 672 os.Exit(0) 673 } 674 675 func setIPv6(path, iface string, enable bool) error { 676 cmd := &exec.Cmd{ 677 Path: reexec.Self(), 678 Args: append([]string{"set-ipv6"}, path, iface, strconv.FormatBool(enable)), 679 Stdout: os.Stdout, 680 Stderr: os.Stderr, 681 } 682 if err := cmd.Run(); err != nil { 683 return fmt.Errorf("reexec to set IPv6 failed: %v", err) 684 } 685 return nil 686 } 687 688 // ApplyOSTweaks applies linux configs on the sandbox 689 func (n *networkNamespace) ApplyOSTweaks(types []SandboxType) { 690 for _, t := range types { 691 switch t { 692 case SandboxTypeLoadBalancer: 693 kernel.ApplyOSTweaks(loadBalancerConfig) 694 } 695 } 696 }