github.com/adityamillind98/moby@v23.0.0-rc.4+incompatible/libnetwork/osl/namespace_linux.go (about) 1 package osl 2 3 import ( 4 "errors" 5 "fmt" 6 "net" 7 "os" 8 "os/exec" 9 "path/filepath" 10 "runtime" 11 "strconv" 12 "strings" 13 "sync" 14 "syscall" 15 "time" 16 17 "github.com/docker/docker/libnetwork/ns" 18 "github.com/docker/docker/libnetwork/osl/kernel" 19 "github.com/docker/docker/libnetwork/types" 20 "github.com/docker/docker/pkg/reexec" 21 "github.com/sirupsen/logrus" 22 "github.com/vishvananda/netlink" 23 "github.com/vishvananda/netns" 24 "golang.org/x/sys/unix" 25 ) 26 27 const defaultPrefix = "/var/run/docker" 28 29 func init() { 30 reexec.Register("set-ipv6", reexecSetIPv6) 31 } 32 33 var ( 34 once sync.Once 35 garbagePathMap = make(map[string]bool) 36 gpmLock sync.Mutex 37 gpmWg sync.WaitGroup 38 gpmCleanupPeriod = 60 * time.Second 39 gpmChan = make(chan chan struct{}) 40 prefix = defaultPrefix 41 ) 42 43 // The networkNamespace type is the linux implementation of the Sandbox 44 // interface. It represents a linux network namespace, and moves an interface 45 // into it when called on method AddInterface or sets the gateway etc. 46 type networkNamespace struct { 47 path string 48 iFaces []*nwIface 49 gw net.IP 50 gwv6 net.IP 51 staticRoutes []*types.StaticRoute 52 neighbors []*neigh 53 nextIfIndex map[string]int 54 isDefault bool 55 nlHandle *netlink.Handle 56 loV6Enabled bool 57 sync.Mutex 58 } 59 60 // SetBasePath sets the base url prefix for the ns path 61 func SetBasePath(path string) { 62 prefix = path 63 } 64 65 func init() { 66 reexec.Register("netns-create", reexecCreateNamespace) 67 } 68 69 func basePath() string { 70 return filepath.Join(prefix, "netns") 71 } 72 73 func createBasePath() { 74 err := os.MkdirAll(basePath(), 0755) 75 if err != nil { 76 panic("Could not create net namespace path directory") 77 } 78 79 // Start the garbage collection go routine 80 go removeUnusedPaths() 81 } 82 83 func removeUnusedPaths() { 84 gpmLock.Lock() 85 period := gpmCleanupPeriod 86 gpmLock.Unlock() 87 88 ticker := time.NewTicker(period) 89 for { 90 var ( 91 gc chan struct{} 92 gcOk bool 93 ) 94 95 select { 96 case <-ticker.C: 97 case gc, gcOk = <-gpmChan: 98 } 99 100 gpmLock.Lock() 101 pathList := make([]string, 0, len(garbagePathMap)) 102 for path := range garbagePathMap { 103 pathList = append(pathList, path) 104 } 105 garbagePathMap = make(map[string]bool) 106 gpmWg.Add(1) 107 gpmLock.Unlock() 108 109 for _, path := range pathList { 110 os.Remove(path) 111 } 112 113 gpmWg.Done() 114 if gcOk { 115 close(gc) 116 } 117 } 118 } 119 120 func addToGarbagePaths(path string) { 121 gpmLock.Lock() 122 garbagePathMap[path] = true 123 gpmLock.Unlock() 124 } 125 126 func removeFromGarbagePaths(path string) { 127 gpmLock.Lock() 128 delete(garbagePathMap, path) 129 gpmLock.Unlock() 130 } 131 132 // GC triggers garbage collection of namespace path right away 133 // and waits for it. 134 func GC() { 135 gpmLock.Lock() 136 if len(garbagePathMap) == 0 { 137 // No need for GC if map is empty 138 gpmLock.Unlock() 139 return 140 } 141 gpmLock.Unlock() 142 143 // if content exists in the garbage paths 144 // we can trigger GC to run, providing a 145 // channel to be notified on completion 146 waitGC := make(chan struct{}) 147 gpmChan <- waitGC 148 // wait for GC completion 149 <-waitGC 150 } 151 152 // GenerateKey generates a sandbox key based on the passed 153 // container id. 154 func GenerateKey(containerID string) string { 155 maxLen := 12 156 // Read sandbox key from host for overlay 157 if strings.HasPrefix(containerID, "-") { 158 var ( 159 index int 160 indexStr string 161 tmpkey string 162 ) 163 dir, err := os.ReadDir(basePath()) 164 if err != nil { 165 return "" 166 } 167 168 for _, v := range dir { 169 id := v.Name() 170 if strings.HasSuffix(id, containerID[:maxLen-1]) { 171 indexStr = strings.TrimSuffix(id, containerID[:maxLen-1]) 172 tmpindex, err := strconv.Atoi(indexStr) 173 if err != nil { 174 return "" 175 } 176 if tmpindex > index { 177 index = tmpindex 178 tmpkey = id 179 } 180 } 181 } 182 containerID = tmpkey 183 if containerID == "" { 184 return "" 185 } 186 } 187 188 if len(containerID) < maxLen { 189 maxLen = len(containerID) 190 } 191 192 return basePath() + "/" + containerID[:maxLen] 193 } 194 195 // NewSandbox provides a new sandbox instance created in an os specific way 196 // provided a key which uniquely identifies the sandbox 197 func NewSandbox(key string, osCreate, isRestore bool) (Sandbox, error) { 198 if !isRestore { 199 err := createNetworkNamespace(key, osCreate) 200 if err != nil { 201 return nil, err 202 } 203 } else { 204 once.Do(createBasePath) 205 } 206 207 n := &networkNamespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)} 208 209 sboxNs, err := netns.GetFromPath(n.path) 210 if err != nil { 211 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 212 } 213 defer sboxNs.Close() 214 215 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 216 if err != nil { 217 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 218 } 219 220 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 221 if err != nil { 222 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 223 } 224 // In live-restore mode, IPV6 entries are getting cleaned up due to below code 225 // We should retain IPV6 configurations in live-restore mode when Docker Daemon 226 // comes back. It should work as it is on other cases 227 // As starting point, disable IPv6 on all interfaces 228 if !isRestore && !n.isDefault { 229 err = setIPv6(n.path, "all", false) 230 if err != nil { 231 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 232 } 233 } 234 235 if err = n.loopbackUp(); err != nil { 236 n.nlHandle.Close() 237 return nil, err 238 } 239 240 return n, nil 241 } 242 243 func (n *networkNamespace) InterfaceOptions() IfaceOptionSetter { 244 return n 245 } 246 247 func (n *networkNamespace) NeighborOptions() NeighborOptionSetter { 248 return n 249 } 250 251 func mountNetworkNamespace(basePath string, lnPath string) error { 252 return syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "") 253 } 254 255 // GetSandboxForExternalKey returns sandbox object for the supplied path 256 func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) { 257 if err := createNamespaceFile(key); err != nil { 258 return nil, err 259 } 260 261 if err := mountNetworkNamespace(basePath, key); err != nil { 262 return nil, err 263 } 264 n := &networkNamespace{path: key, nextIfIndex: make(map[string]int)} 265 266 sboxNs, err := netns.GetFromPath(n.path) 267 if err != nil { 268 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 269 } 270 defer sboxNs.Close() 271 272 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 273 if err != nil { 274 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 275 } 276 277 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 278 if err != nil { 279 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 280 } 281 282 // As starting point, disable IPv6 on all interfaces 283 err = setIPv6(n.path, "all", false) 284 if err != nil { 285 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 286 } 287 288 if err = n.loopbackUp(); err != nil { 289 n.nlHandle.Close() 290 return nil, err 291 } 292 293 return n, nil 294 } 295 296 func reexecCreateNamespace() { 297 if len(os.Args) < 2 { 298 logrus.Fatal("no namespace path provided") 299 } 300 if err := mountNetworkNamespace("/proc/self/ns/net", os.Args[1]); err != nil { 301 logrus.Fatal(err) 302 } 303 } 304 305 func createNetworkNamespace(path string, osCreate bool) error { 306 if err := createNamespaceFile(path); err != nil { 307 return err 308 } 309 310 cmd := &exec.Cmd{ 311 Path: reexec.Self(), 312 Args: append([]string{"netns-create"}, path), 313 Stdout: os.Stdout, 314 Stderr: os.Stderr, 315 } 316 if osCreate { 317 cmd.SysProcAttr = &syscall.SysProcAttr{} 318 cmd.SysProcAttr.Cloneflags = syscall.CLONE_NEWNET 319 } 320 if err := cmd.Run(); err != nil { 321 return fmt.Errorf("namespace creation reexec command failed: %v", err) 322 } 323 324 return nil 325 } 326 327 func unmountNamespaceFile(path string) { 328 if _, err := os.Stat(path); err == nil { 329 if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) { 330 logrus.WithError(err).Error("Error unmounting namespace file") 331 } 332 } 333 } 334 335 func createNamespaceFile(path string) (err error) { 336 var f *os.File 337 338 once.Do(createBasePath) 339 // Remove it from garbage collection list if present 340 removeFromGarbagePaths(path) 341 342 // If the path is there unmount it first 343 unmountNamespaceFile(path) 344 345 // wait for garbage collection to complete if it is in progress 346 // before trying to create the file. 347 gpmWg.Wait() 348 349 if f, err = os.Create(path); err == nil { 350 f.Close() 351 } 352 353 return err 354 } 355 356 func (n *networkNamespace) loopbackUp() error { 357 iface, err := n.nlHandle.LinkByName("lo") 358 if err != nil { 359 return err 360 } 361 return n.nlHandle.LinkSetUp(iface) 362 } 363 364 func (n *networkNamespace) GetLoopbackIfaceName() string { 365 return "lo" 366 } 367 368 func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error { 369 iface, err := n.nlHandle.LinkByName(ifName) 370 if err != nil { 371 return err 372 } 373 return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip}) 374 } 375 376 func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error { 377 iface, err := n.nlHandle.LinkByName(ifName) 378 if err != nil { 379 return err 380 } 381 return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip}) 382 } 383 384 func (n *networkNamespace) DisableARPForVIP(srcName string) (Err error) { 385 dstName := "" 386 for _, i := range n.Interfaces() { 387 if i.SrcName() == srcName { 388 dstName = i.DstName() 389 break 390 } 391 } 392 if dstName == "" { 393 return fmt.Errorf("failed to find interface %s in sandbox", srcName) 394 } 395 396 err := n.InvokeFunc(func() { 397 path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore") 398 if err := os.WriteFile(path, []byte{'1', '\n'}, 0644); err != nil { 399 Err = fmt.Errorf("Failed to set %s to 1: %v", path, err) 400 return 401 } 402 path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce") 403 if err := os.WriteFile(path, []byte{'2', '\n'}, 0644); err != nil { 404 Err = fmt.Errorf("Failed to set %s to 2: %v", path, err) 405 return 406 } 407 }) 408 if err != nil { 409 return err 410 } 411 return 412 } 413 414 func (n *networkNamespace) InvokeFunc(f func()) error { 415 return nsInvoke(n.nsPath(), func(nsFD int) error { return nil }, func(callerFD int) error { 416 f() 417 return nil 418 }) 419 } 420 421 // InitOSContext initializes OS context while configuring network resources 422 func InitOSContext() func() { 423 runtime.LockOSThread() 424 if err := ns.SetNamespace(); err != nil { 425 logrus.Error(err) 426 } 427 return runtime.UnlockOSThread 428 } 429 430 func nsInvoke(path string, prefunc func(nsFD int) error, postfunc func(callerFD int) error) error { 431 defer InitOSContext()() 432 433 newNs, err := netns.GetFromPath(path) 434 if err != nil { 435 return fmt.Errorf("failed get network namespace %q: %v", path, err) 436 } 437 defer newNs.Close() 438 439 // Invoked before the namespace switch happens but after the namespace file 440 // handle is obtained. 441 if err := prefunc(int(newNs)); err != nil { 442 return fmt.Errorf("failed in prefunc: %v", err) 443 } 444 445 if err = netns.Set(newNs); err != nil { 446 return err 447 } 448 defer ns.SetNamespace() 449 450 // Invoked after the namespace switch. 451 return postfunc(ns.ParseHandlerInt()) 452 } 453 454 func (n *networkNamespace) nsPath() string { 455 n.Lock() 456 defer n.Unlock() 457 458 return n.path 459 } 460 461 func (n *networkNamespace) Info() Info { 462 return n 463 } 464 465 func (n *networkNamespace) Key() string { 466 return n.path 467 } 468 469 func (n *networkNamespace) Destroy() error { 470 if n.nlHandle != nil { 471 n.nlHandle.Close() 472 } 473 // Assuming no running process is executing in this network namespace, 474 // unmounting is sufficient to destroy it. 475 if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil { 476 return err 477 } 478 479 // Stash it into the garbage collection list 480 addToGarbagePaths(n.path) 481 return nil 482 } 483 484 // Restore restore the network namespace 485 func (n *networkNamespace) Restore(ifsopt map[string][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error { 486 // restore interfaces 487 for name, opts := range ifsopt { 488 if !strings.Contains(name, "+") { 489 return fmt.Errorf("wrong iface name in restore osl sandbox interface: %s", name) 490 } 491 seps := strings.Split(name, "+") 492 srcName := seps[0] 493 dstPrefix := seps[1] 494 i := &nwIface{srcName: srcName, dstName: dstPrefix, ns: n} 495 i.processInterfaceOptions(opts...) 496 if i.master != "" { 497 i.dstMaster = n.findDst(i.master, true) 498 if i.dstMaster == "" { 499 return fmt.Errorf("could not find an appropriate master %q for %q", 500 i.master, i.srcName) 501 } 502 } 503 if n.isDefault { 504 i.dstName = i.srcName 505 } else { 506 links, err := n.nlHandle.LinkList() 507 if err != nil { 508 return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path) 509 } 510 // due to the docker network connect/disconnect, so the dstName should 511 // restore from the namespace 512 for _, link := range links { 513 addrs, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4) 514 if err != nil { 515 return err 516 } 517 ifaceName := link.Attrs().Name 518 if strings.HasPrefix(ifaceName, "vxlan") { 519 if i.dstName == "vxlan" { 520 i.dstName = ifaceName 521 break 522 } 523 } 524 // find the interface name by ip 525 if i.address != nil { 526 for _, addr := range addrs { 527 if addr.IPNet.String() == i.address.String() { 528 i.dstName = ifaceName 529 break 530 } 531 continue 532 } 533 if i.dstName == ifaceName { 534 break 535 } 536 } 537 // This is to find the interface name of the pair in overlay sandbox 538 if strings.HasPrefix(ifaceName, "veth") { 539 if i.master != "" && i.dstName == "veth" { 540 i.dstName = ifaceName 541 } 542 } 543 } 544 545 var index int 546 indexStr := strings.TrimPrefix(i.dstName, dstPrefix) 547 if indexStr != "" { 548 index, err = strconv.Atoi(indexStr) 549 if err != nil { 550 return err 551 } 552 } 553 index++ 554 n.Lock() 555 if index > n.nextIfIndex[dstPrefix] { 556 n.nextIfIndex[dstPrefix] = index 557 } 558 n.iFaces = append(n.iFaces, i) 559 n.Unlock() 560 } 561 } 562 563 // restore routes 564 for _, r := range routes { 565 n.Lock() 566 n.staticRoutes = append(n.staticRoutes, r) 567 n.Unlock() 568 } 569 570 // restore gateway 571 if len(gw) > 0 { 572 n.Lock() 573 n.gw = gw 574 n.Unlock() 575 } 576 577 if len(gw6) > 0 { 578 n.Lock() 579 n.gwv6 = gw6 580 n.Unlock() 581 } 582 583 return nil 584 } 585 586 // Checks whether IPv6 needs to be enabled/disabled on the loopback interface 587 func (n *networkNamespace) checkLoV6() { 588 var ( 589 enable = false 590 action = "disable" 591 ) 592 593 n.Lock() 594 for _, iface := range n.iFaces { 595 if iface.AddressIPv6() != nil { 596 enable = true 597 action = "enable" 598 break 599 } 600 } 601 n.Unlock() 602 603 if n.loV6Enabled == enable { 604 return 605 } 606 607 if err := setIPv6(n.path, "lo", enable); err != nil { 608 logrus.Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err) 609 } 610 611 n.loV6Enabled = enable 612 } 613 614 func reexecSetIPv6() { 615 runtime.LockOSThread() 616 defer runtime.UnlockOSThread() 617 618 if len(os.Args) < 3 { 619 logrus.Errorf("invalid number of arguments for %s", os.Args[0]) 620 os.Exit(1) 621 } 622 623 ns, err := netns.GetFromPath(os.Args[1]) 624 if err != nil { 625 logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err) 626 os.Exit(2) 627 } 628 defer ns.Close() 629 630 if err = netns.Set(ns); err != nil { 631 logrus.Errorf("setting into container netns %q failed: %v", os.Args[1], err) 632 os.Exit(3) 633 } 634 635 var ( 636 action = "disable" 637 value = byte('1') 638 path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", os.Args[2]) 639 ) 640 641 if os.Args[3] == "true" { 642 action = "enable" 643 value = byte('0') 644 } 645 646 if _, err := os.Stat(path); err != nil { 647 if os.IsNotExist(err) { 648 logrus.Warnf("file does not exist: %s : %v Has IPv6 been disabled in this node's kernel?", path, err) 649 os.Exit(0) 650 } 651 logrus.Errorf("failed to stat %s : %v", path, err) 652 os.Exit(5) 653 } 654 655 if err = os.WriteFile(path, []byte{value, '\n'}, 0644); err != nil { 656 logrus.Errorf("failed to %s IPv6 forwarding for container's interface %s: %v", action, os.Args[2], err) 657 os.Exit(4) 658 } 659 660 os.Exit(0) 661 } 662 663 func setIPv6(path, iface string, enable bool) error { 664 cmd := &exec.Cmd{ 665 Path: reexec.Self(), 666 Args: append([]string{"set-ipv6"}, path, iface, strconv.FormatBool(enable)), 667 Stdout: os.Stdout, 668 Stderr: os.Stderr, 669 } 670 if err := cmd.Run(); err != nil { 671 return fmt.Errorf("reexec to set IPv6 failed: %v", err) 672 } 673 return nil 674 } 675 676 // ApplyOSTweaks applies linux configs on the sandbox 677 func (n *networkNamespace) ApplyOSTweaks(types []SandboxType) { 678 for _, t := range types { 679 switch t { 680 case SandboxTypeLoadBalancer, SandboxTypeIngress: 681 kernel.ApplyOSTweaks(map[string]*kernel.OSValue{ 682 // disables any special handling on port reuse of existing IPVS connection table entries 683 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32 684 "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil}, 685 // expires connection from the IPVS connection table when the backend is not available 686 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133 687 "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil}, 688 // expires persistent connections to destination servers with weights set to 0 689 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151 690 "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil}, 691 }) 692 } 693 } 694 }