github.com/jfrazelle/docker@v1.1.2-0.20210712172922-bf78e25fe508/libnetwork/osl/namespace_linux.go (about) 1 package osl 2 3 import ( 4 "errors" 5 "fmt" 6 "io/ioutil" 7 "net" 8 "os" 9 "os/exec" 10 "path/filepath" 11 "runtime" 12 "strconv" 13 "strings" 14 "sync" 15 "syscall" 16 "time" 17 18 "github.com/docker/docker/libnetwork/ns" 19 "github.com/docker/docker/libnetwork/osl/kernel" 20 "github.com/docker/docker/libnetwork/types" 21 "github.com/docker/docker/pkg/reexec" 22 "github.com/sirupsen/logrus" 23 "github.com/vishvananda/netlink" 24 "github.com/vishvananda/netns" 25 "golang.org/x/sys/unix" 26 ) 27 28 const defaultPrefix = "/var/run/docker" 29 30 func init() { 31 reexec.Register("set-ipv6", reexecSetIPv6) 32 } 33 34 var ( 35 once sync.Once 36 garbagePathMap = make(map[string]bool) 37 gpmLock sync.Mutex 38 gpmWg sync.WaitGroup 39 gpmCleanupPeriod = 60 * time.Second 40 gpmChan = make(chan chan struct{}) 41 prefix = defaultPrefix 42 loadBalancerConfig = map[string]*kernel.OSValue{ 43 // disables any special handling on port reuse of existing IPVS connection table entries 44 // more info: https://github.com/torvalds/linux/blob/master/Documentation/networking/ipvs-sysctl.txt#L25:1 45 "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil}, 46 // expires connection from the IPVS connection table when the backend is not available 47 // more info: https://github.com/torvalds/linux/blob/master/Documentation/networking/ipvs-sysctl.txt#L126:1 48 "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil}, 49 // expires persistent connections to destination servers with weights set to 0 50 // more info: https://github.com/torvalds/linux/blob/master/Documentation/networking/ipvs-sysctl.txt#L144:1 51 "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil}, 52 } 53 ) 54 55 // The networkNamespace type is the linux implementation of the Sandbox 56 // interface. It represents a linux network namespace, and moves an interface 57 // into it when called on method AddInterface or sets the gateway etc. 58 type networkNamespace struct { 59 path string 60 iFaces []*nwIface 61 gw net.IP 62 gwv6 net.IP 63 staticRoutes []*types.StaticRoute 64 neighbors []*neigh 65 nextIfIndex map[string]int 66 isDefault bool 67 nlHandle *netlink.Handle 68 loV6Enabled bool 69 sync.Mutex 70 } 71 72 // SetBasePath sets the base url prefix for the ns path 73 func SetBasePath(path string) { 74 prefix = path 75 } 76 77 func init() { 78 reexec.Register("netns-create", reexecCreateNamespace) 79 } 80 81 func basePath() string { 82 return filepath.Join(prefix, "netns") 83 } 84 85 func createBasePath() { 86 err := os.MkdirAll(basePath(), 0755) 87 if err != nil { 88 panic("Could not create net namespace path directory") 89 } 90 91 // Start the garbage collection go routine 92 go removeUnusedPaths() 93 } 94 95 func removeUnusedPaths() { 96 gpmLock.Lock() 97 period := gpmCleanupPeriod 98 gpmLock.Unlock() 99 100 ticker := time.NewTicker(period) 101 for { 102 var ( 103 gc chan struct{} 104 gcOk bool 105 ) 106 107 select { 108 case <-ticker.C: 109 case gc, gcOk = <-gpmChan: 110 } 111 112 gpmLock.Lock() 113 pathList := make([]string, 0, len(garbagePathMap)) 114 for path := range garbagePathMap { 115 pathList = append(pathList, path) 116 } 117 garbagePathMap = make(map[string]bool) 118 gpmWg.Add(1) 119 gpmLock.Unlock() 120 121 for _, path := range pathList { 122 os.Remove(path) 123 } 124 125 gpmWg.Done() 126 if gcOk { 127 close(gc) 128 } 129 } 130 } 131 132 func addToGarbagePaths(path string) { 133 gpmLock.Lock() 134 garbagePathMap[path] = true 135 gpmLock.Unlock() 136 } 137 138 func removeFromGarbagePaths(path string) { 139 gpmLock.Lock() 140 delete(garbagePathMap, path) 141 gpmLock.Unlock() 142 } 143 144 // GC triggers garbage collection of namespace path right away 145 // and waits for it. 146 func GC() { 147 gpmLock.Lock() 148 if len(garbagePathMap) == 0 { 149 // No need for GC if map is empty 150 gpmLock.Unlock() 151 return 152 } 153 gpmLock.Unlock() 154 155 // if content exists in the garbage paths 156 // we can trigger GC to run, providing a 157 // channel to be notified on completion 158 waitGC := make(chan struct{}) 159 gpmChan <- waitGC 160 // wait for GC completion 161 <-waitGC 162 } 163 164 // GenerateKey generates a sandbox key based on the passed 165 // container id. 166 func GenerateKey(containerID string) string { 167 maxLen := 12 168 // Read sandbox key from host for overlay 169 if strings.HasPrefix(containerID, "-") { 170 var ( 171 index int 172 indexStr string 173 tmpkey string 174 ) 175 dir, err := ioutil.ReadDir(basePath()) 176 if err != nil { 177 return "" 178 } 179 180 for _, v := range dir { 181 id := v.Name() 182 if strings.HasSuffix(id, containerID[:maxLen-1]) { 183 indexStr = strings.TrimSuffix(id, containerID[:maxLen-1]) 184 tmpindex, err := strconv.Atoi(indexStr) 185 if err != nil { 186 return "" 187 } 188 if tmpindex > index { 189 index = tmpindex 190 tmpkey = id 191 } 192 193 } 194 } 195 containerID = tmpkey 196 if containerID == "" { 197 return "" 198 } 199 } 200 201 if len(containerID) < maxLen { 202 maxLen = len(containerID) 203 } 204 205 return basePath() + "/" + containerID[:maxLen] 206 } 207 208 // NewSandbox provides a new sandbox instance created in an os specific way 209 // provided a key which uniquely identifies the sandbox 210 func NewSandbox(key string, osCreate, isRestore bool) (Sandbox, error) { 211 if !isRestore { 212 err := createNetworkNamespace(key, osCreate) 213 if err != nil { 214 return nil, err 215 } 216 } else { 217 once.Do(createBasePath) 218 } 219 220 n := &networkNamespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)} 221 222 sboxNs, err := netns.GetFromPath(n.path) 223 if err != nil { 224 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 225 } 226 defer sboxNs.Close() 227 228 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 229 if err != nil { 230 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 231 } 232 233 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 234 if err != nil { 235 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 236 } 237 // In live-restore mode, IPV6 entries are getting cleaned up due to below code 238 // We should retain IPV6 configurations in live-restore mode when Docker Daemon 239 // comes back. It should work as it is on other cases 240 // As starting point, disable IPv6 on all interfaces 241 if !isRestore && !n.isDefault { 242 err = setIPv6(n.path, "all", false) 243 if err != nil { 244 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 245 } 246 } 247 248 if err = n.loopbackUp(); err != nil { 249 n.nlHandle.Delete() 250 return nil, err 251 } 252 253 return n, nil 254 } 255 256 func (n *networkNamespace) InterfaceOptions() IfaceOptionSetter { 257 return n 258 } 259 260 func (n *networkNamespace) NeighborOptions() NeighborOptionSetter { 261 return n 262 } 263 264 func mountNetworkNamespace(basePath string, lnPath string) error { 265 return syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "") 266 } 267 268 // GetSandboxForExternalKey returns sandbox object for the supplied path 269 func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) { 270 if err := createNamespaceFile(key); err != nil { 271 return nil, err 272 } 273 274 if err := mountNetworkNamespace(basePath, key); err != nil { 275 return nil, err 276 } 277 n := &networkNamespace{path: key, nextIfIndex: make(map[string]int)} 278 279 sboxNs, err := netns.GetFromPath(n.path) 280 if err != nil { 281 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 282 } 283 defer sboxNs.Close() 284 285 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 286 if err != nil { 287 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 288 } 289 290 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 291 if err != nil { 292 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 293 } 294 295 // As starting point, disable IPv6 on all interfaces 296 err = setIPv6(n.path, "all", false) 297 if err != nil { 298 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 299 } 300 301 if err = n.loopbackUp(); err != nil { 302 n.nlHandle.Delete() 303 return nil, err 304 } 305 306 return n, nil 307 } 308 309 func reexecCreateNamespace() { 310 if len(os.Args) < 2 { 311 logrus.Fatal("no namespace path provided") 312 } 313 if err := mountNetworkNamespace("/proc/self/ns/net", os.Args[1]); err != nil { 314 logrus.Fatal(err) 315 } 316 } 317 318 func createNetworkNamespace(path string, osCreate bool) error { 319 if err := createNamespaceFile(path); err != nil { 320 return err 321 } 322 323 cmd := &exec.Cmd{ 324 Path: reexec.Self(), 325 Args: append([]string{"netns-create"}, path), 326 Stdout: os.Stdout, 327 Stderr: os.Stderr, 328 } 329 if osCreate { 330 cmd.SysProcAttr = &syscall.SysProcAttr{} 331 cmd.SysProcAttr.Cloneflags = syscall.CLONE_NEWNET 332 } 333 if err := cmd.Run(); err != nil { 334 return fmt.Errorf("namespace creation reexec command failed: %v", err) 335 } 336 337 return nil 338 } 339 340 func unmountNamespaceFile(path string) { 341 if _, err := os.Stat(path); err == nil { 342 if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) { 343 logrus.WithError(err).Error("Error unmounting namespace file") 344 } 345 } 346 } 347 348 func createNamespaceFile(path string) (err error) { 349 var f *os.File 350 351 once.Do(createBasePath) 352 // Remove it from garbage collection list if present 353 removeFromGarbagePaths(path) 354 355 // If the path is there unmount it first 356 unmountNamespaceFile(path) 357 358 // wait for garbage collection to complete if it is in progress 359 // before trying to create the file. 360 gpmWg.Wait() 361 362 if f, err = os.Create(path); err == nil { 363 f.Close() 364 } 365 366 return err 367 } 368 369 func (n *networkNamespace) loopbackUp() error { 370 iface, err := n.nlHandle.LinkByName("lo") 371 if err != nil { 372 return err 373 } 374 return n.nlHandle.LinkSetUp(iface) 375 } 376 377 func (n *networkNamespace) GetLoopbackIfaceName() string { 378 return "lo" 379 } 380 381 func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error { 382 iface, err := n.nlHandle.LinkByName(ifName) 383 if err != nil { 384 return err 385 } 386 return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip}) 387 } 388 389 func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error { 390 iface, err := n.nlHandle.LinkByName(ifName) 391 if err != nil { 392 return err 393 } 394 return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip}) 395 } 396 397 func (n *networkNamespace) DisableARPForVIP(srcName string) (Err error) { 398 dstName := "" 399 for _, i := range n.Interfaces() { 400 if i.SrcName() == srcName { 401 dstName = i.DstName() 402 break 403 } 404 } 405 if dstName == "" { 406 return fmt.Errorf("failed to find interface %s in sandbox", srcName) 407 } 408 409 err := n.InvokeFunc(func() { 410 path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore") 411 if err := ioutil.WriteFile(path, []byte{'1', '\n'}, 0644); err != nil { 412 Err = fmt.Errorf("Failed to set %s to 1: %v", path, err) 413 return 414 } 415 path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce") 416 if err := ioutil.WriteFile(path, []byte{'2', '\n'}, 0644); err != nil { 417 Err = fmt.Errorf("Failed to set %s to 2: %v", path, err) 418 return 419 } 420 }) 421 if err != nil { 422 return err 423 } 424 return 425 } 426 427 func (n *networkNamespace) InvokeFunc(f func()) error { 428 return nsInvoke(n.nsPath(), func(nsFD int) error { return nil }, func(callerFD int) error { 429 f() 430 return nil 431 }) 432 } 433 434 // InitOSContext initializes OS context while configuring network resources 435 func InitOSContext() func() { 436 runtime.LockOSThread() 437 if err := ns.SetNamespace(); err != nil { 438 logrus.Error(err) 439 } 440 return runtime.UnlockOSThread 441 } 442 443 func nsInvoke(path string, prefunc func(nsFD int) error, postfunc func(callerFD int) error) error { 444 defer InitOSContext()() 445 446 newNs, err := netns.GetFromPath(path) 447 if err != nil { 448 return fmt.Errorf("failed get network namespace %q: %v", path, err) 449 } 450 defer newNs.Close() 451 452 // Invoked before the namespace switch happens but after the namespace file 453 // handle is obtained. 454 if err := prefunc(int(newNs)); err != nil { 455 return fmt.Errorf("failed in prefunc: %v", err) 456 } 457 458 if err = netns.Set(newNs); err != nil { 459 return err 460 } 461 defer ns.SetNamespace() 462 463 // Invoked after the namespace switch. 464 return postfunc(ns.ParseHandlerInt()) 465 } 466 467 func (n *networkNamespace) nsPath() string { 468 n.Lock() 469 defer n.Unlock() 470 471 return n.path 472 } 473 474 func (n *networkNamespace) Info() Info { 475 return n 476 } 477 478 func (n *networkNamespace) Key() string { 479 return n.path 480 } 481 482 func (n *networkNamespace) Destroy() error { 483 if n.nlHandle != nil { 484 n.nlHandle.Delete() 485 } 486 // Assuming no running process is executing in this network namespace, 487 // unmounting is sufficient to destroy it. 488 if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil { 489 return err 490 } 491 492 // Stash it into the garbage collection list 493 addToGarbagePaths(n.path) 494 return nil 495 } 496 497 // Restore restore the network namespace 498 func (n *networkNamespace) Restore(ifsopt map[string][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error { 499 // restore interfaces 500 for name, opts := range ifsopt { 501 if !strings.Contains(name, "+") { 502 return fmt.Errorf("wrong iface name in restore osl sandbox interface: %s", name) 503 } 504 seps := strings.Split(name, "+") 505 srcName := seps[0] 506 dstPrefix := seps[1] 507 i := &nwIface{srcName: srcName, dstName: dstPrefix, ns: n} 508 i.processInterfaceOptions(opts...) 509 if i.master != "" { 510 i.dstMaster = n.findDst(i.master, true) 511 if i.dstMaster == "" { 512 return fmt.Errorf("could not find an appropriate master %q for %q", 513 i.master, i.srcName) 514 } 515 } 516 if n.isDefault { 517 i.dstName = i.srcName 518 } else { 519 links, err := n.nlHandle.LinkList() 520 if err != nil { 521 return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path) 522 } 523 // due to the docker network connect/disconnect, so the dstName should 524 // restore from the namespace 525 for _, link := range links { 526 addrs, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4) 527 if err != nil { 528 return err 529 } 530 ifaceName := link.Attrs().Name 531 if strings.HasPrefix(ifaceName, "vxlan") { 532 if i.dstName == "vxlan" { 533 i.dstName = ifaceName 534 break 535 } 536 } 537 // find the interface name by ip 538 if i.address != nil { 539 for _, addr := range addrs { 540 if addr.IPNet.String() == i.address.String() { 541 i.dstName = ifaceName 542 break 543 } 544 continue 545 } 546 if i.dstName == ifaceName { 547 break 548 } 549 } 550 // This is to find the interface name of the pair in overlay sandbox 551 if strings.HasPrefix(ifaceName, "veth") { 552 if i.master != "" && i.dstName == "veth" { 553 i.dstName = ifaceName 554 } 555 } 556 } 557 558 var index int 559 indexStr := strings.TrimPrefix(i.dstName, dstPrefix) 560 if indexStr != "" { 561 index, err = strconv.Atoi(indexStr) 562 if err != nil { 563 return err 564 } 565 } 566 index++ 567 n.Lock() 568 if index > n.nextIfIndex[dstPrefix] { 569 n.nextIfIndex[dstPrefix] = index 570 } 571 n.iFaces = append(n.iFaces, i) 572 n.Unlock() 573 } 574 } 575 576 // restore routes 577 for _, r := range routes { 578 n.Lock() 579 n.staticRoutes = append(n.staticRoutes, r) 580 n.Unlock() 581 } 582 583 // restore gateway 584 if len(gw) > 0 { 585 n.Lock() 586 n.gw = gw 587 n.Unlock() 588 } 589 590 if len(gw6) > 0 { 591 n.Lock() 592 n.gwv6 = gw6 593 n.Unlock() 594 } 595 596 return nil 597 } 598 599 // Checks whether IPv6 needs to be enabled/disabled on the loopback interface 600 func (n *networkNamespace) checkLoV6() { 601 var ( 602 enable = false 603 action = "disable" 604 ) 605 606 n.Lock() 607 for _, iface := range n.iFaces { 608 if iface.AddressIPv6() != nil { 609 enable = true 610 action = "enable" 611 break 612 } 613 } 614 n.Unlock() 615 616 if n.loV6Enabled == enable { 617 return 618 } 619 620 if err := setIPv6(n.path, "lo", enable); err != nil { 621 logrus.Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err) 622 } 623 624 n.loV6Enabled = enable 625 } 626 627 func reexecSetIPv6() { 628 runtime.LockOSThread() 629 defer runtime.UnlockOSThread() 630 631 if len(os.Args) < 3 { 632 logrus.Errorf("invalid number of arguments for %s", os.Args[0]) 633 os.Exit(1) 634 } 635 636 ns, err := netns.GetFromPath(os.Args[1]) 637 if err != nil { 638 logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err) 639 os.Exit(2) 640 } 641 defer ns.Close() 642 643 if err = netns.Set(ns); err != nil { 644 logrus.Errorf("setting into container netns %q failed: %v", os.Args[1], err) 645 os.Exit(3) 646 } 647 648 var ( 649 action = "disable" 650 value = byte('1') 651 path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", os.Args[2]) 652 ) 653 654 if os.Args[3] == "true" { 655 action = "enable" 656 value = byte('0') 657 } 658 659 if _, err := os.Stat(path); err != nil { 660 if os.IsNotExist(err) { 661 logrus.Warnf("file does not exist: %s : %v Has IPv6 been disabled in this node's kernel?", path, err) 662 os.Exit(0) 663 } 664 logrus.Errorf("failed to stat %s : %v", path, err) 665 os.Exit(5) 666 } 667 668 if err = ioutil.WriteFile(path, []byte{value, '\n'}, 0644); err != nil { 669 logrus.Errorf("failed to %s IPv6 forwarding for container's interface %s: %v", action, os.Args[2], err) 670 os.Exit(4) 671 } 672 673 os.Exit(0) 674 } 675 676 func setIPv6(path, iface string, enable bool) error { 677 cmd := &exec.Cmd{ 678 Path: reexec.Self(), 679 Args: append([]string{"set-ipv6"}, path, iface, strconv.FormatBool(enable)), 680 Stdout: os.Stdout, 681 Stderr: os.Stderr, 682 } 683 if err := cmd.Run(); err != nil { 684 return fmt.Errorf("reexec to set IPv6 failed: %v", err) 685 } 686 return nil 687 } 688 689 // ApplyOSTweaks applies linux configs on the sandbox 690 func (n *networkNamespace) ApplyOSTweaks(types []SandboxType) { 691 for _, t := range types { 692 switch t { 693 case SandboxTypeLoadBalancer: 694 kernel.ApplyOSTweaks(loadBalancerConfig) 695 } 696 } 697 }