// Source: github.com/pwn-term/docker@v0.0.0-20210616085119-6e977cce2565/libnetwork/osl/namespace_linux.go

package osl

import (
	"fmt"
	"io/ioutil"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"time"

	"github.com/docker/docker/pkg/reexec"
	"github.com/docker/libnetwork/ns"
	"github.com/docker/libnetwork/osl/kernel"
	"github.com/docker/libnetwork/types"
	"github.com/sirupsen/logrus"
	"github.com/vishvananda/netlink"
	"github.com/vishvananda/netns"
)

// defaultPrefix is the initial value of prefix, the directory under which
// the "netns" directory of namespace files is created.
const defaultPrefix = "/var/run/docker"

// init registers reexecSetIPv6 as the handler for the "set-ipv6" reexec
// command, which setIPv6 invokes through reexec.Self().
func init() {
	reexec.Register("set-ipv6", reexecSetIPv6)
}

var (
	// once guards createBasePath so the base directory is created and the
	// garbage collection goroutine is started a single time.
	once sync.Once
	// garbagePathMap holds the namespace file paths waiting to be removed
	// by removeUnusedPaths. It is accessed under gpmLock.
	garbagePathMap = make(map[string]bool)
	// gpmLock protects garbagePathMap (and the read of gpmCleanupPeriod in
	// removeUnusedPaths).
	gpmLock sync.Mutex
	// gpmWg is held while removeUnusedPaths deletes a batch of paths;
	// createNamespaceFile waits on it before creating a file.
	gpmWg sync.WaitGroup
	// gpmCleanupPeriod is the interval of the periodic cleanup ticker.
	gpmCleanupPeriod = 60 * time.Second
	// gpmChan carries on-demand cleanup requests from GC; the channel sent
	// over it is closed by removeUnusedPaths once the cleanup pass is done.
	gpmChan = make(chan chan struct{})
	// prefix is the base directory used by basePath; SetBasePath replaces it.
	prefix = defaultPrefix
	// loadBalancerConfig lists the kernel settings handed to
	// kernel.ApplyOSTweaks for sandboxes of type SandboxTypeLoadBalancer.
	loadBalancerConfig = map[string]*kernel.OSValue{
		// disables any special handling on port reuse of existing IPVS connection table entries
		// more info: https://github.com/torvalds/linux/blob/master/Documentation/networking/ipvs-sysctl.txt#L25:1
		"net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil},
		// expires connection from the IPVS connection table when the backend is not available
		// more info: https://github.com/torvalds/linux/blob/master/Documentation/networking/ipvs-sysctl.txt#L126:1
		"net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil},
		// expires persistent connections to destination servers with weights set to 0
		// more info: https://github.com/torvalds/linux/blob/master/Documentation/networking/ipvs-sysctl.txt#L144:1
		"net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil},
	}
)

// The networkNamespace type is the linux implementation of the Sandbox
// interface. It represents a linux network namespace, and moves an interface
// into it when called on method AddInterface or sets the gateway etc.
56 type networkNamespace struct { 57 path string 58 iFaces []*nwIface 59 gw net.IP 60 gwv6 net.IP 61 staticRoutes []*types.StaticRoute 62 neighbors []*neigh 63 nextIfIndex map[string]int 64 isDefault bool 65 nlHandle *netlink.Handle 66 loV6Enabled bool 67 sync.Mutex 68 } 69 70 // SetBasePath sets the base url prefix for the ns path 71 func SetBasePath(path string) { 72 prefix = path 73 } 74 75 func init() { 76 reexec.Register("netns-create", reexecCreateNamespace) 77 } 78 79 func basePath() string { 80 return filepath.Join(prefix, "netns") 81 } 82 83 func createBasePath() { 84 err := os.MkdirAll(basePath(), 0755) 85 if err != nil { 86 panic("Could not create net namespace path directory") 87 } 88 89 // Start the garbage collection go routine 90 go removeUnusedPaths() 91 } 92 93 func removeUnusedPaths() { 94 gpmLock.Lock() 95 period := gpmCleanupPeriod 96 gpmLock.Unlock() 97 98 ticker := time.NewTicker(period) 99 for { 100 var ( 101 gc chan struct{} 102 gcOk bool 103 ) 104 105 select { 106 case <-ticker.C: 107 case gc, gcOk = <-gpmChan: 108 } 109 110 gpmLock.Lock() 111 pathList := make([]string, 0, len(garbagePathMap)) 112 for path := range garbagePathMap { 113 pathList = append(pathList, path) 114 } 115 garbagePathMap = make(map[string]bool) 116 gpmWg.Add(1) 117 gpmLock.Unlock() 118 119 for _, path := range pathList { 120 os.Remove(path) 121 } 122 123 gpmWg.Done() 124 if gcOk { 125 close(gc) 126 } 127 } 128 } 129 130 func addToGarbagePaths(path string) { 131 gpmLock.Lock() 132 garbagePathMap[path] = true 133 gpmLock.Unlock() 134 } 135 136 func removeFromGarbagePaths(path string) { 137 gpmLock.Lock() 138 delete(garbagePathMap, path) 139 gpmLock.Unlock() 140 } 141 142 // GC triggers garbage collection of namespace path right away 143 // and waits for it. 
144 func GC() { 145 gpmLock.Lock() 146 if len(garbagePathMap) == 0 { 147 // No need for GC if map is empty 148 gpmLock.Unlock() 149 return 150 } 151 gpmLock.Unlock() 152 153 // if content exists in the garbage paths 154 // we can trigger GC to run, providing a 155 // channel to be notified on completion 156 waitGC := make(chan struct{}) 157 gpmChan <- waitGC 158 // wait for GC completion 159 <-waitGC 160 } 161 162 // GenerateKey generates a sandbox key based on the passed 163 // container id. 164 func GenerateKey(containerID string) string { 165 maxLen := 12 166 // Read sandbox key from host for overlay 167 if strings.HasPrefix(containerID, "-") { 168 var ( 169 index int 170 indexStr string 171 tmpkey string 172 ) 173 dir, err := ioutil.ReadDir(basePath()) 174 if err != nil { 175 return "" 176 } 177 178 for _, v := range dir { 179 id := v.Name() 180 if strings.HasSuffix(id, containerID[:maxLen-1]) { 181 indexStr = strings.TrimSuffix(id, containerID[:maxLen-1]) 182 tmpindex, err := strconv.Atoi(indexStr) 183 if err != nil { 184 return "" 185 } 186 if tmpindex > index { 187 index = tmpindex 188 tmpkey = id 189 } 190 191 } 192 } 193 containerID = tmpkey 194 if containerID == "" { 195 return "" 196 } 197 } 198 199 if len(containerID) < maxLen { 200 maxLen = len(containerID) 201 } 202 203 return basePath() + "/" + containerID[:maxLen] 204 } 205 206 // NewSandbox provides a new sandbox instance created in an os specific way 207 // provided a key which uniquely identifies the sandbox 208 func NewSandbox(key string, osCreate, isRestore bool) (Sandbox, error) { 209 if !isRestore { 210 err := createNetworkNamespace(key, osCreate) 211 if err != nil { 212 return nil, err 213 } 214 } else { 215 once.Do(createBasePath) 216 } 217 218 n := &networkNamespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)} 219 220 sboxNs, err := netns.GetFromPath(n.path) 221 if err != nil { 222 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 223 } 224 
defer sboxNs.Close() 225 226 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 227 if err != nil { 228 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 229 } 230 231 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 232 if err != nil { 233 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 234 } 235 // In live-restore mode, IPV6 entries are getting cleaned up due to below code 236 // We should retain IPV6 configurations in live-restore mode when Docker Daemon 237 // comes back. It should work as it is on other cases 238 // As starting point, disable IPv6 on all interfaces 239 if !isRestore && !n.isDefault { 240 err = setIPv6(n.path, "all", false) 241 if err != nil { 242 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 243 } 244 } 245 246 if err = n.loopbackUp(); err != nil { 247 n.nlHandle.Delete() 248 return nil, err 249 } 250 251 return n, nil 252 } 253 254 func (n *networkNamespace) InterfaceOptions() IfaceOptionSetter { 255 return n 256 } 257 258 func (n *networkNamespace) NeighborOptions() NeighborOptionSetter { 259 return n 260 } 261 262 func mountNetworkNamespace(basePath string, lnPath string) error { 263 return syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "") 264 } 265 266 // GetSandboxForExternalKey returns sandbox object for the supplied path 267 func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) { 268 if err := createNamespaceFile(key); err != nil { 269 return nil, err 270 } 271 272 if err := mountNetworkNamespace(basePath, key); err != nil { 273 return nil, err 274 } 275 n := &networkNamespace{path: key, nextIfIndex: make(map[string]int)} 276 277 sboxNs, err := netns.GetFromPath(n.path) 278 if err != nil { 279 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 280 } 281 defer sboxNs.Close() 282 283 n.nlHandle, err = netlink.NewHandleAt(sboxNs, 
syscall.NETLINK_ROUTE) 284 if err != nil { 285 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 286 } 287 288 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 289 if err != nil { 290 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 291 } 292 293 // As starting point, disable IPv6 on all interfaces 294 err = setIPv6(n.path, "all", false) 295 if err != nil { 296 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 297 } 298 299 if err = n.loopbackUp(); err != nil { 300 n.nlHandle.Delete() 301 return nil, err 302 } 303 304 return n, nil 305 } 306 307 func reexecCreateNamespace() { 308 if len(os.Args) < 2 { 309 logrus.Fatal("no namespace path provided") 310 } 311 if err := mountNetworkNamespace("/proc/self/ns/net", os.Args[1]); err != nil { 312 logrus.Fatal(err) 313 } 314 } 315 316 func createNetworkNamespace(path string, osCreate bool) error { 317 if err := createNamespaceFile(path); err != nil { 318 return err 319 } 320 321 cmd := &exec.Cmd{ 322 Path: reexec.Self(), 323 Args: append([]string{"netns-create"}, path), 324 Stdout: os.Stdout, 325 Stderr: os.Stderr, 326 } 327 if osCreate { 328 cmd.SysProcAttr = &syscall.SysProcAttr{} 329 cmd.SysProcAttr.Cloneflags = syscall.CLONE_NEWNET 330 } 331 if err := cmd.Run(); err != nil { 332 return fmt.Errorf("namespace creation reexec command failed: %v", err) 333 } 334 335 return nil 336 } 337 338 func unmountNamespaceFile(path string) { 339 if _, err := os.Stat(path); err == nil { 340 syscall.Unmount(path, syscall.MNT_DETACH) 341 } 342 } 343 344 func createNamespaceFile(path string) (err error) { 345 var f *os.File 346 347 once.Do(createBasePath) 348 // Remove it from garbage collection list if present 349 removeFromGarbagePaths(path) 350 351 // If the path is there unmount it first 352 unmountNamespaceFile(path) 353 354 // wait for garbage collection to complete if it is in progress 355 // before trying to create 
the file. 356 gpmWg.Wait() 357 358 if f, err = os.Create(path); err == nil { 359 f.Close() 360 } 361 362 return err 363 } 364 365 func (n *networkNamespace) loopbackUp() error { 366 iface, err := n.nlHandle.LinkByName("lo") 367 if err != nil { 368 return err 369 } 370 return n.nlHandle.LinkSetUp(iface) 371 } 372 373 func (n *networkNamespace) GetLoopbackIfaceName() string { 374 return "lo" 375 } 376 377 func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error { 378 iface, err := n.nlHandle.LinkByName(ifName) 379 if err != nil { 380 return err 381 } 382 return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip}) 383 } 384 385 func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error { 386 iface, err := n.nlHandle.LinkByName(ifName) 387 if err != nil { 388 return err 389 } 390 return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip}) 391 } 392 393 func (n *networkNamespace) DisableARPForVIP(srcName string) (Err error) { 394 dstName := "" 395 for _, i := range n.Interfaces() { 396 if i.SrcName() == srcName { 397 dstName = i.DstName() 398 break 399 } 400 } 401 if dstName == "" { 402 return fmt.Errorf("failed to find interface %s in sandbox", srcName) 403 } 404 405 err := n.InvokeFunc(func() { 406 path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore") 407 if err := ioutil.WriteFile(path, []byte{'1', '\n'}, 0644); err != nil { 408 Err = fmt.Errorf("Failed to set %s to 1: %v", path, err) 409 return 410 } 411 path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce") 412 if err := ioutil.WriteFile(path, []byte{'2', '\n'}, 0644); err != nil { 413 Err = fmt.Errorf("Failed to set %s to 2: %v", path, err) 414 return 415 } 416 }) 417 if err != nil { 418 return err 419 } 420 return 421 } 422 423 func (n *networkNamespace) InvokeFunc(f func()) error { 424 return nsInvoke(n.nsPath(), func(nsFD int) error { return nil }, func(callerFD int) error { 425 f() 426 return nil 427 }) 428 } 429 430 // InitOSContext 
initializes OS context while configuring network resources 431 func InitOSContext() func() { 432 runtime.LockOSThread() 433 if err := ns.SetNamespace(); err != nil { 434 logrus.Error(err) 435 } 436 return runtime.UnlockOSThread 437 } 438 439 func nsInvoke(path string, prefunc func(nsFD int) error, postfunc func(callerFD int) error) error { 440 defer InitOSContext()() 441 442 newNs, err := netns.GetFromPath(path) 443 if err != nil { 444 return fmt.Errorf("failed get network namespace %q: %v", path, err) 445 } 446 defer newNs.Close() 447 448 // Invoked before the namespace switch happens but after the namespace file 449 // handle is obtained. 450 if err := prefunc(int(newNs)); err != nil { 451 return fmt.Errorf("failed in prefunc: %v", err) 452 } 453 454 if err = netns.Set(newNs); err != nil { 455 return err 456 } 457 defer ns.SetNamespace() 458 459 // Invoked after the namespace switch. 460 return postfunc(ns.ParseHandlerInt()) 461 } 462 463 func (n *networkNamespace) nsPath() string { 464 n.Lock() 465 defer n.Unlock() 466 467 return n.path 468 } 469 470 func (n *networkNamespace) Info() Info { 471 return n 472 } 473 474 func (n *networkNamespace) Key() string { 475 return n.path 476 } 477 478 func (n *networkNamespace) Destroy() error { 479 if n.nlHandle != nil { 480 n.nlHandle.Delete() 481 } 482 // Assuming no running process is executing in this network namespace, 483 // unmounting is sufficient to destroy it. 
484 if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil { 485 return err 486 } 487 488 // Stash it into the garbage collection list 489 addToGarbagePaths(n.path) 490 return nil 491 } 492 493 // Restore restore the network namespace 494 func (n *networkNamespace) Restore(ifsopt map[string][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error { 495 // restore interfaces 496 for name, opts := range ifsopt { 497 if !strings.Contains(name, "+") { 498 return fmt.Errorf("wrong iface name in restore osl sandbox interface: %s", name) 499 } 500 seps := strings.Split(name, "+") 501 srcName := seps[0] 502 dstPrefix := seps[1] 503 i := &nwIface{srcName: srcName, dstName: dstPrefix, ns: n} 504 i.processInterfaceOptions(opts...) 505 if i.master != "" { 506 i.dstMaster = n.findDst(i.master, true) 507 if i.dstMaster == "" { 508 return fmt.Errorf("could not find an appropriate master %q for %q", 509 i.master, i.srcName) 510 } 511 } 512 if n.isDefault { 513 i.dstName = i.srcName 514 } else { 515 links, err := n.nlHandle.LinkList() 516 if err != nil { 517 return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path) 518 } 519 // due to the docker network connect/disconnect, so the dstName should 520 // restore from the namespace 521 for _, link := range links { 522 addrs, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4) 523 if err != nil { 524 return err 525 } 526 ifaceName := link.Attrs().Name 527 if strings.HasPrefix(ifaceName, "vxlan") { 528 if i.dstName == "vxlan" { 529 i.dstName = ifaceName 530 break 531 } 532 } 533 // find the interface name by ip 534 if i.address != nil { 535 for _, addr := range addrs { 536 if addr.IPNet.String() == i.address.String() { 537 i.dstName = ifaceName 538 break 539 } 540 continue 541 } 542 if i.dstName == ifaceName { 543 break 544 } 545 } 546 // This is to find the interface name of the pair in overlay sandbox 547 if strings.HasPrefix(ifaceName, "veth") { 548 if i.master != 
"" && i.dstName == "veth" { 549 i.dstName = ifaceName 550 } 551 } 552 } 553 554 var index int 555 indexStr := strings.TrimPrefix(i.dstName, dstPrefix) 556 if indexStr != "" { 557 index, err = strconv.Atoi(indexStr) 558 if err != nil { 559 return err 560 } 561 } 562 index++ 563 n.Lock() 564 if index > n.nextIfIndex[dstPrefix] { 565 n.nextIfIndex[dstPrefix] = index 566 } 567 n.iFaces = append(n.iFaces, i) 568 n.Unlock() 569 } 570 } 571 572 // restore routes 573 for _, r := range routes { 574 n.Lock() 575 n.staticRoutes = append(n.staticRoutes, r) 576 n.Unlock() 577 } 578 579 // restore gateway 580 if len(gw) > 0 { 581 n.Lock() 582 n.gw = gw 583 n.Unlock() 584 } 585 586 if len(gw6) > 0 { 587 n.Lock() 588 n.gwv6 = gw6 589 n.Unlock() 590 } 591 592 return nil 593 } 594 595 // Checks whether IPv6 needs to be enabled/disabled on the loopback interface 596 func (n *networkNamespace) checkLoV6() { 597 var ( 598 enable = false 599 action = "disable" 600 ) 601 602 n.Lock() 603 for _, iface := range n.iFaces { 604 if iface.AddressIPv6() != nil { 605 enable = true 606 action = "enable" 607 break 608 } 609 } 610 n.Unlock() 611 612 if n.loV6Enabled == enable { 613 return 614 } 615 616 if err := setIPv6(n.path, "lo", enable); err != nil { 617 logrus.Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err) 618 } 619 620 n.loV6Enabled = enable 621 } 622 623 func reexecSetIPv6() { 624 runtime.LockOSThread() 625 defer runtime.UnlockOSThread() 626 627 if len(os.Args) < 3 { 628 logrus.Errorf("invalid number of arguments for %s", os.Args[0]) 629 os.Exit(1) 630 } 631 632 ns, err := netns.GetFromPath(os.Args[1]) 633 if err != nil { 634 logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err) 635 os.Exit(2) 636 } 637 defer ns.Close() 638 639 if err = netns.Set(ns); err != nil { 640 logrus.Errorf("setting into container netns %q failed: %v", os.Args[1], err) 641 os.Exit(3) 642 } 643 644 var ( 645 action = "disable" 646 value = byte('1') 
647 path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", os.Args[2]) 648 ) 649 650 if os.Args[3] == "true" { 651 action = "enable" 652 value = byte('0') 653 } 654 655 if _, err := os.Stat(path); err != nil { 656 if os.IsNotExist(err) { 657 logrus.Warnf("file does not exist: %s : %v Has IPv6 been disabled in this node's kernel?", path, err) 658 os.Exit(0) 659 } 660 logrus.Errorf("failed to stat %s : %v", path, err) 661 os.Exit(5) 662 } 663 664 if err = ioutil.WriteFile(path, []byte{value, '\n'}, 0644); err != nil { 665 logrus.Errorf("failed to %s IPv6 forwarding for container's interface %s: %v", action, os.Args[2], err) 666 os.Exit(4) 667 } 668 669 os.Exit(0) 670 } 671 672 func setIPv6(path, iface string, enable bool) error { 673 cmd := &exec.Cmd{ 674 Path: reexec.Self(), 675 Args: append([]string{"set-ipv6"}, path, iface, strconv.FormatBool(enable)), 676 Stdout: os.Stdout, 677 Stderr: os.Stderr, 678 } 679 if err := cmd.Run(); err != nil { 680 return fmt.Errorf("reexec to set IPv6 failed: %v", err) 681 } 682 return nil 683 } 684 685 // ApplyOSTweaks applies linux configs on the sandbox 686 func (n *networkNamespace) ApplyOSTweaks(types []SandboxType) { 687 for _, t := range types { 688 switch t { 689 case SandboxTypeLoadBalancer: 690 kernel.ApplyOSTweaks(loadBalancerConfig) 691 } 692 } 693 }