github.com/rumpl/bof@v23.0.0-rc.2+incompatible/libnetwork/osl/namespace_linux.go (about) 1 package osl 2 3 import ( 4 "errors" 5 "fmt" 6 "net" 7 "os" 8 "os/exec" 9 "path/filepath" 10 "runtime" 11 "strconv" 12 "strings" 13 "sync" 14 "syscall" 15 "time" 16 17 "github.com/docker/docker/libnetwork/ns" 18 "github.com/docker/docker/libnetwork/osl/kernel" 19 "github.com/docker/docker/libnetwork/types" 20 "github.com/docker/docker/pkg/reexec" 21 "github.com/sirupsen/logrus" 22 "github.com/vishvananda/netlink" 23 "github.com/vishvananda/netns" 24 "golang.org/x/sys/unix" 25 ) 26 27 const defaultPrefix = "/var/run/docker" 28 29 func init() { 30 reexec.Register("set-ipv6", reexecSetIPv6) 31 } 32 33 var ( 34 once sync.Once 35 garbagePathMap = make(map[string]bool) 36 gpmLock sync.Mutex 37 gpmWg sync.WaitGroup 38 gpmCleanupPeriod = 60 * time.Second 39 gpmChan = make(chan chan struct{}) 40 prefix = defaultPrefix 41 ) 42 43 // The networkNamespace type is the linux implementation of the Sandbox 44 // interface. It represents a linux network namespace, and moves an interface 45 // into it when called on method AddInterface or sets the gateway etc. 46 type networkNamespace struct { 47 path string 48 iFaces []*nwIface 49 gw net.IP 50 gwv6 net.IP 51 staticRoutes []*types.StaticRoute 52 neighbors []*neigh 53 nextIfIndex map[string]int 54 isDefault bool 55 nlHandle *netlink.Handle 56 loV6Enabled bool 57 sync.Mutex 58 } 59 60 // SetBasePath sets the base url prefix for the ns path 61 func SetBasePath(path string) { 62 prefix = path 63 } 64 65 func init() { 66 reexec.Register("netns-create", reexecCreateNamespace) 67 } 68 69 func basePath() string { 70 return filepath.Join(prefix, "netns") 71 } 72 73 func createBasePath() { 74 err := os.MkdirAll(basePath(), 0755) 75 if err != nil { 76 panic("Could not create net namespace path directory") 77 } 78 79 // Start the garbage collection go routine 80 go removeUnusedPaths() 81 } 82 83 func removeUnusedPaths() { 84 gpmLock.Lock() 85 period := gpmCleanupPeriod 86 gpmLock.Unlock() 87 88 ticker := time.NewTicker(period) 89 for { 90 var ( 91 gc chan struct{} 92 gcOk bool 93 ) 94 95 select { 96 case <-ticker.C: 97 case gc, gcOk = <-gpmChan: 98 } 99 100 gpmLock.Lock() 101 pathList := make([]string, 0, len(garbagePathMap)) 102 for path := range garbagePathMap { 103 pathList = append(pathList, path) 104 } 105 garbagePathMap = make(map[string]bool) 106 gpmWg.Add(1) 107 gpmLock.Unlock() 108 109 for _, path := range pathList { 110 os.Remove(path) 111 } 112 113 gpmWg.Done() 114 if gcOk { 115 close(gc) 116 } 117 } 118 } 119 120 func addToGarbagePaths(path string) { 121 gpmLock.Lock() 122 garbagePathMap[path] = true 123 gpmLock.Unlock() 124 } 125 126 func removeFromGarbagePaths(path string) { 127 gpmLock.Lock() 128 delete(garbagePathMap, path) 129 gpmLock.Unlock() 130 } 131 132 // GC triggers garbage collection of namespace path right away 133 // and waits for it. 134 func GC() { 135 gpmLock.Lock() 136 if len(garbagePathMap) == 0 { 137 // No need for GC if map is empty 138 gpmLock.Unlock() 139 return 140 } 141 gpmLock.Unlock() 142 143 // if content exists in the garbage paths 144 // we can trigger GC to run, providing a 145 // channel to be notified on completion 146 waitGC := make(chan struct{}) 147 gpmChan <- waitGC 148 // wait for GC completion 149 <-waitGC 150 } 151 152 // GenerateKey generates a sandbox key based on the passed 153 // container id. 154 func GenerateKey(containerID string) string { 155 maxLen := 12 156 // Read sandbox key from host for overlay 157 if strings.HasPrefix(containerID, "-") { 158 var ( 159 index int 160 indexStr string 161 tmpkey string 162 ) 163 dir, err := os.ReadDir(basePath()) 164 if err != nil { 165 return "" 166 } 167 168 for _, v := range dir { 169 id := v.Name() 170 if strings.HasSuffix(id, containerID[:maxLen-1]) { 171 indexStr = strings.TrimSuffix(id, containerID[:maxLen-1]) 172 tmpindex, err := strconv.Atoi(indexStr) 173 if err != nil { 174 return "" 175 } 176 if tmpindex > index { 177 index = tmpindex 178 tmpkey = id 179 } 180 181 } 182 } 183 containerID = tmpkey 184 if containerID == "" { 185 return "" 186 } 187 } 188 189 if len(containerID) < maxLen { 190 maxLen = len(containerID) 191 } 192 193 return basePath() + "/" + containerID[:maxLen] 194 } 195 196 // NewSandbox provides a new sandbox instance created in an os specific way 197 // provided a key which uniquely identifies the sandbox 198 func NewSandbox(key string, osCreate, isRestore bool) (Sandbox, error) { 199 if !isRestore { 200 err := createNetworkNamespace(key, osCreate) 201 if err != nil { 202 return nil, err 203 } 204 } else { 205 once.Do(createBasePath) 206 } 207 208 n := &networkNamespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)} 209 210 sboxNs, err := netns.GetFromPath(n.path) 211 if err != nil { 212 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 213 } 214 defer sboxNs.Close() 215 216 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 217 if err != nil { 218 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 219 } 220 221 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 222 if err != nil { 223 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 224 } 225 // In live-restore mode, IPV6 entries are getting cleaned up due to below code 226 // We should retain IPV6 configurations in live-restore mode when Docker Daemon 227 // comes back. It should work as it is on other cases 228 // As starting point, disable IPv6 on all interfaces 229 if !isRestore && !n.isDefault { 230 err = setIPv6(n.path, "all", false) 231 if err != nil { 232 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 233 } 234 } 235 236 if err = n.loopbackUp(); err != nil { 237 n.nlHandle.Close() 238 return nil, err 239 } 240 241 return n, nil 242 } 243 244 func (n *networkNamespace) InterfaceOptions() IfaceOptionSetter { 245 return n 246 } 247 248 func (n *networkNamespace) NeighborOptions() NeighborOptionSetter { 249 return n 250 } 251 252 func mountNetworkNamespace(basePath string, lnPath string) error { 253 return syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "") 254 } 255 256 // GetSandboxForExternalKey returns sandbox object for the supplied path 257 func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) { 258 if err := createNamespaceFile(key); err != nil { 259 return nil, err 260 } 261 262 if err := mountNetworkNamespace(basePath, key); err != nil { 263 return nil, err 264 } 265 n := &networkNamespace{path: key, nextIfIndex: make(map[string]int)} 266 267 sboxNs, err := netns.GetFromPath(n.path) 268 if err != nil { 269 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 270 } 271 defer sboxNs.Close() 272 273 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 274 if err != nil { 275 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 276 } 277 278 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 279 if err != nil { 280 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 281 } 282 283 // As starting point, disable IPv6 on all interfaces 284 err = setIPv6(n.path, "all", false) 285 if err != nil { 286 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 287 } 288 289 if err = n.loopbackUp(); err != nil { 290 n.nlHandle.Close() 291 return nil, err 292 } 293 294 return n, nil 295 } 296 297 func reexecCreateNamespace() { 298 if len(os.Args) < 2 { 299 logrus.Fatal("no namespace path provided") 300 } 301 if err := mountNetworkNamespace("/proc/self/ns/net", os.Args[1]); err != nil { 302 logrus.Fatal(err) 303 } 304 } 305 306 func createNetworkNamespace(path string, osCreate bool) error { 307 if err := createNamespaceFile(path); err != nil { 308 return err 309 } 310 311 cmd := &exec.Cmd{ 312 Path: reexec.Self(), 313 Args: append([]string{"netns-create"}, path), 314 Stdout: os.Stdout, 315 Stderr: os.Stderr, 316 } 317 if osCreate { 318 cmd.SysProcAttr = &syscall.SysProcAttr{} 319 cmd.SysProcAttr.Cloneflags = syscall.CLONE_NEWNET 320 } 321 if err := cmd.Run(); err != nil { 322 return fmt.Errorf("namespace creation reexec command failed: %v", err) 323 } 324 325 return nil 326 } 327 328 func unmountNamespaceFile(path string) { 329 if _, err := os.Stat(path); err == nil { 330 if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) { 331 logrus.WithError(err).Error("Error unmounting namespace file") 332 } 333 } 334 } 335 336 func createNamespaceFile(path string) (err error) { 337 var f *os.File 338 339 once.Do(createBasePath) 340 // Remove it from garbage collection list if present 341 removeFromGarbagePaths(path) 342 343 // If the path is there unmount it first 344 unmountNamespaceFile(path) 345 346 // wait for garbage collection to complete if it is in progress 347 // before trying to create the file. 348 gpmWg.Wait() 349 350 if f, err = os.Create(path); err == nil { 351 f.Close() 352 } 353 354 return err 355 } 356 357 func (n *networkNamespace) loopbackUp() error { 358 iface, err := n.nlHandle.LinkByName("lo") 359 if err != nil { 360 return err 361 } 362 return n.nlHandle.LinkSetUp(iface) 363 } 364 365 func (n *networkNamespace) GetLoopbackIfaceName() string { 366 return "lo" 367 } 368 369 func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error { 370 iface, err := n.nlHandle.LinkByName(ifName) 371 if err != nil { 372 return err 373 } 374 return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip}) 375 } 376 377 func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error { 378 iface, err := n.nlHandle.LinkByName(ifName) 379 if err != nil { 380 return err 381 } 382 return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip}) 383 } 384 385 func (n *networkNamespace) DisableARPForVIP(srcName string) (Err error) { 386 dstName := "" 387 for _, i := range n.Interfaces() { 388 if i.SrcName() == srcName { 389 dstName = i.DstName() 390 break 391 } 392 } 393 if dstName == "" { 394 return fmt.Errorf("failed to find interface %s in sandbox", srcName) 395 } 396 397 err := n.InvokeFunc(func() { 398 path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore") 399 if err := os.WriteFile(path, []byte{'1', '\n'}, 0644); err != nil { 400 Err = fmt.Errorf("Failed to set %s to 1: %v", path, err) 401 return 402 } 403 path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce") 404 if err := os.WriteFile(path, []byte{'2', '\n'}, 0644); err != nil { 405 Err = fmt.Errorf("Failed to set %s to 2: %v", path, err) 406 return 407 } 408 }) 409 if err != nil { 410 return err 411 } 412 return 413 } 414 415 func (n *networkNamespace) InvokeFunc(f func()) error { 416 return nsInvoke(n.nsPath(), func(nsFD int) error { return nil }, func(callerFD int) error { 417 f() 418 return nil 419 }) 420 } 421 422 // InitOSContext initializes OS context while configuring network resources 423 func InitOSContext() func() { 424 runtime.LockOSThread() 425 if err := ns.SetNamespace(); err != nil { 426 logrus.Error(err) 427 } 428 return runtime.UnlockOSThread 429 } 430 431 func nsInvoke(path string, prefunc func(nsFD int) error, postfunc func(callerFD int) error) error { 432 defer InitOSContext()() 433 434 newNs, err := netns.GetFromPath(path) 435 if err != nil { 436 return fmt.Errorf("failed get network namespace %q: %v", path, err) 437 } 438 defer newNs.Close() 439 440 // Invoked before the namespace switch happens but after the namespace file 441 // handle is obtained. 442 if err := prefunc(int(newNs)); err != nil { 443 return fmt.Errorf("failed in prefunc: %v", err) 444 } 445 446 if err = netns.Set(newNs); err != nil { 447 return err 448 } 449 defer ns.SetNamespace() 450 451 // Invoked after the namespace switch. 452 return postfunc(ns.ParseHandlerInt()) 453 } 454 455 func (n *networkNamespace) nsPath() string { 456 n.Lock() 457 defer n.Unlock() 458 459 return n.path 460 } 461 462 func (n *networkNamespace) Info() Info { 463 return n 464 } 465 466 func (n *networkNamespace) Key() string { 467 return n.path 468 } 469 470 func (n *networkNamespace) Destroy() error { 471 if n.nlHandle != nil { 472 n.nlHandle.Close() 473 } 474 // Assuming no running process is executing in this network namespace, 475 // unmounting is sufficient to destroy it. 476 if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil { 477 return err 478 } 479 480 // Stash it into the garbage collection list 481 addToGarbagePaths(n.path) 482 return nil 483 } 484 485 // Restore restore the network namespace 486 func (n *networkNamespace) Restore(ifsopt map[string][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error { 487 // restore interfaces 488 for name, opts := range ifsopt { 489 if !strings.Contains(name, "+") { 490 return fmt.Errorf("wrong iface name in restore osl sandbox interface: %s", name) 491 } 492 seps := strings.Split(name, "+") 493 srcName := seps[0] 494 dstPrefix := seps[1] 495 i := &nwIface{srcName: srcName, dstName: dstPrefix, ns: n} 496 i.processInterfaceOptions(opts...) 497 if i.master != "" { 498 i.dstMaster = n.findDst(i.master, true) 499 if i.dstMaster == "" { 500 return fmt.Errorf("could not find an appropriate master %q for %q", 501 i.master, i.srcName) 502 } 503 } 504 if n.isDefault { 505 i.dstName = i.srcName 506 } else { 507 links, err := n.nlHandle.LinkList() 508 if err != nil { 509 return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path) 510 } 511 // due to the docker network connect/disconnect, so the dstName should 512 // restore from the namespace 513 for _, link := range links { 514 addrs, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4) 515 if err != nil { 516 return err 517 } 518 ifaceName := link.Attrs().Name 519 if strings.HasPrefix(ifaceName, "vxlan") { 520 if i.dstName == "vxlan" { 521 i.dstName = ifaceName 522 break 523 } 524 } 525 // find the interface name by ip 526 if i.address != nil { 527 for _, addr := range addrs { 528 if addr.IPNet.String() == i.address.String() { 529 i.dstName = ifaceName 530 break 531 } 532 continue 533 } 534 if i.dstName == ifaceName { 535 break 536 } 537 } 538 // This is to find the interface name of the pair in overlay sandbox 539 if strings.HasPrefix(ifaceName, "veth") { 540 if i.master != "" && i.dstName == "veth" { 541 i.dstName = ifaceName 542 } 543 } 544 } 545 546 var index int 547 indexStr := strings.TrimPrefix(i.dstName, dstPrefix) 548 if indexStr != "" { 549 index, err = strconv.Atoi(indexStr) 550 if err != nil { 551 return err 552 } 553 } 554 index++ 555 n.Lock() 556 if index > n.nextIfIndex[dstPrefix] { 557 n.nextIfIndex[dstPrefix] = index 558 } 559 n.iFaces = append(n.iFaces, i) 560 n.Unlock() 561 } 562 } 563 564 // restore routes 565 for _, r := range routes { 566 n.Lock() 567 n.staticRoutes = append(n.staticRoutes, r) 568 n.Unlock() 569 } 570 571 // restore gateway 572 if len(gw) > 0 { 573 n.Lock() 574 n.gw = gw 575 n.Unlock() 576 } 577 578 if len(gw6) > 0 { 579 n.Lock() 580 n.gwv6 = gw6 581 n.Unlock() 582 } 583 584 return nil 585 } 586 587 // Checks whether IPv6 needs to be enabled/disabled on the loopback interface 588 func (n *networkNamespace) checkLoV6() { 589 var ( 590 enable = false 591 action = "disable" 592 ) 593 594 n.Lock() 595 for _, iface := range n.iFaces { 596 if iface.AddressIPv6() != nil { 597 enable = true 598 action = "enable" 599 break 600 } 601 } 602 n.Unlock() 603 604 if n.loV6Enabled == enable { 605 return 606 } 607 608 if err := setIPv6(n.path, "lo", enable); err != nil { 609 logrus.Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err) 610 } 611 612 n.loV6Enabled = enable 613 } 614 615 func reexecSetIPv6() { 616 runtime.LockOSThread() 617 defer runtime.UnlockOSThread() 618 619 if len(os.Args) < 3 { 620 logrus.Errorf("invalid number of arguments for %s", os.Args[0]) 621 os.Exit(1) 622 } 623 624 ns, err := netns.GetFromPath(os.Args[1]) 625 if err != nil { 626 logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err) 627 os.Exit(2) 628 } 629 defer ns.Close() 630 631 if err = netns.Set(ns); err != nil { 632 logrus.Errorf("setting into container netns %q failed: %v", os.Args[1], err) 633 os.Exit(3) 634 } 635 636 var ( 637 action = "disable" 638 value = byte('1') 639 path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", os.Args[2]) 640 ) 641 642 if os.Args[3] == "true" { 643 action = "enable" 644 value = byte('0') 645 } 646 647 if _, err := os.Stat(path); err != nil { 648 if os.IsNotExist(err) { 649 logrus.Warnf("file does not exist: %s : %v Has IPv6 been disabled in this node's kernel?", path, err) 650 os.Exit(0) 651 } 652 logrus.Errorf("failed to stat %s : %v", path, err) 653 os.Exit(5) 654 } 655 656 if err = os.WriteFile(path, []byte{value, '\n'}, 0644); err != nil { 657 logrus.Errorf("failed to %s IPv6 forwarding for container's interface %s: %v", action, os.Args[2], err) 658 os.Exit(4) 659 } 660 661 os.Exit(0) 662 } 663 664 func setIPv6(path, iface string, enable bool) error { 665 cmd := &exec.Cmd{ 666 Path: reexec.Self(), 667 Args: append([]string{"set-ipv6"}, path, iface, strconv.FormatBool(enable)), 668 Stdout: os.Stdout, 669 Stderr: os.Stderr, 670 } 671 if err := cmd.Run(); err != nil { 672 return fmt.Errorf("reexec to set IPv6 failed: %v", err) 673 } 674 return nil 675 } 676 677 // ApplyOSTweaks applies linux configs on the sandbox 678 func (n *networkNamespace) ApplyOSTweaks(types []SandboxType) { 679 for _, t := range types { 680 switch t { 681 case SandboxTypeLoadBalancer, SandboxTypeIngress: 682 kernel.ApplyOSTweaks(map[string]*kernel.OSValue{ 683 // disables any special handling on port reuse of existing IPVS connection table entries 684 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32 685 "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil}, 686 // expires connection from the IPVS connection table when the backend is not available 687 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133 688 "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil}, 689 // expires persistent connections to destination servers with weights set to 0 690 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151 691 "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil}, 692 }) 693 } 694 } 695 }