github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/libnetwork/osl/namespace_linux.go (about) 1 package osl 2 3 import ( 4 "fmt" 5 "io/ioutil" 6 "net" 7 "os" 8 "os/exec" 9 "path/filepath" 10 "runtime" 11 "strconv" 12 "strings" 13 "sync" 14 "syscall" 15 "time" 16 17 "github.com/docker/docker/pkg/reexec" 18 "github.com/docker/libnetwork/ns" 19 "github.com/docker/libnetwork/osl/kernel" 20 "github.com/docker/libnetwork/types" 21 "github.com/sirupsen/logrus" 22 "github.com/vishvananda/netlink" 23 "github.com/vishvananda/netns" 24 ) 25 26 const defaultPrefix = "/var/run/docker" 27 28 func init() { 29 reexec.Register("set-ipv6", reexecSetIPv6) 30 } 31 32 var ( 33 once sync.Once 34 garbagePathMap = make(map[string]bool) 35 gpmLock sync.Mutex 36 gpmWg sync.WaitGroup 37 gpmCleanupPeriod = 60 * time.Second 38 gpmChan = make(chan chan struct{}) 39 prefix = defaultPrefix 40 ) 41 42 // The networkNamespace type is the linux implementation of the Sandbox 43 // interface. It represents a linux network namespace, and moves an interface 44 // into it when called on method AddInterface or sets the gateway etc. 45 type networkNamespace struct { 46 path string 47 iFaces []*nwIface 48 gw net.IP 49 gwv6 net.IP 50 staticRoutes []*types.StaticRoute 51 neighbors []*neigh 52 nextIfIndex map[string]int 53 isDefault bool 54 nlHandle *netlink.Handle 55 loV6Enabled bool 56 sync.Mutex 57 } 58 59 // SetBasePath sets the base url prefix for the ns path 60 func SetBasePath(path string) { 61 prefix = path 62 } 63 64 func init() { 65 reexec.Register("netns-create", reexecCreateNamespace) 66 } 67 68 func basePath() string { 69 return filepath.Join(prefix, "netns") 70 } 71 72 func createBasePath() { 73 err := os.MkdirAll(basePath(), 0755) 74 if err != nil { 75 panic("Could not create net namespace path directory") 76 } 77 78 // Start the garbage collection go routine 79 go removeUnusedPaths() 80 } 81 82 func removeUnusedPaths() { 83 gpmLock.Lock() 84 period := gpmCleanupPeriod 85 gpmLock.Unlock() 86 87 ticker := time.NewTicker(period) 88 for { 89 var ( 90 gc chan struct{} 91 gcOk bool 92 ) 93 94 select { 95 case <-ticker.C: 96 case gc, gcOk = <-gpmChan: 97 } 98 99 gpmLock.Lock() 100 pathList := make([]string, 0, len(garbagePathMap)) 101 for path := range garbagePathMap { 102 pathList = append(pathList, path) 103 } 104 garbagePathMap = make(map[string]bool) 105 gpmWg.Add(1) 106 gpmLock.Unlock() 107 108 for _, path := range pathList { 109 os.Remove(path) 110 } 111 112 gpmWg.Done() 113 if gcOk { 114 close(gc) 115 } 116 } 117 } 118 119 func addToGarbagePaths(path string) { 120 gpmLock.Lock() 121 garbagePathMap[path] = true 122 gpmLock.Unlock() 123 } 124 125 func removeFromGarbagePaths(path string) { 126 gpmLock.Lock() 127 delete(garbagePathMap, path) 128 gpmLock.Unlock() 129 } 130 131 // GC triggers garbage collection of namespace path right away 132 // and waits for it. 133 func GC() { 134 gpmLock.Lock() 135 if len(garbagePathMap) == 0 { 136 // No need for GC if map is empty 137 gpmLock.Unlock() 138 return 139 } 140 gpmLock.Unlock() 141 142 // if content exists in the garbage paths 143 // we can trigger GC to run, providing a 144 // channel to be notified on completion 145 waitGC := make(chan struct{}) 146 gpmChan <- waitGC 147 // wait for GC completion 148 <-waitGC 149 } 150 151 // GenerateKey generates a sandbox key based on the passed 152 // container id. 153 func GenerateKey(containerID string) string { 154 maxLen := 12 155 // Read sandbox key from host for overlay 156 if strings.HasPrefix(containerID, "-") { 157 var ( 158 index int 159 indexStr string 160 tmpkey string 161 ) 162 dir, err := ioutil.ReadDir(basePath()) 163 if err != nil { 164 return "" 165 } 166 167 for _, v := range dir { 168 id := v.Name() 169 if strings.HasSuffix(id, containerID[:maxLen-1]) { 170 indexStr = strings.TrimSuffix(id, containerID[:maxLen-1]) 171 tmpindex, err := strconv.Atoi(indexStr) 172 if err != nil { 173 return "" 174 } 175 if tmpindex > index { 176 index = tmpindex 177 tmpkey = id 178 } 179 180 } 181 } 182 containerID = tmpkey 183 if containerID == "" { 184 return "" 185 } 186 } 187 188 if len(containerID) < maxLen { 189 maxLen = len(containerID) 190 } 191 192 return basePath() + "/" + containerID[:maxLen] 193 } 194 195 // NewSandbox provides a new sandbox instance created in an os specific way 196 // provided a key which uniquely identifies the sandbox 197 func NewSandbox(key string, osCreate, isRestore bool) (Sandbox, error) { 198 if !isRestore { 199 err := createNetworkNamespace(key, osCreate) 200 if err != nil { 201 return nil, err 202 } 203 } else { 204 once.Do(createBasePath) 205 } 206 207 n := &networkNamespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)} 208 209 sboxNs, err := netns.GetFromPath(n.path) 210 if err != nil { 211 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 212 } 213 defer sboxNs.Close() 214 215 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 216 if err != nil { 217 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 218 } 219 220 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 221 if err != nil { 222 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 223 } 224 // In live-restore mode, IPV6 entries are getting cleaned up due to below code 225 // We should retain IPV6 configurations in live-restore mode when Docker Daemon 226 // comes back. It should work as it is on other cases 227 // As starting point, disable IPv6 on all interfaces 228 if !isRestore && !n.isDefault { 229 err = setIPv6(n.path, "all", false) 230 if err != nil { 231 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 232 } 233 } 234 235 if err = n.loopbackUp(); err != nil { 236 n.nlHandle.Delete() 237 return nil, err 238 } 239 240 return n, nil 241 } 242 243 func (n *networkNamespace) InterfaceOptions() IfaceOptionSetter { 244 return n 245 } 246 247 func (n *networkNamespace) NeighborOptions() NeighborOptionSetter { 248 return n 249 } 250 251 func mountNetworkNamespace(basePath string, lnPath string) error { 252 return syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "") 253 } 254 255 // GetSandboxForExternalKey returns sandbox object for the supplied path 256 func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) { 257 if err := createNamespaceFile(key); err != nil { 258 return nil, err 259 } 260 261 if err := mountNetworkNamespace(basePath, key); err != nil { 262 return nil, err 263 } 264 n := &networkNamespace{path: key, nextIfIndex: make(map[string]int)} 265 266 sboxNs, err := netns.GetFromPath(n.path) 267 if err != nil { 268 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 269 } 270 defer sboxNs.Close() 271 272 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 273 if err != nil { 274 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 275 } 276 277 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 278 if err != nil { 279 logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 280 } 281 282 // As starting point, disable IPv6 on all interfaces 283 err = setIPv6(n.path, "all", false) 284 if err != nil { 285 logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 286 } 287 288 if err = n.loopbackUp(); err != nil { 289 n.nlHandle.Delete() 290 return nil, err 291 } 292 293 return n, nil 294 } 295 296 func reexecCreateNamespace() { 297 if len(os.Args) < 2 { 298 logrus.Fatal("no namespace path provided") 299 } 300 if err := mountNetworkNamespace("/proc/self/ns/net", os.Args[1]); err != nil { 301 logrus.Fatal(err) 302 } 303 } 304 305 func createNetworkNamespace(path string, osCreate bool) error { 306 if err := createNamespaceFile(path); err != nil { 307 return err 308 } 309 310 cmd := &exec.Cmd{ 311 Path: reexec.Self(), 312 Args: append([]string{"netns-create"}, path), 313 Stdout: os.Stdout, 314 Stderr: os.Stderr, 315 } 316 if osCreate { 317 cmd.SysProcAttr = &syscall.SysProcAttr{} 318 cmd.SysProcAttr.Cloneflags = syscall.CLONE_NEWNET 319 } 320 if err := cmd.Run(); err != nil { 321 return fmt.Errorf("namespace creation reexec command failed: %v", err) 322 } 323 324 return nil 325 } 326 327 func unmountNamespaceFile(path string) { 328 if _, err := os.Stat(path); err == nil { 329 syscall.Unmount(path, syscall.MNT_DETACH) 330 } 331 } 332 333 func createNamespaceFile(path string) (err error) { 334 var f *os.File 335 336 once.Do(createBasePath) 337 // Remove it from garbage collection list if present 338 removeFromGarbagePaths(path) 339 340 // If the path is there unmount it first 341 unmountNamespaceFile(path) 342 343 // wait for garbage collection to complete if it is in progress 344 // before trying to create the file. 345 gpmWg.Wait() 346 347 if f, err = os.Create(path); err == nil { 348 f.Close() 349 } 350 351 return err 352 } 353 354 func (n *networkNamespace) loopbackUp() error { 355 iface, err := n.nlHandle.LinkByName("lo") 356 if err != nil { 357 return err 358 } 359 return n.nlHandle.LinkSetUp(iface) 360 } 361 362 func (n *networkNamespace) GetLoopbackIfaceName() string { 363 return "lo" 364 } 365 366 func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error { 367 iface, err := n.nlHandle.LinkByName(ifName) 368 if err != nil { 369 return err 370 } 371 return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip}) 372 } 373 374 func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error { 375 iface, err := n.nlHandle.LinkByName(ifName) 376 if err != nil { 377 return err 378 } 379 return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip}) 380 } 381 382 func (n *networkNamespace) DisableARPForVIP(srcName string) (Err error) { 383 dstName := "" 384 for _, i := range n.Interfaces() { 385 if i.SrcName() == srcName { 386 dstName = i.DstName() 387 break 388 } 389 } 390 if dstName == "" { 391 return fmt.Errorf("failed to find interface %s in sandbox", srcName) 392 } 393 394 err := n.InvokeFunc(func() { 395 path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore") 396 if err := ioutil.WriteFile(path, []byte{'1', '\n'}, 0644); err != nil { 397 Err = fmt.Errorf("Failed to set %s to 1: %v", path, err) 398 return 399 } 400 path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce") 401 if err := ioutil.WriteFile(path, []byte{'2', '\n'}, 0644); err != nil { 402 Err = fmt.Errorf("Failed to set %s to 2: %v", path, err) 403 return 404 } 405 }) 406 if err != nil { 407 return err 408 } 409 return 410 } 411 412 func (n *networkNamespace) InvokeFunc(f func()) error { 413 return nsInvoke(n.nsPath(), func(nsFD int) error { return nil }, func(callerFD int) error { 414 f() 415 return nil 416 }) 417 } 418 419 // InitOSContext initializes OS context while configuring network resources 420 func InitOSContext() func() { 421 runtime.LockOSThread() 422 if err := ns.SetNamespace(); err != nil { 423 logrus.Error(err) 424 } 425 return runtime.UnlockOSThread 426 } 427 428 func nsInvoke(path string, prefunc func(nsFD int) error, postfunc func(callerFD int) error) error { 429 defer InitOSContext()() 430 431 newNs, err := netns.GetFromPath(path) 432 if err != nil { 433 return fmt.Errorf("failed get network namespace %q: %v", path, err) 434 } 435 defer newNs.Close() 436 437 // Invoked before the namespace switch happens but after the namespace file 438 // handle is obtained. 439 if err := prefunc(int(newNs)); err != nil { 440 return fmt.Errorf("failed in prefunc: %v", err) 441 } 442 443 if err = netns.Set(newNs); err != nil { 444 return err 445 } 446 defer ns.SetNamespace() 447 448 // Invoked after the namespace switch. 449 return postfunc(ns.ParseHandlerInt()) 450 } 451 452 func (n *networkNamespace) nsPath() string { 453 n.Lock() 454 defer n.Unlock() 455 456 return n.path 457 } 458 459 func (n *networkNamespace) Info() Info { 460 return n 461 } 462 463 func (n *networkNamespace) Key() string { 464 return n.path 465 } 466 467 func (n *networkNamespace) Destroy() error { 468 if n.nlHandle != nil { 469 n.nlHandle.Delete() 470 } 471 // Assuming no running process is executing in this network namespace, 472 // unmounting is sufficient to destroy it. 473 if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil { 474 return err 475 } 476 477 // Stash it into the garbage collection list 478 addToGarbagePaths(n.path) 479 return nil 480 } 481 482 // Restore restore the network namespace 483 func (n *networkNamespace) Restore(ifsopt map[string][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error { 484 // restore interfaces 485 for name, opts := range ifsopt { 486 if !strings.Contains(name, "+") { 487 return fmt.Errorf("wrong iface name in restore osl sandbox interface: %s", name) 488 } 489 seps := strings.Split(name, "+") 490 srcName := seps[0] 491 dstPrefix := seps[1] 492 i := &nwIface{srcName: srcName, dstName: dstPrefix, ns: n} 493 i.processInterfaceOptions(opts...) 494 if i.master != "" { 495 i.dstMaster = n.findDst(i.master, true) 496 if i.dstMaster == "" { 497 return fmt.Errorf("could not find an appropriate master %q for %q", 498 i.master, i.srcName) 499 } 500 } 501 if n.isDefault { 502 i.dstName = i.srcName 503 } else { 504 links, err := n.nlHandle.LinkList() 505 if err != nil { 506 return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path) 507 } 508 // due to the docker network connect/disconnect, so the dstName should 509 // restore from the namespace 510 for _, link := range links { 511 addrs, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4) 512 if err != nil { 513 return err 514 } 515 ifaceName := link.Attrs().Name 516 if strings.HasPrefix(ifaceName, "vxlan") { 517 if i.dstName == "vxlan" { 518 i.dstName = ifaceName 519 break 520 } 521 } 522 // find the interface name by ip 523 if i.address != nil { 524 for _, addr := range addrs { 525 if addr.IPNet.String() == i.address.String() { 526 i.dstName = ifaceName 527 break 528 } 529 continue 530 } 531 if i.dstName == ifaceName { 532 break 533 } 534 } 535 // This is to find the interface name of the pair in overlay sandbox 536 if strings.HasPrefix(ifaceName, "veth") { 537 if i.master != "" && i.dstName == "veth" { 538 i.dstName = ifaceName 539 } 540 } 541 } 542 543 var index int 544 indexStr := strings.TrimPrefix(i.dstName, dstPrefix) 545 if indexStr != "" { 546 index, err = strconv.Atoi(indexStr) 547 if err != nil { 548 return err 549 } 550 } 551 index++ 552 n.Lock() 553 if index > n.nextIfIndex[dstPrefix] { 554 n.nextIfIndex[dstPrefix] = index 555 } 556 n.iFaces = append(n.iFaces, i) 557 n.Unlock() 558 } 559 } 560 561 // restore routes 562 for _, r := range routes { 563 n.Lock() 564 n.staticRoutes = append(n.staticRoutes, r) 565 n.Unlock() 566 } 567 568 // restore gateway 569 if len(gw) > 0 { 570 n.Lock() 571 n.gw = gw 572 n.Unlock() 573 } 574 575 if len(gw6) > 0 { 576 n.Lock() 577 n.gwv6 = gw6 578 n.Unlock() 579 } 580 581 return nil 582 } 583 584 // Checks whether IPv6 needs to be enabled/disabled on the loopback interface 585 func (n *networkNamespace) checkLoV6() { 586 var ( 587 enable = false 588 action = "disable" 589 ) 590 591 n.Lock() 592 for _, iface := range n.iFaces { 593 if iface.AddressIPv6() != nil { 594 enable = true 595 action = "enable" 596 break 597 } 598 } 599 n.Unlock() 600 601 if n.loV6Enabled == enable { 602 return 603 } 604 605 if err := setIPv6(n.path, "lo", enable); err != nil { 606 logrus.Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err) 607 } 608 609 n.loV6Enabled = enable 610 } 611 612 func reexecSetIPv6() { 613 runtime.LockOSThread() 614 defer runtime.UnlockOSThread() 615 616 if len(os.Args) < 3 { 617 logrus.Errorf("invalid number of arguments for %s", os.Args[0]) 618 os.Exit(1) 619 } 620 621 ns, err := netns.GetFromPath(os.Args[1]) 622 if err != nil { 623 logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err) 624 os.Exit(2) 625 } 626 defer ns.Close() 627 628 if err = netns.Set(ns); err != nil { 629 logrus.Errorf("setting into container netns %q failed: %v", os.Args[1], err) 630 os.Exit(3) 631 } 632 633 var ( 634 action = "disable" 635 value = byte('1') 636 path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", os.Args[2]) 637 ) 638 639 if os.Args[3] == "true" { 640 action = "enable" 641 value = byte('0') 642 } 643 644 if _, err := os.Stat(path); err != nil { 645 if os.IsNotExist(err) { 646 logrus.Warnf("file does not exist: %s : %v Has IPv6 been disabled in this node's kernel?", path, err) 647 os.Exit(0) 648 } 649 logrus.Errorf("failed to stat %s : %v", path, err) 650 os.Exit(5) 651 } 652 653 if err = ioutil.WriteFile(path, []byte{value, '\n'}, 0644); err != nil { 654 logrus.Errorf("failed to %s IPv6 forwarding for container's interface %s: %v", action, os.Args[2], err) 655 os.Exit(4) 656 } 657 658 os.Exit(0) 659 } 660 661 func setIPv6(path, iface string, enable bool) error { 662 cmd := &exec.Cmd{ 663 Path: reexec.Self(), 664 Args: append([]string{"set-ipv6"}, path, iface, strconv.FormatBool(enable)), 665 Stdout: os.Stdout, 666 Stderr: os.Stderr, 667 } 668 if err := cmd.Run(); err != nil { 669 return fmt.Errorf("reexec to set IPv6 failed: %v", err) 670 } 671 return nil 672 } 673 674 // ApplyOSTweaks applies linux configs on the sandbox 675 func (n *networkNamespace) ApplyOSTweaks(types []SandboxType) { 676 for _, t := range types { 677 switch t { 678 case SandboxTypeLoadBalancer, SandboxTypeIngress: 679 kernel.ApplyOSTweaks(map[string]*kernel.OSValue{ 680 // disables any special handling on port reuse of existing IPVS connection table entries 681 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32 682 "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil}, 683 // expires connection from the IPVS connection table when the backend is not available 684 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133 685 "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil}, 686 // expires persistent connections to destination servers with weights set to 0 687 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151 688 "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil}, 689 }) 690 } 691 } 692 }