github.com/Prakhar-Agarwal-byte/moby@v0.0.0-20231027092010-a14e3e8ab87e/libnetwork/osl/namespace_linux.go (about) 1 package osl 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "net" 8 "os" 9 "path/filepath" 10 "runtime" 11 "strconv" 12 "strings" 13 "sync" 14 "syscall" 15 "time" 16 17 "github.com/containerd/log" 18 "github.com/Prakhar-Agarwal-byte/moby/internal/unshare" 19 "github.com/Prakhar-Agarwal-byte/moby/libnetwork/ns" 20 "github.com/Prakhar-Agarwal-byte/moby/libnetwork/osl/kernel" 21 "github.com/Prakhar-Agarwal-byte/moby/libnetwork/types" 22 "github.com/vishvananda/netlink" 23 "github.com/vishvananda/netns" 24 "golang.org/x/sys/unix" 25 ) 26 27 const defaultPrefix = "/var/run/docker" 28 29 func init() { 30 // Lock main() to the initial thread to exclude the goroutines spawned 31 // by func (*Namespace) InvokeFunc() or func setIPv6() below from 32 // being scheduled onto that thread. Changes to the network namespace of 33 // the initial thread alter /proc/self/ns/net, which would break any 34 // code which (incorrectly) assumes that that file is the network 35 // namespace for the thread it is currently executing on. 36 runtime.LockOSThread() 37 } 38 39 var ( 40 once sync.Once 41 garbagePathMap = make(map[string]bool) 42 gpmLock sync.Mutex 43 gpmWg sync.WaitGroup 44 gpmCleanupPeriod = 60 * time.Second 45 gpmChan = make(chan chan struct{}) 46 netnsBasePath = filepath.Join(defaultPrefix, "netns") 47 ) 48 49 // SetBasePath sets the base url prefix for the ns path 50 func SetBasePath(path string) { 51 netnsBasePath = filepath.Join(path, "netns") 52 } 53 54 func basePath() string { 55 return netnsBasePath 56 } 57 58 func createBasePath() { 59 err := os.MkdirAll(basePath(), 0o755) 60 if err != nil { 61 panic("Could not create net namespace path directory") 62 } 63 64 // Start the garbage collection go routine 65 go removeUnusedPaths() 66 } 67 68 func removeUnusedPaths() { 69 gpmLock.Lock() 70 period := gpmCleanupPeriod 71 gpmLock.Unlock() 72 73 ticker := time.NewTicker(period) 74 for { 75 var ( 76 gc chan struct{} 77 gcOk bool 78 ) 79 80 select { 81 case <-ticker.C: 82 case gc, gcOk = <-gpmChan: 83 } 84 85 gpmLock.Lock() 86 pathList := make([]string, 0, len(garbagePathMap)) 87 for path := range garbagePathMap { 88 pathList = append(pathList, path) 89 } 90 garbagePathMap = make(map[string]bool) 91 gpmWg.Add(1) 92 gpmLock.Unlock() 93 94 for _, path := range pathList { 95 os.Remove(path) 96 } 97 98 gpmWg.Done() 99 if gcOk { 100 close(gc) 101 } 102 } 103 } 104 105 func addToGarbagePaths(path string) { 106 gpmLock.Lock() 107 garbagePathMap[path] = true 108 gpmLock.Unlock() 109 } 110 111 func removeFromGarbagePaths(path string) { 112 gpmLock.Lock() 113 delete(garbagePathMap, path) 114 gpmLock.Unlock() 115 } 116 117 // GC triggers garbage collection of namespace path right away 118 // and waits for it. 119 func GC() { 120 gpmLock.Lock() 121 if len(garbagePathMap) == 0 { 122 // No need for GC if map is empty 123 gpmLock.Unlock() 124 return 125 } 126 gpmLock.Unlock() 127 128 // if content exists in the garbage paths 129 // we can trigger GC to run, providing a 130 // channel to be notified on completion 131 waitGC := make(chan struct{}) 132 gpmChan <- waitGC 133 // wait for GC completion 134 <-waitGC 135 } 136 137 // GenerateKey generates a sandbox key based on the passed 138 // container id. 139 func GenerateKey(containerID string) string { 140 maxLen := 12 141 // Read sandbox key from host for overlay 142 if strings.HasPrefix(containerID, "-") { 143 var ( 144 index int 145 indexStr string 146 tmpkey string 147 ) 148 dir, err := os.ReadDir(basePath()) 149 if err != nil { 150 return "" 151 } 152 153 for _, v := range dir { 154 id := v.Name() 155 if strings.HasSuffix(id, containerID[:maxLen-1]) { 156 indexStr = strings.TrimSuffix(id, containerID[:maxLen-1]) 157 tmpindex, err := strconv.Atoi(indexStr) 158 if err != nil { 159 return "" 160 } 161 if tmpindex > index { 162 index = tmpindex 163 tmpkey = id 164 } 165 } 166 } 167 containerID = tmpkey 168 if containerID == "" { 169 return "" 170 } 171 } 172 173 if len(containerID) < maxLen { 174 maxLen = len(containerID) 175 } 176 177 return basePath() + "/" + containerID[:maxLen] 178 } 179 180 // NewSandbox provides a new Namespace instance created in an os specific way 181 // provided a key which uniquely identifies the sandbox. 182 func NewSandbox(key string, osCreate, isRestore bool) (*Namespace, error) { 183 if !isRestore { 184 err := createNetworkNamespace(key, osCreate) 185 if err != nil { 186 return nil, err 187 } 188 } else { 189 once.Do(createBasePath) 190 } 191 192 n := &Namespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)} 193 194 sboxNs, err := netns.GetFromPath(n.path) 195 if err != nil { 196 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 197 } 198 defer sboxNs.Close() 199 200 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 201 if err != nil { 202 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 203 } 204 205 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 206 if err != nil { 207 log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 208 } 209 // In live-restore mode, IPV6 entries are getting cleaned up due to below code 210 // We should retain IPV6 configurations in live-restore mode when Docker Daemon 211 // comes back. It should work as it is on other cases 212 // As starting point, disable IPv6 on all interfaces 213 if !isRestore && !n.isDefault { 214 err = setIPv6(n.path, "all", false) 215 if err != nil { 216 log.G(context.TODO()).Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 217 } 218 } 219 220 if err = n.loopbackUp(); err != nil { 221 n.nlHandle.Close() 222 return nil, err 223 } 224 225 return n, nil 226 } 227 228 func mountNetworkNamespace(basePath string, lnPath string) error { 229 return syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "") 230 } 231 232 // GetSandboxForExternalKey returns sandbox object for the supplied path 233 func GetSandboxForExternalKey(basePath string, key string) (*Namespace, error) { 234 if err := createNamespaceFile(key); err != nil { 235 return nil, err 236 } 237 238 if err := mountNetworkNamespace(basePath, key); err != nil { 239 return nil, err 240 } 241 n := &Namespace{path: key, nextIfIndex: make(map[string]int)} 242 243 sboxNs, err := netns.GetFromPath(n.path) 244 if err != nil { 245 return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err) 246 } 247 defer sboxNs.Close() 248 249 n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE) 250 if err != nil { 251 return nil, fmt.Errorf("failed to create a netlink handle: %v", err) 252 } 253 254 err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout) 255 if err != nil { 256 log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err) 257 } 258 259 // As starting point, disable IPv6 on all interfaces 260 err = setIPv6(n.path, "all", false) 261 if err != nil { 262 log.G(context.TODO()).Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err) 263 } 264 265 if err = n.loopbackUp(); err != nil { 266 n.nlHandle.Close() 267 return nil, err 268 } 269 270 return n, nil 271 } 272 273 func createNetworkNamespace(path string, osCreate bool) error { 274 if err := createNamespaceFile(path); err != nil { 275 return err 276 } 277 278 do := func() error { 279 return mountNetworkNamespace(fmt.Sprintf("/proc/self/task/%d/ns/net", unix.Gettid()), path) 280 } 281 if osCreate { 282 return unshare.Go(unix.CLONE_NEWNET, do, nil) 283 } 284 return do() 285 } 286 287 func unmountNamespaceFile(path string) { 288 if _, err := os.Stat(path); err != nil { 289 // ignore when we cannot stat the path 290 return 291 } 292 if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) { 293 log.G(context.TODO()).WithError(err).Error("Error unmounting namespace file") 294 } 295 } 296 297 func createNamespaceFile(path string) error { 298 once.Do(createBasePath) 299 // Remove it from garbage collection list if present 300 removeFromGarbagePaths(path) 301 302 // If the path is there unmount it first 303 unmountNamespaceFile(path) 304 305 // wait for garbage collection to complete if it is in progress 306 // before trying to create the file. 307 // 308 // TODO(aker): This garbage-collection was for a kernel bug in kernels 3.18-4.0.1: is this still needed on current kernels (and on kernel 3.10)? see https://github.com/moby/moby/pull/46315/commits/c0a6beba8e61d4019e1806d5241ba22007072ca2#r1331327103 309 gpmWg.Wait() 310 311 f, err := os.Create(path) 312 if err != nil { 313 return err 314 } 315 _ = f.Close() 316 return nil 317 } 318 319 // Namespace represents a network sandbox. It represents a Linux network 320 // namespace, and moves an interface into it when called on method AddInterface 321 // or sets the gateway etc. It holds a list of Interfaces, routes etc., and more 322 // can be added dynamically. 323 type Namespace struct { 324 path string 325 iFaces []*Interface 326 gw net.IP 327 gwv6 net.IP 328 staticRoutes []*types.StaticRoute 329 neighbors []*neigh 330 nextIfIndex map[string]int 331 isDefault bool 332 nlHandle *netlink.Handle 333 loV6Enabled bool 334 mu sync.Mutex 335 } 336 337 // Interfaces returns the collection of Interface previously added with the AddInterface 338 // method. Note that this doesn't include network interfaces added in any 339 // other way (such as the default loopback interface which is automatically 340 // created on creation of a sandbox). 341 func (n *Namespace) Interfaces() []*Interface { 342 ifaces := make([]*Interface, len(n.iFaces)) 343 copy(ifaces, n.iFaces) 344 return ifaces 345 } 346 347 func (n *Namespace) loopbackUp() error { 348 iface, err := n.nlHandle.LinkByName("lo") 349 if err != nil { 350 return err 351 } 352 return n.nlHandle.LinkSetUp(iface) 353 } 354 355 // GetLoopbackIfaceName returns the name of the loopback interface 356 func (n *Namespace) GetLoopbackIfaceName() string { 357 return "lo" 358 } 359 360 // AddAliasIP adds the passed IP address to the named interface 361 func (n *Namespace) AddAliasIP(ifName string, ip *net.IPNet) error { 362 iface, err := n.nlHandle.LinkByName(ifName) 363 if err != nil { 364 return err 365 } 366 return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip}) 367 } 368 369 // RemoveAliasIP removes the passed IP address from the named interface 370 func (n *Namespace) RemoveAliasIP(ifName string, ip *net.IPNet) error { 371 iface, err := n.nlHandle.LinkByName(ifName) 372 if err != nil { 373 return err 374 } 375 return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip}) 376 } 377 378 // DisableARPForVIP disables ARP replies and requests for VIP addresses 379 // on a particular interface. 380 func (n *Namespace) DisableARPForVIP(srcName string) (Err error) { 381 dstName := "" 382 for _, i := range n.Interfaces() { 383 if i.SrcName() == srcName { 384 dstName = i.DstName() 385 break 386 } 387 } 388 if dstName == "" { 389 return fmt.Errorf("failed to find interface %s in sandbox", srcName) 390 } 391 392 err := n.InvokeFunc(func() { 393 path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore") 394 if err := os.WriteFile(path, []byte{'1', '\n'}, 0o644); err != nil { 395 Err = fmt.Errorf("Failed to set %s to 1: %v", path, err) 396 return 397 } 398 path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce") 399 if err := os.WriteFile(path, []byte{'2', '\n'}, 0o644); err != nil { 400 Err = fmt.Errorf("Failed to set %s to 2: %v", path, err) 401 return 402 } 403 }) 404 if err != nil { 405 return err 406 } 407 return 408 } 409 410 // InvokeFunc invoke a function in the network namespace. 411 func (n *Namespace) InvokeFunc(f func()) error { 412 path := n.nsPath() 413 newNS, err := netns.GetFromPath(path) 414 if err != nil { 415 return fmt.Errorf("failed get network namespace %q: %w", path, err) 416 } 417 defer newNS.Close() 418 419 done := make(chan error, 1) 420 go func() { 421 runtime.LockOSThread() 422 // InvokeFunc() could have been called from a goroutine with 423 // tampered thread state, e.g. from another InvokeFunc() 424 // callback. The outer goroutine's thread state cannot be 425 // trusted. 426 origNS, err := netns.Get() 427 if err != nil { 428 runtime.UnlockOSThread() 429 done <- fmt.Errorf("failed to get original network namespace: %w", err) 430 return 431 } 432 defer origNS.Close() 433 434 if err := netns.Set(newNS); err != nil { 435 runtime.UnlockOSThread() 436 done <- err 437 return 438 } 439 defer func() { 440 close(done) 441 if err := netns.Set(origNS); err != nil { 442 log.G(context.TODO()).WithError(err).Warn("failed to restore thread's network namespace") 443 // Recover from the error by leaving this goroutine locked to 444 // the thread. The runtime will terminate the thread and replace 445 // it with a clean one when this goroutine returns. 446 } else { 447 runtime.UnlockOSThread() 448 } 449 }() 450 f() 451 }() 452 return <-done 453 } 454 455 func (n *Namespace) nsPath() string { 456 n.mu.Lock() 457 defer n.mu.Unlock() 458 459 return n.path 460 } 461 462 // Key returns the path where the network namespace is mounted. 463 func (n *Namespace) Key() string { 464 return n.path 465 } 466 467 // Destroy destroys the sandbox. 468 func (n *Namespace) Destroy() error { 469 if n.nlHandle != nil { 470 n.nlHandle.Close() 471 } 472 // Assuming no running process is executing in this network namespace, 473 // unmounting is sufficient to destroy it. 474 if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil { 475 return err 476 } 477 478 // Stash it into the garbage collection list 479 addToGarbagePaths(n.path) 480 return nil 481 } 482 483 // Restore restores the network namespace. 484 func (n *Namespace) Restore(interfaces map[Iface][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error { 485 // restore interfaces 486 for iface, opts := range interfaces { 487 i, err := newInterface(n, iface.SrcName, iface.DstPrefix, opts...) 488 if err != nil { 489 return err 490 } 491 if n.isDefault { 492 i.dstName = i.srcName 493 } else { 494 links, err := n.nlHandle.LinkList() 495 if err != nil { 496 return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path) 497 } 498 // due to the docker network connect/disconnect, so the dstName should 499 // restore from the namespace 500 for _, link := range links { 501 ifaceName := link.Attrs().Name 502 if i.dstName == "vxlan" && strings.HasPrefix(ifaceName, "vxlan") { 503 i.dstName = ifaceName 504 break 505 } 506 // find the interface name by ip 507 if i.address != nil { 508 addresses, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4) 509 if err != nil { 510 return err 511 } 512 for _, addr := range addresses { 513 if addr.IPNet.String() == i.address.String() { 514 i.dstName = ifaceName 515 break 516 } 517 } 518 if i.dstName == ifaceName { 519 break 520 } 521 } 522 // This is to find the interface name of the pair in overlay sandbox 523 if i.master != "" && i.dstName == "veth" && strings.HasPrefix(ifaceName, "veth") { 524 i.dstName = ifaceName 525 } 526 } 527 528 var index int 529 if idx := strings.TrimPrefix(i.dstName, iface.DstPrefix); idx != "" { 530 index, err = strconv.Atoi(idx) 531 if err != nil { 532 return fmt.Errorf("failed to restore interface in network namespace %q: invalid dstName for interface: %s: %v", n.path, i.dstName, err) 533 } 534 } 535 index++ 536 n.mu.Lock() 537 if index > n.nextIfIndex[iface.DstPrefix] { 538 n.nextIfIndex[iface.DstPrefix] = index 539 } 540 n.iFaces = append(n.iFaces, i) 541 n.mu.Unlock() 542 } 543 } 544 545 // restore routes and gateways 546 n.mu.Lock() 547 n.staticRoutes = append(n.staticRoutes, routes...) 548 if len(gw) > 0 { 549 n.gw = gw 550 } 551 if len(gw6) > 0 { 552 n.gwv6 = gw6 553 } 554 n.mu.Unlock() 555 return nil 556 } 557 558 // Checks whether IPv6 needs to be enabled/disabled on the loopback interface 559 func (n *Namespace) checkLoV6() { 560 var ( 561 enable = false 562 action = "disable" 563 ) 564 565 n.mu.Lock() 566 for _, iface := range n.iFaces { 567 if iface.AddressIPv6() != nil { 568 enable = true 569 action = "enable" 570 break 571 } 572 } 573 n.mu.Unlock() 574 575 if n.loV6Enabled == enable { 576 return 577 } 578 579 if err := setIPv6(n.path, "lo", enable); err != nil { 580 log.G(context.TODO()).Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err) 581 } 582 583 n.loV6Enabled = enable 584 } 585 586 // ApplyOSTweaks applies operating system specific knobs on the sandbox. 587 func (n *Namespace) ApplyOSTweaks(types []SandboxType) { 588 for _, t := range types { 589 switch t { 590 case SandboxTypeLoadBalancer, SandboxTypeIngress: 591 kernel.ApplyOSTweaks(map[string]*kernel.OSValue{ 592 // disables any special handling on port reuse of existing IPVS connection table entries 593 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32 594 "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil}, 595 // expires connection from the IPVS connection table when the backend is not available 596 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133 597 "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil}, 598 // expires persistent connections to destination servers with weights set to 0 599 // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151 600 "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil}, 601 }) 602 } 603 } 604 } 605 606 func setIPv6(nspath, iface string, enable bool) error { 607 errCh := make(chan error, 1) 608 go func() { 609 defer close(errCh) 610 611 namespace, err := netns.GetFromPath(nspath) 612 if err != nil { 613 errCh <- fmt.Errorf("failed get network namespace %q: %w", nspath, err) 614 return 615 } 616 defer namespace.Close() 617 618 runtime.LockOSThread() 619 620 origNS, err := netns.Get() 621 if err != nil { 622 runtime.UnlockOSThread() 623 errCh <- fmt.Errorf("failed to get current network namespace: %w", err) 624 return 625 } 626 defer origNS.Close() 627 628 if err = netns.Set(namespace); err != nil { 629 runtime.UnlockOSThread() 630 errCh <- fmt.Errorf("setting into container netns %q failed: %w", nspath, err) 631 return 632 } 633 defer func() { 634 if err := netns.Set(origNS); err != nil { 635 log.G(context.TODO()).WithError(err).Error("libnetwork: restoring thread network namespace failed") 636 // The error is only fatal for the current thread. Keep this 637 // goroutine locked to the thread to make the runtime replace it 638 // with a clean thread once this goroutine returns. 639 } else { 640 runtime.UnlockOSThread() 641 } 642 }() 643 644 var ( 645 action = "disable" 646 value = byte('1') 647 path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", iface) 648 ) 649 650 if enable { 651 action = "enable" 652 value = '0' 653 } 654 655 if _, err := os.Stat(path); err != nil { 656 if os.IsNotExist(err) { 657 log.G(context.TODO()).WithError(err).Warn("Cannot configure IPv6 forwarding on container interface. Has IPv6 been disabled in this node's kernel?") 658 return 659 } 660 errCh <- err 661 return 662 } 663 664 if err = os.WriteFile(path, []byte{value, '\n'}, 0o644); err != nil { 665 errCh <- fmt.Errorf("failed to %s IPv6 forwarding for container's interface %s: %w", action, iface, err) 666 return 667 } 668 }() 669 return <-errCh 670 }