github.com/rish1988/moby@v25.0.2+incompatible/libnetwork/drivers/overlay/ov_network.go (about) 1 //go:build linux 2 3 package overlay 4 5 import ( 6 "context" 7 "errors" 8 "fmt" 9 "net" 10 "os" 11 "path/filepath" 12 "runtime" 13 "strconv" 14 "strings" 15 "sync" 16 17 "github.com/containerd/log" 18 "github.com/docker/docker/libnetwork/driverapi" 19 "github.com/docker/docker/libnetwork/drivers/overlay/overlayutils" 20 "github.com/docker/docker/libnetwork/netlabel" 21 "github.com/docker/docker/libnetwork/ns" 22 "github.com/docker/docker/libnetwork/osl" 23 "github.com/docker/docker/libnetwork/types" 24 "github.com/hashicorp/go-multierror" 25 "github.com/vishvananda/netlink" 26 "github.com/vishvananda/netns" 27 "golang.org/x/sys/unix" 28 ) 29 30 var ( 31 networkOnce sync.Once 32 networkMu sync.Mutex 33 vniTbl = make(map[uint32]string) 34 ) 35 36 type networkTable map[string]*network 37 38 type subnet struct { 39 sboxInit bool 40 vxlanName string 41 brName string 42 vni uint32 43 initErr error 44 subnetIP *net.IPNet 45 gwIP *net.IPNet 46 } 47 48 type network struct { 49 id string 50 sbox *osl.Namespace 51 endpoints endpointTable 52 driver *driver 53 joinCnt int 54 sboxInit bool 55 initEpoch int 56 initErr error 57 subnets []*subnet 58 secure bool 59 mtu int 60 sync.Mutex 61 } 62 63 func init() { 64 // Lock main() to the initial thread to exclude the goroutines executing 65 // func setDefaultVLAN() from being scheduled onto that thread. Changes to 66 // the network namespace of the initial thread alter /proc/self/ns/net, 67 // which would break any code which (incorrectly) assumes that that file is 68 // a handle to the network namespace for the thread it is currently 69 // executing on. 70 runtime.LockOSThread() 71 } 72 73 func (d *driver) NetworkAllocate(id string, option map[string]string, ipV4Data, ipV6Data []driverapi.IPAMData) (map[string]string, error) { 74 return nil, types.NotImplementedErrorf("not implemented") 75 } 76 77 func (d *driver) NetworkFree(id string) error { 78 return types.NotImplementedErrorf("not implemented") 79 } 80 81 func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo driverapi.NetworkInfo, ipV4Data, ipV6Data []driverapi.IPAMData) error { 82 if id == "" { 83 return fmt.Errorf("invalid network id") 84 } 85 if len(ipV4Data) == 0 || ipV4Data[0].Pool.String() == "0.0.0.0/0" { 86 return types.InvalidParameterErrorf("ipv4 pool is empty") 87 } 88 89 // Since we perform lazy configuration make sure we try 90 // configuring the driver when we enter CreateNetwork 91 if err := d.configure(); err != nil { 92 return err 93 } 94 95 n := &network{ 96 id: id, 97 driver: d, 98 endpoints: endpointTable{}, 99 subnets: []*subnet{}, 100 } 101 102 vnis := make([]uint32, 0, len(ipV4Data)) 103 gval, ok := option[netlabel.GenericData] 104 if !ok { 105 return fmt.Errorf("option %s is missing", netlabel.GenericData) 106 } 107 108 optMap := gval.(map[string]string) 109 vnisOpt, ok := optMap[netlabel.OverlayVxlanIDList] 110 if !ok { 111 return errors.New("no VNI provided") 112 } 113 log.G(context.TODO()).Debugf("overlay: Received vxlan IDs: %s", vnisOpt) 114 var err error 115 vnis, err = overlayutils.AppendVNIList(vnis, vnisOpt) 116 if err != nil { 117 return err 118 } 119 120 if _, ok := optMap[secureOption]; ok { 121 n.secure = true 122 } 123 if val, ok := optMap[netlabel.DriverMTU]; ok { 124 var err error 125 if n.mtu, err = strconv.Atoi(val); err != nil { 126 return fmt.Errorf("failed to parse %v: %v", val, err) 127 } 128 if n.mtu < 0 { 129 return fmt.Errorf("invalid MTU value: %v", n.mtu) 130 } 131 } 132 133 if len(vnis) == 0 { 134 return errors.New("no VNI provided") 135 } else if len(vnis) < len(ipV4Data) { 136 return fmt.Errorf("insufficient vnis(%d) passed to overlay", len(vnis)) 137 } 138 139 for i, ipd := range ipV4Data { 140 s := &subnet{ 141 subnetIP: ipd.Pool, 142 gwIP: ipd.Gateway, 143 vni: vnis[i], 144 } 145 146 n.subnets = append(n.subnets, s) 147 } 148 149 d.Lock() 150 defer d.Unlock() 151 if d.networks[n.id] != nil { 152 return fmt.Errorf("attempt to create overlay network %v that already exists", n.id) 153 } 154 155 // Make sure no rule is on the way from any stale secure network 156 if !n.secure { 157 for _, vni := range vnis { 158 d.programMangle(vni, false) 159 d.programInput(vni, false) 160 } 161 } 162 163 if nInfo != nil { 164 if err := nInfo.TableEventRegister(ovPeerTable, driverapi.EndpointObject); err != nil { 165 // XXX Undo writeToStore? No method to so. Why? 166 return err 167 } 168 } 169 170 d.networks[id] = n 171 172 return nil 173 } 174 175 func (d *driver) DeleteNetwork(nid string) error { 176 if nid == "" { 177 return fmt.Errorf("invalid network id") 178 } 179 180 // Make sure driver resources are initialized before proceeding 181 if err := d.configure(); err != nil { 182 return err 183 } 184 185 d.Lock() 186 // Only perform a peer flush operation (if required) AFTER unlocking 187 // the driver lock to avoid deadlocking w/ the peerDB. 188 var doPeerFlush bool 189 defer func() { 190 d.Unlock() 191 if doPeerFlush { 192 d.peerFlush(nid) 193 } 194 }() 195 196 // This is similar to d.network(), but we need to keep holding the lock 197 // until we are done removing this network. 198 n := d.networks[nid] 199 if n == nil { 200 return fmt.Errorf("could not find network with id %s", nid) 201 } 202 203 for _, ep := range n.endpoints { 204 if ep.ifName != "" { 205 if link, err := ns.NlHandle().LinkByName(ep.ifName); err == nil { 206 if err := ns.NlHandle().LinkDel(link); err != nil { 207 log.G(context.TODO()).WithError(err).Warnf("Failed to delete interface (%s)'s link on endpoint (%s) delete", ep.ifName, ep.id) 208 } 209 } 210 } 211 } 212 213 doPeerFlush = true 214 delete(d.networks, nid) 215 216 if n.secure { 217 for _, s := range n.subnets { 218 if err := d.programMangle(s.vni, false); err != nil { 219 log.G(context.TODO()).WithFields(log.Fields{ 220 "error": err, 221 "network_id": n.id, 222 "subnet": s.subnetIP, 223 }).Warn("Failed to clean up iptables rules during overlay network deletion") 224 } 225 if err := d.programInput(s.vni, false); err != nil { 226 log.G(context.TODO()).WithFields(log.Fields{ 227 "error": err, 228 "network_id": n.id, 229 "subnet": s.subnetIP, 230 }).Warn("Failed to clean up iptables rules during overlay network deletion") 231 } 232 } 233 } 234 235 return nil 236 } 237 238 func (d *driver) ProgramExternalConnectivity(nid, eid string, options map[string]interface{}) error { 239 return nil 240 } 241 242 func (d *driver) RevokeExternalConnectivity(nid, eid string) error { 243 return nil 244 } 245 246 func (n *network) joinSandbox(s *subnet, incJoinCount bool) error { 247 // If there is a race between two go routines here only one will win 248 // the other will wait. 249 networkOnce.Do(populateVNITbl) 250 251 n.Lock() 252 // If initialization was successful then tell the peerDB to initialize the 253 // sandbox with all the peers previously received from networkdb. But only 254 // do this after unlocking the network. Otherwise we could deadlock with 255 // on the peerDB channel while peerDB is waiting for the network lock. 256 var doInitPeerDB bool 257 defer func() { 258 n.Unlock() 259 if doInitPeerDB { 260 go n.driver.initSandboxPeerDB(n.id) 261 } 262 }() 263 264 if !n.sboxInit { 265 n.initErr = n.initSandbox() 266 doInitPeerDB = n.initErr == nil 267 // If there was an error, we cannot recover it 268 n.sboxInit = true 269 } 270 271 if n.initErr != nil { 272 return fmt.Errorf("network sandbox join failed: %v", n.initErr) 273 } 274 275 subnetErr := s.initErr 276 if !s.sboxInit { 277 subnetErr = n.initSubnetSandbox(s) 278 // We can recover from these errors 279 if subnetErr == nil { 280 s.initErr = subnetErr 281 s.sboxInit = true 282 } 283 } 284 if subnetErr != nil { 285 return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), subnetErr) 286 } 287 288 if incJoinCount { 289 n.joinCnt++ 290 } 291 292 return nil 293 } 294 295 func (n *network) leaveSandbox() { 296 n.Lock() 297 defer n.Unlock() 298 n.joinCnt-- 299 if n.joinCnt != 0 { 300 return 301 } 302 303 n.destroySandbox() 304 305 n.sboxInit = false 306 n.initErr = nil 307 for _, s := range n.subnets { 308 s.sboxInit = false 309 s.initErr = nil 310 } 311 } 312 313 // to be called while holding network lock 314 func (n *network) destroySandbox() { 315 if n.sbox != nil { 316 for _, iface := range n.sbox.Interfaces() { 317 if err := iface.Remove(); err != nil { 318 log.G(context.TODO()).Debugf("Remove interface %s failed: %v", iface.SrcName(), err) 319 } 320 } 321 322 for _, s := range n.subnets { 323 if s.vxlanName != "" { 324 err := deleteInterface(s.vxlanName) 325 if err != nil { 326 log.G(context.TODO()).Warnf("could not cleanup sandbox properly: %v", err) 327 } 328 } 329 } 330 331 n.sbox.Destroy() 332 n.sbox = nil 333 } 334 } 335 336 func populateVNITbl() { 337 filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")), 338 // NOTE(cpuguy83): The linter picked up on the fact that this walk function was not using this error argument 339 // That seems wrong... however I'm not familiar with this code or if that error matters 340 func(path string, _ os.DirEntry, _ error) error { 341 _, fname := filepath.Split(path) 342 343 if len(strings.Split(fname, "-")) <= 1 { 344 return nil 345 } 346 347 n, err := netns.GetFromPath(path) 348 if err != nil { 349 log.G(context.TODO()).Errorf("Could not open namespace path %s during vni population: %v", path, err) 350 return nil 351 } 352 defer n.Close() 353 354 nlh, err := netlink.NewHandleAt(n, unix.NETLINK_ROUTE) 355 if err != nil { 356 log.G(context.TODO()).Errorf("Could not open netlink handle during vni population for ns %s: %v", path, err) 357 return nil 358 } 359 defer nlh.Close() 360 361 err = nlh.SetSocketTimeout(soTimeout) 362 if err != nil { 363 log.G(context.TODO()).Warnf("Failed to set the timeout on the netlink handle sockets for vni table population: %v", err) 364 } 365 366 links, err := nlh.LinkList() 367 if err != nil { 368 log.G(context.TODO()).Errorf("Failed to list interfaces during vni population for ns %s: %v", path, err) 369 return nil 370 } 371 372 for _, l := range links { 373 if l.Type() == "vxlan" { 374 vniTbl[uint32(l.(*netlink.Vxlan).VxlanId)] = path 375 } 376 } 377 378 return nil 379 }) 380 } 381 382 func (n *network) generateVxlanName(s *subnet) string { 383 id := n.id 384 if len(n.id) > 5 { 385 id = n.id[:5] 386 } 387 388 return fmt.Sprintf("vx-%06x-%v", s.vni, id) 389 } 390 391 func (n *network) generateBridgeName(s *subnet) string { 392 id := n.id 393 if len(n.id) > 5 { 394 id = n.id[:5] 395 } 396 397 return n.getBridgeNamePrefix(s) + "-" + id 398 } 399 400 func (n *network) getBridgeNamePrefix(s *subnet) string { 401 return fmt.Sprintf("ov-%06x", s.vni) 402 } 403 404 func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error { 405 // Try to find this subnet's vni is being used in some 406 // other namespace by looking at vniTbl that we just 407 // populated in the once init. If a hit is found then 408 // it must a stale namespace from previous 409 // life. Destroy it completely and reclaim resourced. 410 networkMu.Lock() 411 path, ok := vniTbl[s.vni] 412 networkMu.Unlock() 413 414 if ok { 415 deleteVxlanByVNI(path, s.vni) 416 if err := unix.Unmount(path, unix.MNT_FORCE); err != nil { 417 log.G(context.TODO()).Errorf("unmount of %s failed: %v", path, err) 418 } 419 os.Remove(path) 420 421 networkMu.Lock() 422 delete(vniTbl, s.vni) 423 networkMu.Unlock() 424 } 425 426 // create a bridge and vxlan device for this subnet and move it to the sandbox 427 sbox := n.sbox 428 429 if err := sbox.AddInterface(brName, "br", osl.WithIPv4Address(s.gwIP), osl.WithIsBridge(true)); err != nil { 430 return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.String(), err) 431 } 432 433 v6transport, err := n.driver.isIPv6Transport() 434 if err != nil { 435 log.G(context.TODO()).WithError(err).Errorf("Assuming IPv4 transport; overlay network %s will not pass traffic if the Swarm data plane is IPv6.", n.id) 436 } 437 if err := createVxlan(vxlanName, s.vni, n.maxMTU(), v6transport); err != nil { 438 return err 439 } 440 441 if err := sbox.AddInterface(vxlanName, "vxlan", osl.WithMaster(brName)); err != nil { 442 // If adding vxlan device to the overlay namespace fails, remove the bridge interface we 443 // already added to the namespace. This allows the caller to try the setup again. 444 for _, iface := range sbox.Interfaces() { 445 if iface.SrcName() == brName { 446 if ierr := iface.Remove(); ierr != nil { 447 log.G(context.TODO()).Errorf("removing bridge failed from ov ns %v failed, %v", n.sbox.Key(), ierr) 448 } 449 } 450 } 451 452 // Also, delete the vxlan interface. Since a global vni id is associated 453 // with the vxlan interface, an orphaned vxlan interface will result in 454 // failure of vxlan device creation if the vni is assigned to some other 455 // network. 456 if deleteErr := deleteInterface(vxlanName); deleteErr != nil { 457 log.G(context.TODO()).Warnf("could not delete vxlan interface, %s, error %v, after config error, %v", vxlanName, deleteErr, err) 458 } 459 return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), err) 460 } 461 462 if err := setDefaultVLAN(sbox); err != nil { 463 // not a fatal error 464 log.G(context.TODO()).WithError(err).Error("set bridge default vlan failed") 465 } 466 return nil 467 } 468 469 func setDefaultVLAN(ns *osl.Namespace) error { 470 var brName string 471 for _, i := range ns.Interfaces() { 472 if i.Bridge() { 473 brName = i.DstName() 474 } 475 } 476 477 // IFLA_BR_VLAN_DEFAULT_PVID was added in Linux v4.4 (see torvalds/linux@0f963b7), so we can't use netlink for 478 // setting this until Docker drops support for CentOS/RHEL 7 (kernel 3.10, eol date: 2024-06-30). 479 var innerErr error 480 err := ns.InvokeFunc(func() { 481 // Contrary to what the sysfs(5) man page says, the entries of /sys/class/net 482 // represent the networking devices visible in the network namespace of the 483 // process which mounted the sysfs filesystem, irrespective of the network 484 // namespace of the process accessing the directory. Remount sysfs in order to 485 // see the network devices in sbox's network namespace, making sure the mount 486 // doesn't propagate back. 487 // 488 // The Linux implementation of (osl.Sandbox).InvokeFunc() runs the function in a 489 // dedicated goroutine. The effects of unshare(CLONE_NEWNS) on a thread cannot 490 // be reverted so the thread needs to be terminated once the goroutine is 491 // finished. 492 runtime.LockOSThread() 493 if err := unix.Unshare(unix.CLONE_NEWNS); err != nil { 494 innerErr = os.NewSyscallError("unshare", err) 495 return 496 } 497 if err := unix.Mount("", "/", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { 498 innerErr = &os.PathError{Op: "mount", Path: "/", Err: err} 499 return 500 } 501 if err := unix.Mount("sysfs", "/sys", "sysfs", 0, ""); err != nil { 502 innerErr = &os.PathError{Op: "mount", Path: "/sys", Err: err} 503 return 504 } 505 506 path := filepath.Join("/sys/class/net", brName, "bridge/default_pvid") 507 data := []byte{'0', '\n'} 508 509 if err := os.WriteFile(path, data, 0o644); err != nil { 510 innerErr = fmt.Errorf("failed to enable default vlan on bridge %s: %w", brName, err) 511 return 512 } 513 }) 514 if err != nil { 515 return err 516 } 517 return innerErr 518 } 519 520 // Must be called with the network lock 521 func (n *network) initSubnetSandbox(s *subnet) error { 522 brName := n.generateBridgeName(s) 523 vxlanName := n.generateVxlanName(s) 524 525 // Program iptables rules for mandatory encryption of the secure 526 // network, or clean up leftover rules for a stale secure network which 527 // was previously assigned the same VNI. 528 if err := n.driver.programMangle(s.vni, n.secure); err != nil { 529 return err 530 } 531 if err := n.driver.programInput(s.vni, n.secure); err != nil { 532 if n.secure { 533 return multierror.Append(err, n.driver.programMangle(s.vni, false)) 534 } 535 } 536 537 if err := n.setupSubnetSandbox(s, brName, vxlanName); err != nil { 538 return err 539 } 540 541 s.vxlanName = vxlanName 542 s.brName = brName 543 544 return nil 545 } 546 547 func (n *network) cleanupStaleSandboxes() { 548 filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")), 549 func(path string, _ os.DirEntry, _ error) error { 550 _, fname := filepath.Split(path) 551 552 pList := strings.Split(fname, "-") 553 if len(pList) <= 1 { 554 return nil 555 } 556 557 pattern := pList[1] 558 if strings.Contains(n.id, pattern) { 559 // Delete all vnis 560 deleteVxlanByVNI(path, 0) 561 unix.Unmount(path, unix.MNT_DETACH) 562 os.Remove(path) 563 564 // Now that we have destroyed this 565 // sandbox, remove all references to 566 // it in vniTbl so that we don't 567 // inadvertently destroy the sandbox 568 // created in this life. 569 networkMu.Lock() 570 for vni, tblPath := range vniTbl { 571 if tblPath == path { 572 delete(vniTbl, vni) 573 } 574 } 575 networkMu.Unlock() 576 } 577 578 return nil 579 }) 580 } 581 582 func (n *network) initSandbox() error { 583 n.initEpoch++ 584 585 // If there are any stale sandboxes related to this network 586 // from previous daemon life clean it up here 587 n.cleanupStaleSandboxes() 588 589 key := osl.GenerateKey(fmt.Sprintf("%d-", n.initEpoch) + n.id) 590 sbox, err := osl.NewSandbox(key, true, false) 591 if err != nil { 592 return fmt.Errorf("could not get network sandbox: %v", err) 593 } 594 595 // this is needed to let the peerAdd configure the sandbox 596 n.sbox = sbox 597 598 return nil 599 } 600 601 func (d *driver) network(nid string) *network { 602 d.Lock() 603 n := d.networks[nid] 604 d.Unlock() 605 606 return n 607 } 608 609 func (n *network) sandbox() *osl.Namespace { 610 n.Lock() 611 defer n.Unlock() 612 return n.sbox 613 } 614 615 // getSubnetforIP returns the subnet to which the given IP belongs 616 func (n *network) getSubnetforIP(ip *net.IPNet) *subnet { 617 for _, s := range n.subnets { 618 // first check if the mask lengths are the same 619 i, _ := s.subnetIP.Mask.Size() 620 j, _ := ip.Mask.Size() 621 if i != j { 622 continue 623 } 624 if s.subnetIP.Contains(ip.IP) { 625 return s 626 } 627 } 628 return nil 629 }