github.com/rawahars/moby@v24.0.4+incompatible/libnetwork/drivers/overlay/ov_network.go (about) 1 //go:build linux 2 // +build linux 3 4 package overlay 5 6 import ( 7 "errors" 8 "fmt" 9 "net" 10 "os" 11 "path/filepath" 12 "runtime" 13 "strconv" 14 "strings" 15 "sync" 16 17 "github.com/docker/docker/libnetwork/driverapi" 18 "github.com/docker/docker/libnetwork/netlabel" 19 "github.com/docker/docker/libnetwork/ns" 20 "github.com/docker/docker/libnetwork/osl" 21 "github.com/docker/docker/libnetwork/types" 22 "github.com/hashicorp/go-multierror" 23 "github.com/sirupsen/logrus" 24 "github.com/vishvananda/netlink" 25 "github.com/vishvananda/netns" 26 "golang.org/x/sys/unix" 27 ) 28 29 var ( 30 networkOnce sync.Once 31 networkMu sync.Mutex 32 vniTbl = make(map[uint32]string) 33 ) 34 35 type networkTable map[string]*network 36 37 type subnet struct { 38 sboxInit bool 39 vxlanName string 40 brName string 41 vni uint32 42 initErr error 43 subnetIP *net.IPNet 44 gwIP *net.IPNet 45 } 46 47 type network struct { 48 id string 49 sbox osl.Sandbox 50 endpoints endpointTable 51 driver *driver 52 joinCnt int 53 sboxInit bool 54 initEpoch int 55 initErr error 56 subnets []*subnet 57 secure bool 58 mtu int 59 sync.Mutex 60 } 61 62 func init() { 63 // Lock main() to the initial thread to exclude the goroutines executing 64 // func setDefaultVLAN() from being scheduled onto that thread. Changes to 65 // the network namespace of the initial thread alter /proc/self/ns/net, 66 // which would break any code which (incorrectly) assumes that that file is 67 // a handle to the network namespace for the thread it is currently 68 // executing on. 69 runtime.LockOSThread() 70 } 71 72 func (d *driver) NetworkAllocate(id string, option map[string]string, ipV4Data, ipV6Data []driverapi.IPAMData) (map[string]string, error) { 73 return nil, types.NotImplementedErrorf("not implemented") 74 } 75 76 func (d *driver) NetworkFree(id string) error { 77 return types.NotImplementedErrorf("not implemented") 78 } 79 80 func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo driverapi.NetworkInfo, ipV4Data, ipV6Data []driverapi.IPAMData) error { 81 if id == "" { 82 return fmt.Errorf("invalid network id") 83 } 84 if len(ipV4Data) == 0 || ipV4Data[0].Pool.String() == "0.0.0.0/0" { 85 return types.BadRequestErrorf("ipv4 pool is empty") 86 } 87 88 // Since we perform lazy configuration make sure we try 89 // configuring the driver when we enter CreateNetwork 90 if err := d.configure(); err != nil { 91 return err 92 } 93 94 n := &network{ 95 id: id, 96 driver: d, 97 endpoints: endpointTable{}, 98 subnets: []*subnet{}, 99 } 100 101 vnis := make([]uint32, 0, len(ipV4Data)) 102 gval, ok := option[netlabel.GenericData] 103 if !ok { 104 return fmt.Errorf("option %s is missing", netlabel.GenericData) 105 } 106 107 optMap := gval.(map[string]string) 108 vnisOpt, ok := optMap[netlabel.OverlayVxlanIDList] 109 if !ok { 110 return errors.New("no VNI provided") 111 } 112 logrus.Debugf("overlay: Received vxlan IDs: %s", vnisOpt) 113 vniStrings := strings.Split(vnisOpt, ",") 114 for _, vniStr := range vniStrings { 115 vni, err := strconv.Atoi(vniStr) 116 if err != nil { 117 return fmt.Errorf("invalid vxlan id value %q passed", vniStr) 118 } 119 120 vnis = append(vnis, uint32(vni)) 121 } 122 123 if _, ok := optMap[secureOption]; ok { 124 n.secure = true 125 } 126 if val, ok := optMap[netlabel.DriverMTU]; ok { 127 var err error 128 if n.mtu, err = strconv.Atoi(val); err != nil { 129 return fmt.Errorf("failed to parse %v: %v", val, err) 130 } 131 if n.mtu < 0 { 132 return fmt.Errorf("invalid MTU value: %v", n.mtu) 133 } 134 } 135 136 if len(vnis) == 0 { 137 return errors.New("no VNI provided") 138 } else if len(vnis) < len(ipV4Data) { 139 return fmt.Errorf("insufficient vnis(%d) passed to overlay", len(vnis)) 140 } 141 142 for i, ipd := range ipV4Data { 143 s := &subnet{ 144 subnetIP: ipd.Pool, 145 gwIP: ipd.Gateway, 146 vni: vnis[i], 147 } 148 149 n.subnets = append(n.subnets, s) 150 } 151 152 d.Lock() 153 defer d.Unlock() 154 if d.networks[n.id] != nil { 155 return fmt.Errorf("attempt to create overlay network %v that already exists", n.id) 156 } 157 158 // Make sure no rule is on the way from any stale secure network 159 if !n.secure { 160 for _, vni := range vnis { 161 programMangle(vni, false) 162 programInput(vni, false) 163 } 164 } 165 166 if nInfo != nil { 167 if err := nInfo.TableEventRegister(ovPeerTable, driverapi.EndpointObject); err != nil { 168 // XXX Undo writeToStore? No method to so. Why? 169 return err 170 } 171 } 172 173 d.networks[id] = n 174 175 return nil 176 } 177 178 func (d *driver) DeleteNetwork(nid string) error { 179 if nid == "" { 180 return fmt.Errorf("invalid network id") 181 } 182 183 // Make sure driver resources are initialized before proceeding 184 if err := d.configure(); err != nil { 185 return err 186 } 187 188 d.Lock() 189 // Only perform a peer flush operation (if required) AFTER unlocking 190 // the driver lock to avoid deadlocking w/ the peerDB. 191 var doPeerFlush bool 192 defer func() { 193 d.Unlock() 194 if doPeerFlush { 195 d.peerFlush(nid) 196 } 197 }() 198 199 // This is similar to d.network(), but we need to keep holding the lock 200 // until we are done removing this network. 201 n := d.networks[nid] 202 if n == nil { 203 return fmt.Errorf("could not find network with id %s", nid) 204 } 205 206 for _, ep := range n.endpoints { 207 if ep.ifName != "" { 208 if link, err := ns.NlHandle().LinkByName(ep.ifName); err == nil { 209 if err := ns.NlHandle().LinkDel(link); err != nil { 210 logrus.WithError(err).Warnf("Failed to delete interface (%s)'s link on endpoint (%s) delete", ep.ifName, ep.id) 211 } 212 } 213 } 214 } 215 216 doPeerFlush = true 217 delete(d.networks, nid) 218 219 if n.secure { 220 for _, s := range n.subnets { 221 if err := programMangle(s.vni, false); err != nil { 222 logrus.WithFields(logrus.Fields{ 223 logrus.ErrorKey: err, 224 "network_id": n.id, 225 "subnet": s.subnetIP, 226 }).Warn("Failed to clean up iptables rules during overlay network deletion") 227 } 228 if err := programInput(s.vni, false); err != nil { 229 logrus.WithFields(logrus.Fields{ 230 logrus.ErrorKey: err, 231 "network_id": n.id, 232 "subnet": s.subnetIP, 233 }).Warn("Failed to clean up iptables rules during overlay network deletion") 234 } 235 } 236 } 237 238 return nil 239 } 240 241 func (d *driver) ProgramExternalConnectivity(nid, eid string, options map[string]interface{}) error { 242 return nil 243 } 244 245 func (d *driver) RevokeExternalConnectivity(nid, eid string) error { 246 return nil 247 } 248 249 func (n *network) joinSandbox(s *subnet, incJoinCount bool) error { 250 // If there is a race between two go routines here only one will win 251 // the other will wait. 252 networkOnce.Do(populateVNITbl) 253 254 n.Lock() 255 // If initialization was successful then tell the peerDB to initialize the 256 // sandbox with all the peers previously received from networkdb. But only 257 // do this after unlocking the network. Otherwise we could deadlock with 258 // on the peerDB channel while peerDB is waiting for the network lock. 259 var doInitPeerDB bool 260 defer func() { 261 n.Unlock() 262 if doInitPeerDB { 263 go n.driver.initSandboxPeerDB(n.id) 264 } 265 }() 266 267 if !n.sboxInit { 268 n.initErr = n.initSandbox() 269 doInitPeerDB = n.initErr == nil 270 // If there was an error, we cannot recover it 271 n.sboxInit = true 272 } 273 274 if n.initErr != nil { 275 return fmt.Errorf("network sandbox join failed: %v", n.initErr) 276 } 277 278 subnetErr := s.initErr 279 if !s.sboxInit { 280 subnetErr = n.initSubnetSandbox(s) 281 // We can recover from these errors 282 if subnetErr == nil { 283 s.initErr = subnetErr 284 s.sboxInit = true 285 } 286 } 287 if subnetErr != nil { 288 return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), subnetErr) 289 } 290 291 if incJoinCount { 292 n.joinCnt++ 293 } 294 295 return nil 296 } 297 298 func (n *network) leaveSandbox() { 299 n.Lock() 300 defer n.Unlock() 301 n.joinCnt-- 302 if n.joinCnt != 0 { 303 return 304 } 305 306 n.destroySandbox() 307 308 n.sboxInit = false 309 n.initErr = nil 310 for _, s := range n.subnets { 311 s.sboxInit = false 312 s.initErr = nil 313 } 314 } 315 316 // to be called while holding network lock 317 func (n *network) destroySandbox() { 318 if n.sbox != nil { 319 for _, iface := range n.sbox.Info().Interfaces() { 320 if err := iface.Remove(); err != nil { 321 logrus.Debugf("Remove interface %s failed: %v", iface.SrcName(), err) 322 } 323 } 324 325 for _, s := range n.subnets { 326 if s.vxlanName != "" { 327 err := deleteInterface(s.vxlanName) 328 if err != nil { 329 logrus.Warnf("could not cleanup sandbox properly: %v", err) 330 } 331 } 332 } 333 334 n.sbox.Destroy() 335 n.sbox = nil 336 } 337 } 338 339 func populateVNITbl() { 340 filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")), 341 // NOTE(cpuguy83): The linter picked up on the fact that this walk function was not using this error argument 342 // That seems wrong... however I'm not familiar with this code or if that error matters 343 func(path string, _ os.DirEntry, _ error) error { 344 _, fname := filepath.Split(path) 345 346 if len(strings.Split(fname, "-")) <= 1 { 347 return nil 348 } 349 350 n, err := netns.GetFromPath(path) 351 if err != nil { 352 logrus.Errorf("Could not open namespace path %s during vni population: %v", path, err) 353 return nil 354 } 355 defer n.Close() 356 357 nlh, err := netlink.NewHandleAt(n, unix.NETLINK_ROUTE) 358 if err != nil { 359 logrus.Errorf("Could not open netlink handle during vni population for ns %s: %v", path, err) 360 return nil 361 } 362 defer nlh.Close() 363 364 err = nlh.SetSocketTimeout(soTimeout) 365 if err != nil { 366 logrus.Warnf("Failed to set the timeout on the netlink handle sockets for vni table population: %v", err) 367 } 368 369 links, err := nlh.LinkList() 370 if err != nil { 371 logrus.Errorf("Failed to list interfaces during vni population for ns %s: %v", path, err) 372 return nil 373 } 374 375 for _, l := range links { 376 if l.Type() == "vxlan" { 377 vniTbl[uint32(l.(*netlink.Vxlan).VxlanId)] = path 378 } 379 } 380 381 return nil 382 }) 383 } 384 385 func (n *network) generateVxlanName(s *subnet) string { 386 id := n.id 387 if len(n.id) > 5 { 388 id = n.id[:5] 389 } 390 391 return fmt.Sprintf("vx-%06x-%v", s.vni, id) 392 } 393 394 func (n *network) generateBridgeName(s *subnet) string { 395 id := n.id 396 if len(n.id) > 5 { 397 id = n.id[:5] 398 } 399 400 return n.getBridgeNamePrefix(s) + "-" + id 401 } 402 403 func (n *network) getBridgeNamePrefix(s *subnet) string { 404 return fmt.Sprintf("ov-%06x", s.vni) 405 } 406 407 func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error { 408 // Try to find this subnet's vni is being used in some 409 // other namespace by looking at vniTbl that we just 410 // populated in the once init. If a hit is found then 411 // it must a stale namespace from previous 412 // life. Destroy it completely and reclaim resourced. 413 networkMu.Lock() 414 path, ok := vniTbl[s.vni] 415 networkMu.Unlock() 416 417 if ok { 418 deleteVxlanByVNI(path, s.vni) 419 if err := unix.Unmount(path, unix.MNT_FORCE); err != nil { 420 logrus.Errorf("unmount of %s failed: %v", path, err) 421 } 422 os.Remove(path) 423 424 networkMu.Lock() 425 delete(vniTbl, s.vni) 426 networkMu.Unlock() 427 } 428 429 // create a bridge and vxlan device for this subnet and move it to the sandbox 430 sbox := n.sbox 431 432 if err := sbox.AddInterface(brName, "br", 433 sbox.InterfaceOptions().Address(s.gwIP), 434 sbox.InterfaceOptions().Bridge(true)); err != nil { 435 return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.String(), err) 436 } 437 438 err := createVxlan(vxlanName, s.vni, n.maxMTU()) 439 if err != nil { 440 return err 441 } 442 443 if err := sbox.AddInterface(vxlanName, "vxlan", 444 sbox.InterfaceOptions().Master(brName)); err != nil { 445 // If adding vxlan device to the overlay namespace fails, remove the bridge interface we 446 // already added to the namespace. This allows the caller to try the setup again. 447 for _, iface := range sbox.Info().Interfaces() { 448 if iface.SrcName() == brName { 449 if ierr := iface.Remove(); ierr != nil { 450 logrus.Errorf("removing bridge failed from ov ns %v failed, %v", n.sbox.Key(), ierr) 451 } 452 } 453 } 454 455 // Also, delete the vxlan interface. Since a global vni id is associated 456 // with the vxlan interface, an orphaned vxlan interface will result in 457 // failure of vxlan device creation if the vni is assigned to some other 458 // network. 459 if deleteErr := deleteInterface(vxlanName); deleteErr != nil { 460 logrus.Warnf("could not delete vxlan interface, %s, error %v, after config error, %v", vxlanName, deleteErr, err) 461 } 462 return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), err) 463 } 464 465 if err := setDefaultVLAN(sbox); err != nil { 466 // not a fatal error 467 logrus.WithError(err).Error("set bridge default vlan failed") 468 } 469 return nil 470 } 471 472 func setDefaultVLAN(sbox osl.Sandbox) error { 473 var brName string 474 for _, i := range sbox.Info().Interfaces() { 475 if i.Bridge() { 476 brName = i.DstName() 477 } 478 } 479 480 // IFLA_BR_VLAN_DEFAULT_PVID was added in Linux v4.4 (see torvalds/linux@0f963b7), so we can't use netlink for 481 // setting this until Docker drops support for CentOS/RHEL 7 (kernel 3.10, eol date: 2024-06-30). 482 var innerErr error 483 err := sbox.InvokeFunc(func() { 484 // Contrary to what the sysfs(5) man page says, the entries of /sys/class/net 485 // represent the networking devices visible in the network namespace of the 486 // process which mounted the sysfs filesystem, irrespective of the network 487 // namespace of the process accessing the directory. Remount sysfs in order to 488 // see the network devices in sbox's network namespace, making sure the mount 489 // doesn't propagate back. 490 // 491 // The Linux implementation of (osl.Sandbox).InvokeFunc() runs the function in a 492 // dedicated goroutine. The effects of unshare(CLONE_NEWNS) on a thread cannot 493 // be reverted so the thread needs to be terminated once the goroutine is 494 // finished. 495 runtime.LockOSThread() 496 if err := unix.Unshare(unix.CLONE_NEWNS); err != nil { 497 innerErr = os.NewSyscallError("unshare", err) 498 return 499 } 500 if err := unix.Mount("", "/", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { 501 innerErr = &os.PathError{Op: "mount", Path: "/", Err: err} 502 return 503 } 504 if err := unix.Mount("sysfs", "/sys", "sysfs", 0, ""); err != nil { 505 innerErr = &os.PathError{Op: "mount", Path: "/sys", Err: err} 506 return 507 } 508 509 path := filepath.Join("/sys/class/net", brName, "bridge/default_pvid") 510 data := []byte{'0', '\n'} 511 512 if err := os.WriteFile(path, data, 0o644); err != nil { 513 innerErr = fmt.Errorf("failed to enable default vlan on bridge %s: %w", brName, err) 514 return 515 } 516 }) 517 if err != nil { 518 return err 519 } 520 return innerErr 521 } 522 523 // Must be called with the network lock 524 func (n *network) initSubnetSandbox(s *subnet) error { 525 brName := n.generateBridgeName(s) 526 vxlanName := n.generateVxlanName(s) 527 528 // Program iptables rules for mandatory encryption of the secure 529 // network, or clean up leftover rules for a stale secure network which 530 // was previously assigned the same VNI. 531 if err := programMangle(s.vni, n.secure); err != nil { 532 return err 533 } 534 if err := programInput(s.vni, n.secure); err != nil { 535 if n.secure { 536 return multierror.Append(err, programMangle(s.vni, false)) 537 } 538 } 539 540 if err := n.setupSubnetSandbox(s, brName, vxlanName); err != nil { 541 return err 542 } 543 544 s.vxlanName = vxlanName 545 s.brName = brName 546 547 return nil 548 } 549 550 func (n *network) cleanupStaleSandboxes() { 551 filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")), 552 func(path string, _ os.DirEntry, _ error) error { 553 _, fname := filepath.Split(path) 554 555 pList := strings.Split(fname, "-") 556 if len(pList) <= 1 { 557 return nil 558 } 559 560 pattern := pList[1] 561 if strings.Contains(n.id, pattern) { 562 // Delete all vnis 563 deleteVxlanByVNI(path, 0) 564 unix.Unmount(path, unix.MNT_DETACH) 565 os.Remove(path) 566 567 // Now that we have destroyed this 568 // sandbox, remove all references to 569 // it in vniTbl so that we don't 570 // inadvertently destroy the sandbox 571 // created in this life. 572 networkMu.Lock() 573 for vni, tblPath := range vniTbl { 574 if tblPath == path { 575 delete(vniTbl, vni) 576 } 577 } 578 networkMu.Unlock() 579 } 580 581 return nil 582 }) 583 } 584 585 func (n *network) initSandbox() error { 586 n.initEpoch++ 587 588 // If there are any stale sandboxes related to this network 589 // from previous daemon life clean it up here 590 n.cleanupStaleSandboxes() 591 592 key := osl.GenerateKey(fmt.Sprintf("%d-", n.initEpoch) + n.id) 593 sbox, err := osl.NewSandbox(key, true, false) 594 if err != nil { 595 return fmt.Errorf("could not get network sandbox: %v", err) 596 } 597 598 // this is needed to let the peerAdd configure the sandbox 599 n.sbox = sbox 600 601 return nil 602 } 603 604 func (d *driver) network(nid string) *network { 605 d.Lock() 606 n := d.networks[nid] 607 d.Unlock() 608 609 return n 610 } 611 612 func (n *network) sandbox() osl.Sandbox { 613 n.Lock() 614 defer n.Unlock() 615 return n.sbox 616 } 617 618 // getSubnetforIP returns the subnet to which the given IP belongs 619 func (n *network) getSubnetforIP(ip *net.IPNet) *subnet { 620 for _, s := range n.subnets { 621 // first check if the mask lengths are the same 622 i, _ := s.subnetIP.Mask.Size() 623 j, _ := ip.Mask.Size() 624 if i != j { 625 continue 626 } 627 if s.subnetIP.Contains(ip.IP) { 628 return s 629 } 630 } 631 return nil 632 }