github.com/Prakhar-Agarwal-byte/moby@v0.0.0-20231027092010-a14e3e8ab87e/libnetwork/drivers/overlay/ov_network.go (about) 1 //go:build linux 2 3 package overlay 4 5 import ( 6 "context" 7 "errors" 8 "fmt" 9 "net" 10 "os" 11 "path/filepath" 12 "runtime" 13 "strconv" 14 "strings" 15 "sync" 16 17 "github.com/Prakhar-Agarwal-byte/moby/libnetwork/driverapi" 18 "github.com/Prakhar-Agarwal-byte/moby/libnetwork/drivers/overlay/overlayutils" 19 "github.com/Prakhar-Agarwal-byte/moby/libnetwork/netlabel" 20 "github.com/Prakhar-Agarwal-byte/moby/libnetwork/ns" 21 "github.com/Prakhar-Agarwal-byte/moby/libnetwork/osl" 22 "github.com/Prakhar-Agarwal-byte/moby/libnetwork/types" 23 "github.com/containerd/log" 24 "github.com/vishvananda/netlink" 25 "github.com/vishvananda/netns" 26 "golang.org/x/sys/unix" 27 ) 28 29 var ( 30 networkOnce sync.Once 31 networkMu sync.Mutex 32 vniTbl = make(map[uint32]string) 33 ) 34 35 type networkTable map[string]*network 36 37 type subnet struct { 38 sboxInit bool 39 vxlanName string 40 brName string 41 vni uint32 42 initErr error 43 subnetIP *net.IPNet 44 gwIP *net.IPNet 45 } 46 47 type network struct { 48 id string 49 sbox *osl.Namespace 50 endpoints endpointTable 51 driver *driver 52 joinCnt int 53 sboxInit bool 54 initEpoch int 55 initErr error 56 subnets []*subnet 57 secure bool 58 mtu int 59 sync.Mutex 60 } 61 62 func init() { 63 // Lock main() to the initial thread to exclude the goroutines executing 64 // func setDefaultVLAN() from being scheduled onto that thread. Changes to 65 // the network namespace of the initial thread alter /proc/self/ns/net, 66 // which would break any code which (incorrectly) assumes that that file is 67 // a handle to the network namespace for the thread it is currently 68 // executing on. 69 runtime.LockOSThread() 70 } 71 72 func (d *driver) NetworkAllocate(id string, option map[string]string, ipV4Data, ipV6Data []driverapi.IPAMData) (map[string]string, error) { 73 return nil, types.NotImplementedErrorf("not implemented") 74 } 75 76 func (d *driver) NetworkFree(id string) error { 77 return types.NotImplementedErrorf("not implemented") 78 } 79 80 func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo driverapi.NetworkInfo, ipV4Data, ipV6Data []driverapi.IPAMData) error { 81 if id == "" { 82 return fmt.Errorf("invalid network id") 83 } 84 if len(ipV4Data) == 0 || ipV4Data[0].Pool.String() == "0.0.0.0/0" { 85 return types.InvalidParameterErrorf("ipv4 pool is empty") 86 } 87 88 // Since we perform lazy configuration make sure we try 89 // configuring the driver when we enter CreateNetwork 90 if err := d.configure(); err != nil { 91 return err 92 } 93 94 n := &network{ 95 id: id, 96 driver: d, 97 endpoints: endpointTable{}, 98 subnets: []*subnet{}, 99 } 100 101 vnis := make([]uint32, 0, len(ipV4Data)) 102 gval, ok := option[netlabel.GenericData] 103 if !ok { 104 return fmt.Errorf("option %s is missing", netlabel.GenericData) 105 } 106 107 optMap := gval.(map[string]string) 108 vnisOpt, ok := optMap[netlabel.OverlayVxlanIDList] 109 if !ok { 110 return errors.New("no VNI provided") 111 } 112 log.G(context.TODO()).Debugf("overlay: Received vxlan IDs: %s", vnisOpt) 113 var err error 114 vnis, err = overlayutils.AppendVNIList(vnis, vnisOpt) 115 if err != nil { 116 return err 117 } 118 119 if _, ok := optMap[secureOption]; ok { 120 n.secure = true 121 } 122 if val, ok := optMap[netlabel.DriverMTU]; ok { 123 var err error 124 if n.mtu, err = strconv.Atoi(val); err != nil { 125 return fmt.Errorf("failed to parse %v: %v", val, err) 126 } 127 if n.mtu < 0 { 128 return fmt.Errorf("invalid MTU value: %v", n.mtu) 129 } 130 } 131 132 if len(vnis) == 0 { 133 return errors.New("no VNI provided") 134 } else if len(vnis) < len(ipV4Data) { 135 return fmt.Errorf("insufficient vnis(%d) passed to overlay", len(vnis)) 136 } 137 138 for i, ipd := range ipV4Data { 139 s := &subnet{ 140 subnetIP: ipd.Pool, 141 gwIP: ipd.Gateway, 142 vni: vnis[i], 143 } 144 145 n.subnets = append(n.subnets, s) 146 } 147 148 d.Lock() 149 defer d.Unlock() 150 if d.networks[n.id] != nil { 151 return fmt.Errorf("attempt to create overlay network %v that already exists", n.id) 152 } 153 154 // Make sure no rule is on the way from any stale secure network 155 if !n.secure { 156 for _, vni := range vnis { 157 programMangle(vni, false) 158 programInput(vni, false) 159 } 160 } 161 162 if nInfo != nil { 163 if err := nInfo.TableEventRegister(ovPeerTable, driverapi.EndpointObject); err != nil { 164 // XXX Undo writeToStore? No method to so. Why? 165 return err 166 } 167 } 168 169 d.networks[id] = n 170 171 return nil 172 } 173 174 func (d *driver) DeleteNetwork(nid string) error { 175 if nid == "" { 176 return fmt.Errorf("invalid network id") 177 } 178 179 // Make sure driver resources are initialized before proceeding 180 if err := d.configure(); err != nil { 181 return err 182 } 183 184 d.Lock() 185 // Only perform a peer flush operation (if required) AFTER unlocking 186 // the driver lock to avoid deadlocking w/ the peerDB. 187 var doPeerFlush bool 188 defer func() { 189 d.Unlock() 190 if doPeerFlush { 191 d.peerFlush(nid) 192 } 193 }() 194 195 // This is similar to d.network(), but we need to keep holding the lock 196 // until we are done removing this network. 197 n := d.networks[nid] 198 if n == nil { 199 return fmt.Errorf("could not find network with id %s", nid) 200 } 201 202 for _, ep := range n.endpoints { 203 if ep.ifName != "" { 204 if link, err := ns.NlHandle().LinkByName(ep.ifName); err == nil { 205 if err := ns.NlHandle().LinkDel(link); err != nil { 206 log.G(context.TODO()).WithError(err).Warnf("Failed to delete interface (%s)'s link on endpoint (%s) delete", ep.ifName, ep.id) 207 } 208 } 209 } 210 } 211 212 doPeerFlush = true 213 delete(d.networks, nid) 214 215 if n.secure { 216 for _, s := range n.subnets { 217 if err := programMangle(s.vni, false); err != nil { 218 log.G(context.TODO()).WithFields(log.Fields{ 219 "error": err, 220 "network_id": n.id, 221 "subnet": s.subnetIP, 222 }).Warn("Failed to clean up iptables rules during overlay network deletion") 223 } 224 if err := programInput(s.vni, false); err != nil { 225 log.G(context.TODO()).WithFields(log.Fields{ 226 "error": err, 227 "network_id": n.id, 228 "subnet": s.subnetIP, 229 }).Warn("Failed to clean up iptables rules during overlay network deletion") 230 } 231 } 232 } 233 234 return nil 235 } 236 237 func (d *driver) ProgramExternalConnectivity(nid, eid string, options map[string]interface{}) error { 238 return nil 239 } 240 241 func (d *driver) RevokeExternalConnectivity(nid, eid string) error { 242 return nil 243 } 244 245 func (n *network) joinSandbox(s *subnet, incJoinCount bool) error { 246 // If there is a race between two go routines here only one will win 247 // the other will wait. 248 networkOnce.Do(populateVNITbl) 249 250 n.Lock() 251 // If initialization was successful then tell the peerDB to initialize the 252 // sandbox with all the peers previously received from networkdb. But only 253 // do this after unlocking the network. Otherwise we could deadlock with 254 // on the peerDB channel while peerDB is waiting for the network lock. 255 var doInitPeerDB bool 256 defer func() { 257 n.Unlock() 258 if doInitPeerDB { 259 go n.driver.initSandboxPeerDB(n.id) 260 } 261 }() 262 263 if !n.sboxInit { 264 n.initErr = n.initSandbox() 265 doInitPeerDB = n.initErr == nil 266 // If there was an error, we cannot recover it 267 n.sboxInit = true 268 } 269 270 if n.initErr != nil { 271 return fmt.Errorf("network sandbox join failed: %v", n.initErr) 272 } 273 274 subnetErr := s.initErr 275 if !s.sboxInit { 276 subnetErr = n.initSubnetSandbox(s) 277 // We can recover from these errors 278 if subnetErr == nil { 279 s.initErr = subnetErr 280 s.sboxInit = true 281 } 282 } 283 if subnetErr != nil { 284 return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), subnetErr) 285 } 286 287 if incJoinCount { 288 n.joinCnt++ 289 } 290 291 return nil 292 } 293 294 func (n *network) leaveSandbox() { 295 n.Lock() 296 defer n.Unlock() 297 n.joinCnt-- 298 if n.joinCnt != 0 { 299 return 300 } 301 302 n.destroySandbox() 303 304 n.sboxInit = false 305 n.initErr = nil 306 for _, s := range n.subnets { 307 s.sboxInit = false 308 s.initErr = nil 309 } 310 } 311 312 // to be called while holding network lock 313 func (n *network) destroySandbox() { 314 if n.sbox != nil { 315 for _, iface := range n.sbox.Interfaces() { 316 if err := iface.Remove(); err != nil { 317 log.G(context.TODO()).Debugf("Remove interface %s failed: %v", iface.SrcName(), err) 318 } 319 } 320 321 for _, s := range n.subnets { 322 if s.vxlanName != "" { 323 err := deleteInterface(s.vxlanName) 324 if err != nil { 325 log.G(context.TODO()).Warnf("could not cleanup sandbox properly: %v", err) 326 } 327 } 328 } 329 330 n.sbox.Destroy() 331 n.sbox = nil 332 } 333 } 334 335 func populateVNITbl() { 336 filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")), 337 // NOTE(cpuguy83): The linter picked up on the fact that this walk function was not using this error argument 338 // That seems wrong... however I'm not familiar with this code or if that error matters 339 func(path string, _ os.DirEntry, _ error) error { 340 _, fname := filepath.Split(path) 341 342 if len(strings.Split(fname, "-")) <= 1 { 343 return nil 344 } 345 346 n, err := netns.GetFromPath(path) 347 if err != nil { 348 log.G(context.TODO()).Errorf("Could not open namespace path %s during vni population: %v", path, err) 349 return nil 350 } 351 defer n.Close() 352 353 nlh, err := netlink.NewHandleAt(n, unix.NETLINK_ROUTE) 354 if err != nil { 355 log.G(context.TODO()).Errorf("Could not open netlink handle during vni population for ns %s: %v", path, err) 356 return nil 357 } 358 defer nlh.Close() 359 360 err = nlh.SetSocketTimeout(soTimeout) 361 if err != nil { 362 log.G(context.TODO()).Warnf("Failed to set the timeout on the netlink handle sockets for vni table population: %v", err) 363 } 364 365 links, err := nlh.LinkList() 366 if err != nil { 367 log.G(context.TODO()).Errorf("Failed to list interfaces during vni population for ns %s: %v", path, err) 368 return nil 369 } 370 371 for _, l := range links { 372 if l.Type() == "vxlan" { 373 vniTbl[uint32(l.(*netlink.Vxlan).VxlanId)] = path 374 } 375 } 376 377 return nil 378 }) 379 } 380 381 func (n *network) generateVxlanName(s *subnet) string { 382 id := n.id 383 if len(n.id) > 5 { 384 id = n.id[:5] 385 } 386 387 return fmt.Sprintf("vx-%06x-%v", s.vni, id) 388 } 389 390 func (n *network) generateBridgeName(s *subnet) string { 391 id := n.id 392 if len(n.id) > 5 { 393 id = n.id[:5] 394 } 395 396 return n.getBridgeNamePrefix(s) + "-" + id 397 } 398 399 func (n *network) getBridgeNamePrefix(s *subnet) string { 400 return fmt.Sprintf("ov-%06x", s.vni) 401 } 402 403 func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error { 404 // Try to find this subnet's vni is being used in some 405 // other namespace by looking at vniTbl that we just 406 // populated in the once init. If a hit is found then 407 // it must a stale namespace from previous 408 // life. Destroy it completely and reclaim resourced. 409 networkMu.Lock() 410 path, ok := vniTbl[s.vni] 411 networkMu.Unlock() 412 413 if ok { 414 deleteVxlanByVNI(path, s.vni) 415 if err := unix.Unmount(path, unix.MNT_FORCE); err != nil { 416 log.G(context.TODO()).Errorf("unmount of %s failed: %v", path, err) 417 } 418 os.Remove(path) 419 420 networkMu.Lock() 421 delete(vniTbl, s.vni) 422 networkMu.Unlock() 423 } 424 425 // create a bridge and vxlan device for this subnet and move it to the sandbox 426 sbox := n.sbox 427 428 if err := sbox.AddInterface(brName, "br", osl.WithIPv4Address(s.gwIP), osl.WithIsBridge(true)); err != nil { 429 return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.String(), err) 430 } 431 432 err := createVxlan(vxlanName, s.vni, n.maxMTU()) 433 if err != nil { 434 return err 435 } 436 437 if err := sbox.AddInterface(vxlanName, "vxlan", osl.WithMaster(brName)); err != nil { 438 // If adding vxlan device to the overlay namespace fails, remove the bridge interface we 439 // already added to the namespace. This allows the caller to try the setup again. 440 for _, iface := range sbox.Interfaces() { 441 if iface.SrcName() == brName { 442 if ierr := iface.Remove(); ierr != nil { 443 log.G(context.TODO()).Errorf("removing bridge failed from ov ns %v failed, %v", n.sbox.Key(), ierr) 444 } 445 } 446 } 447 448 // Also, delete the vxlan interface. Since a global vni id is associated 449 // with the vxlan interface, an orphaned vxlan interface will result in 450 // failure of vxlan device creation if the vni is assigned to some other 451 // network. 452 if deleteErr := deleteInterface(vxlanName); deleteErr != nil { 453 log.G(context.TODO()).Warnf("could not delete vxlan interface, %s, error %v, after config error, %v", vxlanName, deleteErr, err) 454 } 455 return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), err) 456 } 457 458 if err := setDefaultVLAN(sbox); err != nil { 459 // not a fatal error 460 log.G(context.TODO()).WithError(err).Error("set bridge default vlan failed") 461 } 462 return nil 463 } 464 465 func setDefaultVLAN(ns *osl.Namespace) error { 466 var brName string 467 for _, i := range ns.Interfaces() { 468 if i.Bridge() { 469 brName = i.DstName() 470 } 471 } 472 473 // IFLA_BR_VLAN_DEFAULT_PVID was added in Linux v4.4 (see torvalds/linux@0f963b7), so we can't use netlink for 474 // setting this until Docker drops support for CentOS/RHEL 7 (kernel 3.10, eol date: 2024-06-30). 475 var innerErr error 476 err := ns.InvokeFunc(func() { 477 // Contrary to what the sysfs(5) man page says, the entries of /sys/class/net 478 // represent the networking devices visible in the network namespace of the 479 // process which mounted the sysfs filesystem, irrespective of the network 480 // namespace of the process accessing the directory. Remount sysfs in order to 481 // see the network devices in sbox's network namespace, making sure the mount 482 // doesn't propagate back. 483 // 484 // The Linux implementation of (osl.Sandbox).InvokeFunc() runs the function in a 485 // dedicated goroutine. The effects of unshare(CLONE_NEWNS) on a thread cannot 486 // be reverted so the thread needs to be terminated once the goroutine is 487 // finished. 488 runtime.LockOSThread() 489 if err := unix.Unshare(unix.CLONE_NEWNS); err != nil { 490 innerErr = os.NewSyscallError("unshare", err) 491 return 492 } 493 if err := unix.Mount("", "/", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { 494 innerErr = &os.PathError{Op: "mount", Path: "/", Err: err} 495 return 496 } 497 if err := unix.Mount("sysfs", "/sys", "sysfs", 0, ""); err != nil { 498 innerErr = &os.PathError{Op: "mount", Path: "/sys", Err: err} 499 return 500 } 501 502 path := filepath.Join("/sys/class/net", brName, "bridge/default_pvid") 503 data := []byte{'0', '\n'} 504 505 if err := os.WriteFile(path, data, 0o644); err != nil { 506 innerErr = fmt.Errorf("failed to enable default vlan on bridge %s: %w", brName, err) 507 return 508 } 509 }) 510 if err != nil { 511 return err 512 } 513 return innerErr 514 } 515 516 // Must be called with the network lock 517 func (n *network) initSubnetSandbox(s *subnet) error { 518 brName := n.generateBridgeName(s) 519 vxlanName := n.generateVxlanName(s) 520 521 // Program iptables rules for mandatory encryption of the secure 522 // network, or clean up leftover rules for a stale secure network which 523 // was previously assigned the same VNI. 524 if err := programMangle(s.vni, n.secure); err != nil { 525 return err 526 } 527 if err := programInput(s.vni, n.secure); err != nil { 528 if n.secure { 529 return multierror.Append(err, programMangle(s.vni, false)) 530 } 531 } 532 533 if err := n.setupSubnetSandbox(s, brName, vxlanName); err != nil { 534 return err 535 } 536 537 s.vxlanName = vxlanName 538 s.brName = brName 539 540 return nil 541 } 542 543 func (n *network) cleanupStaleSandboxes() { 544 filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")), 545 func(path string, _ os.DirEntry, _ error) error { 546 _, fname := filepath.Split(path) 547 548 pList := strings.Split(fname, "-") 549 if len(pList) <= 1 { 550 return nil 551 } 552 553 pattern := pList[1] 554 if strings.Contains(n.id, pattern) { 555 // Delete all vnis 556 deleteVxlanByVNI(path, 0) 557 unix.Unmount(path, unix.MNT_DETACH) 558 os.Remove(path) 559 560 // Now that we have destroyed this 561 // sandbox, remove all references to 562 // it in vniTbl so that we don't 563 // inadvertently destroy the sandbox 564 // created in this life. 565 networkMu.Lock() 566 for vni, tblPath := range vniTbl { 567 if tblPath == path { 568 delete(vniTbl, vni) 569 } 570 } 571 networkMu.Unlock() 572 } 573 574 return nil 575 }) 576 } 577 578 func (n *network) initSandbox() error { 579 n.initEpoch++ 580 581 // If there are any stale sandboxes related to this network 582 // from previous daemon life clean it up here 583 n.cleanupStaleSandboxes() 584 585 key := osl.GenerateKey(fmt.Sprintf("%d-", n.initEpoch) + n.id) 586 sbox, err := osl.NewSandbox(key, true, false) 587 if err != nil { 588 return fmt.Errorf("could not get network sandbox: %v", err) 589 } 590 591 // this is needed to let the peerAdd configure the sandbox 592 n.sbox = sbox 593 594 return nil 595 } 596 597 func (d *driver) network(nid string) *network { 598 d.Lock() 599 n := d.networks[nid] 600 d.Unlock() 601 602 return n 603 } 604 605 func (n *network) sandbox() *osl.Namespace { 606 n.Lock() 607 defer n.Unlock() 608 return n.sbox 609 } 610 611 // getSubnetforIP returns the subnet to which the given IP belongs 612 func (n *network) getSubnetforIP(ip *net.IPNet) *subnet { 613 for _, s := range n.subnets { 614 // first check if the mask lengths are the same 615 i, _ := s.subnetIP.Mask.Size() 616 j, _ := ip.Mask.Size() 617 if i != j { 618 continue 619 } 620 if s.subnetIP.Contains(ip.IP) { 621 return s 622 } 623 } 624 return nil 625 }