github.com/cilium/cilium@v1.16.2/pkg/datapath/linux/devices_controller_test.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 //go:build linux 5 6 package linux 7 8 import ( 9 "context" 10 "encoding/json" 11 "errors" 12 "log/slog" 13 "net" 14 "net/netip" 15 "os" 16 "sync/atomic" 17 "testing" 18 "time" 19 20 "github.com/cilium/hive/cell" 21 "github.com/cilium/hive/hivetest" 22 "github.com/cilium/statedb" 23 "github.com/stretchr/testify/assert" 24 "github.com/stretchr/testify/require" 25 "github.com/vishvananda/netlink" 26 "github.com/vishvananda/netlink/nl" 27 "go.uber.org/goleak" 28 "golang.org/x/sys/unix" 29 30 "github.com/cilium/cilium/pkg/datapath/tables" 31 "github.com/cilium/cilium/pkg/hive" 32 "github.com/cilium/cilium/pkg/logging" 33 "github.com/cilium/cilium/pkg/testutils" 34 "github.com/cilium/cilium/pkg/testutils/netns" 35 ) 36 37 func devicesControllerTestSetup(t *testing.T) { 38 t.Cleanup(func() { 39 goleak.VerifyNone( 40 t, 41 goleak.IgnoreCurrent(), 42 // Ignore loop() and the netlink goroutines. These are left behind as netlink library has a bug 43 // that causes it to be stuck in Recvfrom even after stop channel closes. 44 // This is fixed by https://github.com/vishvananda/netlink/pull/793, but that has not been merged. 45 // These goroutines will terminate after any route or address update. 46 goleak.IgnoreTopFunction("github.com/cilium/cilium/pkg/datapath/linux.(*devicesController).loop"), 47 goleak.IgnoreTopFunction("syscall.Syscall6"), // Recvfrom 48 ) 49 }) 50 } 51 52 const ( 53 secondaryAddress = true 54 primaryAddress = false 55 ) 56 57 func containsAddress(dev *tables.Device, addrStr string, secondary bool) bool { 58 addr := netip.MustParseAddr(addrStr) 59 for _, a := range dev.Addrs { 60 if a.Addr == addr && a.Secondary == secondary { 61 return true 62 } 63 } 64 return false 65 } 66 67 func TestDevicesController(t *testing.T) { 68 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 69 defer cancel() 70 71 testutils.PrivilegedTest(t) 72 devicesControllerTestSetup(t) 73 74 logging.SetLogLevelToDebug() 75 76 addrToString := func(addr netip.Addr) string { 77 if !addr.IsValid() { 78 return "" 79 } 80 return addr.String() 81 } 82 83 routeExists := func(routes []*tables.Route, linkIndex int, dst, src, gw string) bool { 84 for _, r := range routes { 85 // undefined IP will stringify as "invalid IP", turn them into "". 86 actualDst, actualSrc, actualGw := r.Dst.String(), addrToString(r.Src), addrToString(r.Gw) 87 if r.LinkIndex == linkIndex && actualDst == dst && actualSrc == src && 88 actualGw == gw { 89 return true 90 } 91 } 92 return false 93 } 94 95 v4Routes := func(routes []*tables.Route) (out []*tables.Route) { 96 for _, r := range routes { 97 if r.Dst.Addr().Is4() { 98 out = append(out, r) 99 } 100 } 101 return 102 } 103 104 orphanRoutes := func(devs []*tables.Device, routes []*tables.Route) bool { 105 indexes := map[int]bool{} 106 for _, dev := range devs { 107 indexes[dev.Index] = true 108 } 109 for _, r := range routes { 110 if !indexes[r.LinkIndex] { 111 // A route exists without a device. 112 t.Logf("Orphan route found: %+v", r) 113 return true 114 } 115 } 116 return false 117 } 118 119 // The test steps perform an action, wait for devices table to change 120 // and then validate the change. Since we may see intermediate states 121 // in the devices table (as there's multiple netlink updates that may 122 // be processed at different times) the check function is repeated 123 // until the desired state is reached or [ctx] times out. 124 testSteps := []struct { 125 name string 126 prepare func(*testing.T) 127 check func(*testing.T, []*tables.Device, []*tables.Route) bool 128 }{ 129 { 130 "initial", 131 func(*testing.T) {}, 132 func(t *testing.T, devs []*tables.Device, routes []*tables.Route) bool { 133 return len(devs) == 1 && 134 devs[0].Name == "dummy0" && 135 devs[0].Index > 0 && 136 devs[0].Selected && 137 routeExists(routes, devs[0].Index, "192.168.0.0/24", "192.168.0.1", "") 138 }, 139 }, 140 { 141 "add dummy1", 142 func(t *testing.T) { 143 // Create another dummy to check that the table updates. 144 require.NoError(t, createDummy("dummy1", "192.168.1.1/24", false)) 145 146 // Add a default route 147 assert.NoError(t, 148 addRoute(addRouteParams{iface: "dummy1", gw: "192.168.1.254", table: unix.RT_TABLE_MAIN})) 149 }, 150 func(t *testing.T, devs []*tables.Device, routes []*tables.Route) bool { 151 // Since we're indexing by ifindex, we expect these to be in the order 152 // they were added. 153 return len(devs) == 2 && 154 "dummy0" == devs[0].Name && 155 routeExists(routes, devs[0].Index, "192.168.0.0/24", "192.168.0.1", "") && 156 devs[0].Selected && 157 "dummy1" == devs[1].Name && 158 devs[1].Selected && 159 routeExists(routes, devs[1].Index, "192.168.1.0/24", "192.168.1.1", "") 160 }, 161 }, 162 163 { 164 "secondary address", 165 func(t *testing.T) { 166 require.NoError(t, addAddrScoped("dummy1", "192.168.1.2/24", netlink.SCOPE_SITE, unix.IFA_F_SECONDARY)) 167 }, 168 func(t *testing.T, devs []*tables.Device, routes []*tables.Route) bool { 169 // Since we're indexing by ifindex, we expect these to be in the order 170 // they were added. 171 return len(devs) == 2 && 172 "dummy1" == devs[1].Name && 173 devs[1].Selected && 174 containsAddress(devs[1], "192.168.1.1", primaryAddress) && 175 containsAddress(devs[1], "192.168.1.2", secondaryAddress) 176 }, 177 }, 178 179 { // Only consider veth devices when they have a default route. 180 "veth-without-default-gw", 181 func(t *testing.T) { 182 require.NoError(t, createVeth("veth0", "192.168.4.1/24", false)) 183 }, 184 func(t *testing.T, devs []*tables.Device, routes []*tables.Route) bool { 185 // No changes expected to previous step. 186 return len(devs) == 2 && 187 "dummy0" == devs[0].Name && 188 "dummy1" == devs[1].Name 189 }, 190 }, 191 192 { 193 "veth-with-default-gw", 194 func(t *testing.T) { 195 // Remove default route from dummy1 196 assert.NoError(t, 197 delRoute(addRouteParams{iface: "dummy1", gw: "192.168.1.254", table: unix.RT_TABLE_MAIN})) 198 199 // And add one for veth0. 200 assert.NoError(t, 201 addRoute(addRouteParams{iface: "veth0", gw: "192.168.4.254", table: unix.RT_TABLE_MAIN})) 202 }, 203 func(t *testing.T, devs []*tables.Device, routes []*tables.Route) bool { 204 return len(devs) == 3 && 205 devs[0].Name == "dummy0" && 206 devs[1].Name == "dummy1" && 207 devs[2].Name == "veth0" && 208 containsAddress(devs[2], "192.168.4.1", primaryAddress) && 209 routeExists(routes, devs[2].Index, "0.0.0.0/0", "", "192.168.4.254") 210 }, 211 }, 212 213 { 214 "check-all-v4-routes", 215 func(t *testing.T) {}, 216 func(t *testing.T, devs []*tables.Device, routes []*tables.Route) bool { 217 routes = v4Routes(routes) 218 json, _ := json.Marshal(routes) 219 os.WriteFile("/tmp/routes.json", json, 0644) 220 return routeExists(routes, devs[0].Index, "192.168.0.0/24", "192.168.0.1", "") && 221 routeExists(routes, devs[0].Index, "192.168.0.1/32", "192.168.0.1", "") && 222 routeExists(routes, devs[0].Index, "192.168.0.255/32", "192.168.0.1", "") && 223 224 routeExists(routes, devs[1].Index, "192.168.1.0/24", "192.168.1.1", "") && 225 routeExists(routes, devs[1].Index, "192.168.1.1/32", "192.168.1.1", "") && 226 routeExists(routes, devs[1].Index, "192.168.1.2/32", "192.168.1.1", "") && 227 routeExists(routes, devs[1].Index, "192.168.1.255/32", "192.168.1.1", "") && 228 229 routeExists(routes, devs[2].Index, "192.168.4.0/24", "192.168.4.1", "") && 230 routeExists(routes, devs[2].Index, "192.168.4.1/32", "192.168.4.1", "") && 231 routeExists(routes, devs[2].Index, "192.168.4.255/32", "192.168.4.1", "") && 232 routeExists(routes, devs[2].Index, "0.0.0.0/0", "", "192.168.4.254") && 233 len(routes) == 11 234 }, 235 }, 236 237 { 238 "delete-dummy0", 239 func(t *testing.T) { 240 require.NoError(t, deleteLink("dummy0")) 241 }, 242 func(t *testing.T, devs []*tables.Device, routes []*tables.Route) bool { 243 return len(devs) == 2 && 244 "dummy1" == devs[0].Name && 245 "veth0" == devs[1].Name 246 }, 247 }, 248 249 { 250 "bond-is-selected", 251 func(t *testing.T) { 252 require.NoError(t, deleteLink("veth0")) 253 require.NoError(t, createBond("bond0", "192.168.6.1/24", false)) 254 require.NoError(t, setBondMaster("dummy1", "bond0")) 255 }, 256 func(t *testing.T, devs []*tables.Device, routes []*tables.Route) bool { 257 // Slaved devices are ignored, so we should only see bond0. 258 return len(devs) == 1 && 259 devs[0].Name == "bond0" && 260 devs[0].Selected 261 }, 262 }, 263 { 264 "dummy1-restored", 265 func(t *testing.T) { 266 // Deleting the bond device restores dummy1 as a selected device 267 // as it is no longer a slave device. 268 assert.NoError(t, deleteLink("bond0")) 269 assert.NoError(t, setLinkUp("dummy1")) 270 }, 271 func(t *testing.T, devs []*tables.Device, routes []*tables.Route) bool { 272 return len(devs) == 1 && 273 devs[0].Name == "dummy1" && 274 devs[0].Selected 275 }, 276 }, 277 { 278 "skip-bridge-devices", 279 func(t *testing.T) { 280 require.NoError(t, createBridge("br0", "192.168.5.1/24", false)) 281 require.NoError(t, setMaster("dummy1", "br0")) 282 }, 283 func(t *testing.T, devs []*tables.Device, routes []*tables.Route) bool { 284 return len(devs) == 0 285 }, 286 }, 287 } 288 289 tlog := hivetest.Logger(t) 290 ns := netns.NewNetNS(t) 291 ns.Do(func() error { 292 var ( 293 db *statedb.DB 294 devicesTable statedb.Table[*tables.Device] 295 routesTable statedb.Table[*tables.Route] 296 ) 297 h := hive.New( 298 DevicesControllerCell, 299 cell.Provide(func() (*netlinkFuncs, error) { 300 // Provide the normal netlink interface, but restrict it to the test network 301 // namespace. 302 return makeNetlinkFuncs() 303 }), 304 305 cell.Invoke(func(db_ *statedb.DB, devicesTable_ statedb.Table[*tables.Device], routesTable_ statedb.Table[*tables.Route]) { 306 db = db_ 307 devicesTable = devicesTable_ 308 routesTable = routesTable_ 309 })) 310 311 // Create a dummy device before starting to exercise initialize() 312 require.NoError(t, createDummy("dummy0", "192.168.0.1/24", false)) 313 314 err := h.Start(tlog, ctx) 315 require.NoError(t, err) 316 317 for _, step := range testSteps { 318 step.prepare(t) 319 320 // Get the new set of devices 321 for { 322 txn := db.ReadTxn() 323 allDevs := statedb.Collect(devicesTable.All(txn)) 324 devs, devsInvalidated := tables.SelectedDevices(devicesTable, txn) 325 326 routesIter, routesIterInvalidated := routesTable.AllWatch(txn) 327 routes := statedb.Collect(routesIter) 328 329 // Stop if the test case passes and there are no orphan routes left in the 330 // route table. 331 if step.check(t, devs, routes) && !orphanRoutes(allDevs, routes) { 332 break 333 } 334 335 // Wait for a changes and try again. 336 select { 337 case <-routesIterInvalidated: 338 case <-devsInvalidated: 339 case <-ctx.Done(): 340 txn.WriteJSON(os.Stdout) 341 t.Fatalf("Test case %q timed out while waiting for devices", step.name) 342 } 343 } 344 345 if t.Failed() { 346 break 347 } 348 } 349 350 err = h.Stop(tlog, ctx) 351 require.NoError(t, err) 352 return nil 353 }) 354 } 355 356 // Test that if the user specifies a device wildcard, then all devices not matching the wildcard 357 // will be marked as non-selected. 358 func TestDevicesController_Wildcards(t *testing.T) { 359 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 360 defer cancel() 361 362 testutils.PrivilegedTest(t) 363 devicesControllerTestSetup(t) 364 365 tlog := hivetest.Logger(t) 366 ns := netns.NewNetNS(t) 367 ns.Do(func() error { 368 var ( 369 db *statedb.DB 370 devicesTable statedb.Table[*tables.Device] 371 ) 372 h := hive.New( 373 DevicesControllerCell, 374 cell.Provide(func() (*netlinkFuncs, error) { return makeNetlinkFuncs() }), 375 cell.Invoke(func(db_ *statedb.DB, devicesTable_ statedb.Table[*tables.Device]) { 376 db = db_ 377 devicesTable = devicesTable_ 378 })) 379 hive.AddConfigOverride(h, func(c *DevicesConfig) { 380 c.Devices = []string{"dummy+"} 381 }) 382 383 err := h.Start(tlog, ctx) 384 require.NoError(t, err) 385 require.NoError(t, createDummy("dummy0", "192.168.0.1/24", false)) 386 require.NoError(t, createDummy("nonviable", "192.168.1.1/24", false)) 387 388 // This device satisfies the autodetection rule, but should not be included 389 // because the ForceDeviceDetection option is not enabled 390 require.NoError(t, createDummy("eth0", "1.2.3.4/24", false)) 391 392 for { 393 rxn := db.ReadTxn() 394 devs, invalidated := tables.SelectedDevices(devicesTable, rxn) 395 396 if len(devs) == 1 && devs[0].Name == "dummy0" { 397 break 398 } 399 400 // Not yet what we expected, wait for changes and try again. 401 select { 402 case <-ctx.Done(): 403 t.Fatalf("Test timed out while waiting for devices, last seen: %v", devs) 404 case <-invalidated: 405 } 406 } 407 408 err = h.Stop(tlog, context.TODO()) 409 assert.NoError(t, err) 410 return nil 411 }) 412 } 413 414 // TestDevicesController_with_ForcedDetection tests the behavior of device detection when forced detection is enabled. 415 // It expects all devices matching a specific pattern to be detected will append to detected devices and marked as selected. 416 func TestDevicesController_with_ForcedDetection(t *testing.T) { 417 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 418 defer cancel() 419 420 testutils.PrivilegedTest(t) 421 devicesControllerTestSetup(t) 422 423 tlog := hivetest.Logger(t) 424 ns := netns.NewNetNS(t) 425 ns.Do(func() error { 426 var ( 427 db *statedb.DB 428 devicesTable statedb.Table[*tables.Device] 429 h *hive.Hive 430 ) 431 432 // Function to set up the hive and run device detection 433 runDeviceDetection := func(devicePattern string, forceDetection bool) error { 434 h = hive.New( 435 DevicesControllerCell, 436 cell.Provide(func() (*netlinkFuncs, error) { return makeNetlinkFuncs() }), 437 cell.Invoke(func(db_ *statedb.DB, devicesTable_ statedb.Table[*tables.Device]) { 438 db = db_ 439 devicesTable = devicesTable_ 440 }), 441 ) 442 hive.AddConfigOverride(h, func(c *DevicesConfig) { 443 c.Devices = []string{devicePattern} 444 c.ForceDeviceDetection = forceDetection 445 }) 446 447 return h.Start(tlog, ctx) 448 } 449 450 // Function to check the expected number of devices 451 testDevices := func(expectedCount int) bool { 452 rxn := db.ReadTxn() 453 devs, invalidated := tables.SelectedDevices(devicesTable, rxn) 454 if len(devs) == expectedCount { 455 return true 456 } 457 458 select { 459 case <-ctx.Done(): 460 t.Fatalf("Test timed out while waiting for devices, last seen: %v", devs) 461 return false 462 case <-invalidated: 463 return false 464 } 465 } 466 467 // Create dummy interfaces as per test requirements 468 require.NoError(t, createDummy("dummy0", "192.168.0.1/24", false)) 469 require.NoError(t, createDummy("dummy1", "192.168.1.1/24", false)) 470 471 // This device does not match the "dummy+" pattern, but should be included 472 // because the ForceDeviceDetection option is enabled 473 require.NoError(t, createDummy("eth0", "1.2.3.4/24", false)) 474 475 // Test with forced detection enabled 476 require.NoError(t, runDeviceDetection("dummy+", true)) 477 require.True(t, testDevices(3), "Expecting all three devices to be detected") 478 require.NoError(t, h.Stop(tlog, ctx)) 479 480 return nil 481 }) 482 } 483 484 func TestDevicesController_Restarts(t *testing.T) { 485 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 486 defer cancel() 487 488 var ( 489 db *statedb.DB 490 devicesTable statedb.Table[*tables.Device] 491 ) 492 493 // Is this the first subscription? 494 var first atomic.Bool 495 first.Store(true) 496 497 funcs := netlinkFuncs{ 498 AddrList: func(link netlink.Link, family int) ([]netlink.Addr, error) { 499 return nil, nil 500 }, 501 502 Close: func() {}, 503 504 LinkList: func() ([]netlink.Link, error) { 505 if first.Load() { 506 // On first round we create a stale device that should get flushed 507 // from the devices table. 508 return []netlink.Link{&netlink.Dummy{ 509 LinkAttrs: netlink.LinkAttrs{ 510 Index: 2, 511 Name: "stale", 512 HardwareAddr: []byte{2, 3, 4, 5, 6, 7}, 513 }, 514 }}, nil 515 } 516 return nil, nil 517 }, 518 519 RouteListFiltered: func(family int, filter *netlink.Route, filterMask uint64) ([]netlink.Route, error) { 520 return nil, nil 521 }, 522 523 RouteSubscribe: func(ch chan<- netlink.RouteUpdate, done <-chan struct{}, errorCallback func(error)) error { 524 go func() { 525 defer close(ch) 526 if !first.Load() { 527 _, ipn, _ := net.ParseCIDR("1.2.3.0/24") 528 select { 529 case <-done: 530 case ch <- netlink.RouteUpdate{ 531 Type: unix.RTM_NEWROUTE, 532 Route: netlink.Route{ 533 LinkIndex: 1, 534 Table: unix.RT_TABLE_DEFAULT, 535 Scope: unix.RT_SCOPE_SITE, 536 Dst: ipn, 537 }, 538 }: 539 } 540 } 541 <-done 542 }() 543 return nil 544 }, 545 AddrSubscribe: func(ch chan<- netlink.AddrUpdate, done <-chan struct{}, errorCallback func(error)) error { 546 go func() { 547 defer close(ch) 548 if !first.Load() { 549 _, ipn, _ := net.ParseCIDR("1.2.3.4/24") 550 select { 551 case <-done: 552 case ch <- netlink.AddrUpdate{ 553 LinkAddress: *ipn, 554 LinkIndex: 1, 555 NewAddr: true, 556 }: 557 } 558 } 559 <-done 560 }() 561 return nil 562 }, 563 LinkSubscribe: func(ch chan<- netlink.LinkUpdate, done <-chan struct{}, errorCallback func(error)) error { 564 go func() { 565 defer close(ch) 566 if first.Load() { 567 // Simulate a netlink socket failure on the first subscription round 568 errorCallback(errors.New("first")) 569 first.Store(false) 570 } else { 571 select { 572 case <-done: 573 case ch <- netlink.LinkUpdate{ 574 IfInfomsg: nl.IfInfomsg{IfInfomsg: unix.IfInfomsg{Index: 1}}, 575 Header: unix.NlMsghdr{Type: unix.RTM_NEWLINK}, 576 Link: &netlink.Dummy{ 577 LinkAttrs: netlink.LinkAttrs{ 578 Index: 1, 579 Name: "dummy", 580 HardwareAddr: []byte{1, 2, 3, 4, 5, 6}, 581 }, 582 }, 583 }: 584 } 585 } 586 <-done 587 }() 588 return nil 589 }, 590 } 591 592 tlog := hivetest.Logger(t, hivetest.LogLevel(slog.LevelDebug)) 593 h := hive.New( 594 DevicesControllerCell, 595 cell.Provide(func() *netlinkFuncs { return &funcs }), 596 cell.Invoke(func(db_ *statedb.DB, devicesTable_ statedb.Table[*tables.Device]) { 597 db = db_ 598 devicesTable = devicesTable_ 599 })) 600 601 err := h.Start(tlog, ctx) 602 assert.NoError(t, err) 603 604 for { 605 rxn := db.ReadTxn() 606 iter, invalidated := devicesTable.AllWatch(rxn) 607 devs := statedb.Collect(iter) 608 609 // We expect the 'stale' device to have been flushed by the restart 610 // and for the 'dummy' to have appeared. 611 if len(devs) == 1 && devs[0].Name == "dummy" { 612 break 613 } 614 615 select { 616 case <-ctx.Done(): 617 rxn.WriteJSON(os.Stdout) 618 t.Fatalf("Test timed out while waiting for device, last seen: %v", devs) 619 case <-invalidated: 620 } 621 } 622 623 err = h.Stop(tlog, ctx) 624 assert.NoError(t, err) 625 626 }