github.com/docker/engine@v22.0.0-20211208180946-d456264580cf+incompatible/libnetwork/drivers/overlay/peerdb.go (about) 1 //go:build linux 2 // +build linux 3 4 package overlay 5 6 import ( 7 "context" 8 "fmt" 9 "net" 10 "sync" 11 "syscall" 12 13 "github.com/docker/docker/libnetwork/internal/caller" 14 "github.com/docker/docker/libnetwork/internal/setmatrix" 15 "github.com/docker/docker/libnetwork/osl" 16 "github.com/sirupsen/logrus" 17 ) 18 19 const ovPeerTable = "overlay_peer_table" 20 21 type peerKey struct { 22 peerIP net.IP 23 peerMac net.HardwareAddr 24 } 25 26 type peerEntry struct { 27 eid string 28 vtep net.IP 29 peerIPMask net.IPMask 30 isLocal bool 31 } 32 33 func (p *peerEntry) MarshalDB() peerEntryDB { 34 ones, bits := p.peerIPMask.Size() 35 return peerEntryDB{ 36 eid: p.eid, 37 vtep: p.vtep.String(), 38 peerIPMaskOnes: ones, 39 peerIPMaskBits: bits, 40 isLocal: p.isLocal, 41 } 42 } 43 44 // This the structure saved into the set (SetMatrix), due to the implementation of it 45 // the value inserted in the set has to be Hashable so the []byte had to be converted into 46 // strings 47 type peerEntryDB struct { 48 eid string 49 vtep string 50 peerIPMaskOnes int 51 peerIPMaskBits int 52 isLocal bool 53 } 54 55 func (p *peerEntryDB) UnMarshalDB() peerEntry { 56 return peerEntry{ 57 eid: p.eid, 58 vtep: net.ParseIP(p.vtep), 59 peerIPMask: net.CIDRMask(p.peerIPMaskOnes, p.peerIPMaskBits), 60 isLocal: p.isLocal, 61 } 62 } 63 64 type peerMap struct { 65 // set of peerEntry, note they have to be objects and not pointers to maintain the proper equality checks 66 mp setmatrix.SetMatrix 67 sync.Mutex 68 } 69 70 type peerNetworkMap struct { 71 // map with key peerKey 72 mp map[string]*peerMap 73 sync.Mutex 74 } 75 76 func (pKey peerKey) String() string { 77 return fmt.Sprintf("%s %s", pKey.peerIP, pKey.peerMac) 78 } 79 80 func (pKey *peerKey) Scan(state fmt.ScanState, verb rune) error { 81 ipB, err := state.Token(true, nil) 82 if err != nil { 83 return err 84 } 85 86 pKey.peerIP = net.ParseIP(string(ipB)) 87 88 macB, err := state.Token(true, nil) 89 if err != nil { 90 return err 91 } 92 93 pKey.peerMac, err = net.ParseMAC(string(macB)) 94 return err 95 } 96 97 func (d *driver) peerDbWalk(f func(string, *peerKey, *peerEntry) bool) error { 98 d.peerDb.Lock() 99 nids := []string{} 100 for nid := range d.peerDb.mp { 101 nids = append(nids, nid) 102 } 103 d.peerDb.Unlock() 104 105 for _, nid := range nids { 106 d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool { 107 return f(nid, pKey, pEntry) 108 }) 109 } 110 return nil 111 } 112 113 func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool) error { 114 d.peerDb.Lock() 115 pMap, ok := d.peerDb.mp[nid] 116 d.peerDb.Unlock() 117 118 if !ok { 119 return nil 120 } 121 122 mp := map[string]peerEntry{} 123 pMap.Lock() 124 for _, pKeyStr := range pMap.mp.Keys() { 125 entryDBList, ok := pMap.mp.Get(pKeyStr) 126 if ok { 127 peerEntryDB := entryDBList[0].(peerEntryDB) 128 mp[pKeyStr] = peerEntryDB.UnMarshalDB() 129 } 130 } 131 pMap.Unlock() 132 133 for pKeyStr, pEntry := range mp { 134 var pKey peerKey 135 pEntry := pEntry 136 if _, err := fmt.Sscan(pKeyStr, &pKey); err != nil { 137 logrus.Warnf("Peer key scan on network %s failed: %v", nid, err) 138 } 139 if f(&pKey, &pEntry) { 140 return nil 141 } 142 } 143 144 return nil 145 } 146 147 func (d *driver) peerDbSearch(nid string, peerIP net.IP) (*peerKey, *peerEntry, error) { 148 var pKeyMatched *peerKey 149 var pEntryMatched *peerEntry 150 err := d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool { 151 if pKey.peerIP.Equal(peerIP) { 152 pKeyMatched = pKey 153 pEntryMatched = pEntry 154 return true 155 } 156 157 return false 158 }) 159 160 if err != nil { 161 return nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err) 162 } 163 164 if pKeyMatched == nil || pEntryMatched == nil { 165 return nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP) 166 } 167 168 return pKeyMatched, pEntryMatched, nil 169 } 170 171 func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 172 peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) { 173 174 d.peerDb.Lock() 175 pMap, ok := d.peerDb.mp[nid] 176 if !ok { 177 d.peerDb.mp[nid] = &peerMap{ 178 mp: setmatrix.NewSetMatrix(), 179 } 180 181 pMap = d.peerDb.mp[nid] 182 } 183 d.peerDb.Unlock() 184 185 pKey := peerKey{ 186 peerIP: peerIP, 187 peerMac: peerMac, 188 } 189 190 pEntry := peerEntry{ 191 eid: eid, 192 vtep: vtep, 193 peerIPMask: peerIPMask, 194 isLocal: isLocal, 195 } 196 197 pMap.Lock() 198 defer pMap.Unlock() 199 b, i := pMap.mp.Insert(pKey.String(), pEntry.MarshalDB()) 200 if i != 1 { 201 // Transient case, there is more than one endpoint that is using the same IP,MAC pair 202 s, _ := pMap.mp.String(pKey.String()) 203 logrus.Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s) 204 } 205 return b, i 206 } 207 208 func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 209 peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) { 210 211 d.peerDb.Lock() 212 pMap, ok := d.peerDb.mp[nid] 213 if !ok { 214 d.peerDb.Unlock() 215 return false, 0 216 } 217 d.peerDb.Unlock() 218 219 pKey := peerKey{ 220 peerIP: peerIP, 221 peerMac: peerMac, 222 } 223 224 pEntry := peerEntry{ 225 eid: eid, 226 vtep: vtep, 227 peerIPMask: peerIPMask, 228 isLocal: isLocal, 229 } 230 231 pMap.Lock() 232 defer pMap.Unlock() 233 b, i := pMap.mp.Remove(pKey.String(), pEntry.MarshalDB()) 234 if i != 0 { 235 // Transient case, there is more than one endpoint that is using the same IP,MAC pair 236 s, _ := pMap.mp.String(pKey.String()) 237 logrus.Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s) 238 } 239 return b, i 240 } 241 242 // The overlay uses a lazy initialization approach, this means that when a network is created 243 // and the driver registered the overlay does not allocate resources till the moment that a 244 // sandbox is actually created. 245 // At the moment of this call, that happens when a sandbox is initialized, is possible that 246 // networkDB has already delivered some events of peers already available on remote nodes, 247 // these peers are saved into the peerDB and this function is used to properly configure 248 // the network sandbox with all those peers that got previously notified. 249 // Note also that this method sends a single message on the channel and the go routine on the 250 // other side, will atomically loop on the whole table of peers and will program their state 251 // in one single atomic operation. This is fundamental to guarantee consistency, and avoid that 252 // new peerAdd or peerDelete gets reordered during the sandbox init. 253 func (d *driver) initSandboxPeerDB(nid string) { 254 d.peerInit(nid) 255 } 256 257 type peerOperationType int32 258 259 const ( 260 peerOperationINIT peerOperationType = iota 261 peerOperationADD 262 peerOperationDELETE 263 peerOperationFLUSH 264 ) 265 266 type peerOperation struct { 267 opType peerOperationType 268 networkID string 269 endpointID string 270 peerIP net.IP 271 peerIPMask net.IPMask 272 peerMac net.HardwareAddr 273 vtepIP net.IP 274 l2Miss bool 275 l3Miss bool 276 localPeer bool 277 callerName string 278 } 279 280 func (d *driver) peerOpRoutine(ctx context.Context, ch chan *peerOperation) { 281 var err error 282 for { 283 select { 284 case <-ctx.Done(): 285 return 286 case op := <-ch: 287 switch op.opType { 288 case peerOperationINIT: 289 err = d.peerInitOp(op.networkID) 290 case peerOperationADD: 291 err = d.peerAddOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.l2Miss, op.l3Miss, true, op.localPeer) 292 case peerOperationDELETE: 293 err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.localPeer) 294 case peerOperationFLUSH: 295 err = d.peerFlushOp(op.networkID) 296 } 297 if err != nil { 298 logrus.Warnf("Peer operation failed:%s op:%v", err, op) 299 } 300 } 301 } 302 } 303 304 func (d *driver) peerInit(nid string) { 305 callerName := caller.Name(1) 306 d.peerOpCh <- &peerOperation{ 307 opType: peerOperationINIT, 308 networkID: nid, 309 callerName: callerName, 310 } 311 } 312 313 func (d *driver) peerInitOp(nid string) error { 314 return d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool { 315 // Local entries do not need to be added 316 if pEntry.isLocal { 317 return false 318 } 319 320 d.peerAddOp(nid, pEntry.eid, pKey.peerIP, pEntry.peerIPMask, pKey.peerMac, pEntry.vtep, false, false, false, pEntry.isLocal) 321 // return false to loop on all entries 322 return false 323 }) 324 } 325 326 func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 327 peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, localPeer bool) { 328 d.peerOpCh <- &peerOperation{ 329 opType: peerOperationADD, 330 networkID: nid, 331 endpointID: eid, 332 peerIP: peerIP, 333 peerIPMask: peerIPMask, 334 peerMac: peerMac, 335 vtepIP: vtep, 336 l2Miss: l2Miss, 337 l3Miss: l3Miss, 338 localPeer: localPeer, 339 callerName: caller.Name(1), 340 } 341 } 342 343 func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 344 peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, localPeer bool) error { 345 346 if err := validateID(nid, eid); err != nil { 347 return err 348 } 349 350 var dbEntries int 351 var inserted bool 352 if updateDB { 353 inserted, dbEntries = d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer) 354 if !inserted { 355 logrus.Warnf("Entry already present in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v", 356 nid, eid, peerIP, peerMac, localPeer, vtep) 357 } 358 } 359 360 // Local peers do not need any further configuration 361 if localPeer { 362 return nil 363 } 364 365 n := d.network(nid) 366 if n == nil { 367 return nil 368 } 369 370 sbox := n.sandbox() 371 if sbox == nil { 372 // We are hitting this case for all the events that are arriving before that the sandbox 373 // is being created. The peer got already added into the database and the sanbox init will 374 // call the peerDbUpdateSandbox that will configure all these peers from the database 375 return nil 376 } 377 378 IP := &net.IPNet{ 379 IP: peerIP, 380 Mask: peerIPMask, 381 } 382 383 s := n.getSubnetforIP(IP) 384 if s == nil { 385 return fmt.Errorf("couldn't find the subnet %q in network %q", IP.String(), n.id) 386 } 387 388 if err := n.obtainVxlanID(s); err != nil { 389 return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err) 390 } 391 392 if err := n.joinSandbox(s, false, false); err != nil { 393 return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err) 394 } 395 396 if err := d.checkEncryption(nid, vtep, n.vxlanID(s), false, true); err != nil { 397 logrus.Warn(err) 398 } 399 400 // Add neighbor entry for the peer IP 401 if err := sbox.AddNeighbor(peerIP, peerMac, l3Miss, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil { 402 if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 1 { 403 // We are in the transient case so only the first configuration is programmed into the kernel 404 // Upon deletion if the active configuration is deleted the next one from the database will be restored 405 // Note we are skipping also the next configuration 406 return nil 407 } 408 return fmt.Errorf("could not add neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) 409 } 410 411 // Add fdb entry to the bridge for the peer mac 412 if err := sbox.AddNeighbor(vtep, peerMac, l2Miss, sbox.NeighborOptions().LinkName(s.vxlanName), 413 sbox.NeighborOptions().Family(syscall.AF_BRIDGE)); err != nil { 414 return fmt.Errorf("could not add fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) 415 } 416 417 return nil 418 } 419 420 func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 421 peerMac net.HardwareAddr, vtep net.IP, localPeer bool) { 422 d.peerOpCh <- &peerOperation{ 423 opType: peerOperationDELETE, 424 networkID: nid, 425 endpointID: eid, 426 peerIP: peerIP, 427 peerIPMask: peerIPMask, 428 peerMac: peerMac, 429 vtepIP: vtep, 430 callerName: caller.Name(1), 431 localPeer: localPeer, 432 } 433 } 434 435 func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 436 peerMac net.HardwareAddr, vtep net.IP, localPeer bool) error { 437 438 if err := validateID(nid, eid); err != nil { 439 return err 440 } 441 442 deleted, dbEntries := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer) 443 if !deleted { 444 logrus.Warnf("Entry was not in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v", 445 nid, eid, peerIP, peerMac, localPeer, vtep) 446 } 447 448 n := d.network(nid) 449 if n == nil { 450 return nil 451 } 452 453 sbox := n.sandbox() 454 if sbox == nil { 455 return nil 456 } 457 458 if err := d.checkEncryption(nid, vtep, 0, localPeer, false); err != nil { 459 logrus.Warn(err) 460 } 461 462 // Local peers do not have any local configuration to delete 463 if !localPeer { 464 // Remove fdb entry to the bridge for the peer mac 465 if err := sbox.DeleteNeighbor(vtep, peerMac, true); err != nil { 466 if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 0 { 467 // We fall in here if there is a transient state and if the neighbor that is being deleted 468 // was never been configured into the kernel (we allow only 1 configuration at the time per <ip,mac> mapping) 469 return nil 470 } 471 return fmt.Errorf("could not delete fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) 472 } 473 474 // Delete neighbor entry for the peer IP 475 if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil { 476 return fmt.Errorf("could not delete neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) 477 } 478 } 479 480 if dbEntries == 0 { 481 return nil 482 } 483 484 // If there is still an entry into the database and the deletion went through without errors means that there is now no 485 // configuration active in the kernel. 486 // Restore one configuration for the <ip,mac> directly from the database, note that is guaranteed that there is one 487 peerKey, peerEntry, err := d.peerDbSearch(nid, peerIP) 488 if err != nil { 489 logrus.Errorf("peerDeleteOp unable to restore a configuration for nid:%s ip:%v mac:%v err:%s", nid, peerIP, peerMac, err) 490 return err 491 } 492 return d.peerAddOp(nid, peerEntry.eid, peerIP, peerEntry.peerIPMask, peerKey.peerMac, peerEntry.vtep, false, false, false, peerEntry.isLocal) 493 } 494 495 func (d *driver) peerFlush(nid string) { 496 d.peerOpCh <- &peerOperation{ 497 opType: peerOperationFLUSH, 498 networkID: nid, 499 callerName: caller.Name(1), 500 } 501 } 502 503 func (d *driver) peerFlushOp(nid string) error { 504 d.peerDb.Lock() 505 defer d.peerDb.Unlock() 506 _, ok := d.peerDb.mp[nid] 507 if !ok { 508 return fmt.Errorf("Unable to find the peerDB for nid:%s", nid) 509 } 510 delete(d.peerDb.mp, nid) 511 return nil 512 } 513 514 func (d *driver) pushLocalDb() { 515 d.peerDbWalk(func(nid string, pKey *peerKey, pEntry *peerEntry) bool { 516 if pEntry.isLocal { 517 d.pushLocalEndpointEvent("join", nid, pEntry.eid) 518 } 519 return false 520 }) 521 } 522 523 func (d *driver) peerDBUpdateSelf() { 524 d.peerDbWalk(func(nid string, pkey *peerKey, pEntry *peerEntry) bool { 525 if pEntry.isLocal { 526 pEntry.vtep = net.ParseIP(d.advertiseAddress) 527 } 528 return false 529 }) 530 }