github.com/jfrazelle/docker@v1.1.2-0.20210712172922-bf78e25fe508/libnetwork/drivers/overlay/peerdb.go (about) 1 // +build linux 2 3 package overlay 4 5 import ( 6 "context" 7 "fmt" 8 "net" 9 "sync" 10 "syscall" 11 12 "github.com/docker/docker/libnetwork/internal/caller" 13 "github.com/docker/docker/libnetwork/internal/setmatrix" 14 "github.com/docker/docker/libnetwork/osl" 15 "github.com/sirupsen/logrus" 16 ) 17 18 const ovPeerTable = "overlay_peer_table" 19 20 type peerKey struct { 21 peerIP net.IP 22 peerMac net.HardwareAddr 23 } 24 25 type peerEntry struct { 26 eid string 27 vtep net.IP 28 peerIPMask net.IPMask 29 isLocal bool 30 } 31 32 func (p *peerEntry) MarshalDB() peerEntryDB { 33 ones, bits := p.peerIPMask.Size() 34 return peerEntryDB{ 35 eid: p.eid, 36 vtep: p.vtep.String(), 37 peerIPMaskOnes: ones, 38 peerIPMaskBits: bits, 39 isLocal: p.isLocal, 40 } 41 } 42 43 // This the structure saved into the set (SetMatrix), due to the implementation of it 44 // the value inserted in the set has to be Hashable so the []byte had to be converted into 45 // strings 46 type peerEntryDB struct { 47 eid string 48 vtep string 49 peerIPMaskOnes int 50 peerIPMaskBits int 51 isLocal bool 52 } 53 54 func (p *peerEntryDB) UnMarshalDB() peerEntry { 55 return peerEntry{ 56 eid: p.eid, 57 vtep: net.ParseIP(p.vtep), 58 peerIPMask: net.CIDRMask(p.peerIPMaskOnes, p.peerIPMaskBits), 59 isLocal: p.isLocal, 60 } 61 } 62 63 type peerMap struct { 64 // set of peerEntry, note they have to be objects and not pointers to maintain the proper equality checks 65 mp setmatrix.SetMatrix 66 sync.Mutex 67 } 68 69 type peerNetworkMap struct { 70 // map with key peerKey 71 mp map[string]*peerMap 72 sync.Mutex 73 } 74 75 func (pKey peerKey) String() string { 76 return fmt.Sprintf("%s %s", pKey.peerIP, pKey.peerMac) 77 } 78 79 func (pKey *peerKey) Scan(state fmt.ScanState, verb rune) error { 80 ipB, err := state.Token(true, nil) 81 if err != nil { 82 return err 83 } 84 85 pKey.peerIP = net.ParseIP(string(ipB)) 86 87 macB, err := state.Token(true, nil) 88 if err != nil { 89 return err 90 } 91 92 pKey.peerMac, err = net.ParseMAC(string(macB)) 93 return err 94 } 95 96 func (d *driver) peerDbWalk(f func(string, *peerKey, *peerEntry) bool) error { 97 d.peerDb.Lock() 98 nids := []string{} 99 for nid := range d.peerDb.mp { 100 nids = append(nids, nid) 101 } 102 d.peerDb.Unlock() 103 104 for _, nid := range nids { 105 d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool { 106 return f(nid, pKey, pEntry) 107 }) 108 } 109 return nil 110 } 111 112 func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool) error { 113 d.peerDb.Lock() 114 pMap, ok := d.peerDb.mp[nid] 115 d.peerDb.Unlock() 116 117 if !ok { 118 return nil 119 } 120 121 mp := map[string]peerEntry{} 122 pMap.Lock() 123 for _, pKeyStr := range pMap.mp.Keys() { 124 entryDBList, ok := pMap.mp.Get(pKeyStr) 125 if ok { 126 peerEntryDB := entryDBList[0].(peerEntryDB) 127 mp[pKeyStr] = peerEntryDB.UnMarshalDB() 128 } 129 } 130 pMap.Unlock() 131 132 for pKeyStr, pEntry := range mp { 133 var pKey peerKey 134 pEntry := pEntry 135 if _, err := fmt.Sscan(pKeyStr, &pKey); err != nil { 136 logrus.Warnf("Peer key scan on network %s failed: %v", nid, err) 137 } 138 if f(&pKey, &pEntry) { 139 return nil 140 } 141 } 142 143 return nil 144 } 145 146 func (d *driver) peerDbSearch(nid string, peerIP net.IP) (*peerKey, *peerEntry, error) { 147 var pKeyMatched *peerKey 148 var pEntryMatched *peerEntry 149 err := d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool { 150 if pKey.peerIP.Equal(peerIP) { 151 pKeyMatched = pKey 152 pEntryMatched = pEntry 153 return true 154 } 155 156 return false 157 }) 158 159 if err != nil { 160 return nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err) 161 } 162 163 if pKeyMatched == nil || pEntryMatched == nil { 164 return nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP) 165 } 166 167 return pKeyMatched, pEntryMatched, nil 168 } 169 170 func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 171 peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) { 172 173 d.peerDb.Lock() 174 pMap, ok := d.peerDb.mp[nid] 175 if !ok { 176 d.peerDb.mp[nid] = &peerMap{ 177 mp: setmatrix.NewSetMatrix(), 178 } 179 180 pMap = d.peerDb.mp[nid] 181 } 182 d.peerDb.Unlock() 183 184 pKey := peerKey{ 185 peerIP: peerIP, 186 peerMac: peerMac, 187 } 188 189 pEntry := peerEntry{ 190 eid: eid, 191 vtep: vtep, 192 peerIPMask: peerIPMask, 193 isLocal: isLocal, 194 } 195 196 pMap.Lock() 197 defer pMap.Unlock() 198 b, i := pMap.mp.Insert(pKey.String(), pEntry.MarshalDB()) 199 if i != 1 { 200 // Transient case, there is more than one endpoint that is using the same IP,MAC pair 201 s, _ := pMap.mp.String(pKey.String()) 202 logrus.Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s) 203 } 204 return b, i 205 } 206 207 func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 208 peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) { 209 210 d.peerDb.Lock() 211 pMap, ok := d.peerDb.mp[nid] 212 if !ok { 213 d.peerDb.Unlock() 214 return false, 0 215 } 216 d.peerDb.Unlock() 217 218 pKey := peerKey{ 219 peerIP: peerIP, 220 peerMac: peerMac, 221 } 222 223 pEntry := peerEntry{ 224 eid: eid, 225 vtep: vtep, 226 peerIPMask: peerIPMask, 227 isLocal: isLocal, 228 } 229 230 pMap.Lock() 231 defer pMap.Unlock() 232 b, i := pMap.mp.Remove(pKey.String(), pEntry.MarshalDB()) 233 if i != 0 { 234 // Transient case, there is more than one endpoint that is using the same IP,MAC pair 235 s, _ := pMap.mp.String(pKey.String()) 236 logrus.Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s) 237 } 238 return b, i 239 } 240 241 // The overlay uses a lazy initialization approach, this means that when a network is created 242 // and the driver registered the overlay does not allocate resources till the moment that a 243 // sandbox is actually created. 244 // At the moment of this call, that happens when a sandbox is initialized, is possible that 245 // networkDB has already delivered some events of peers already available on remote nodes, 246 // these peers are saved into the peerDB and this function is used to properly configure 247 // the network sandbox with all those peers that got previously notified. 248 // Note also that this method sends a single message on the channel and the go routine on the 249 // other side, will atomically loop on the whole table of peers and will program their state 250 // in one single atomic operation. This is fundamental to guarantee consistency, and avoid that 251 // new peerAdd or peerDelete gets reordered during the sandbox init. 252 func (d *driver) initSandboxPeerDB(nid string) { 253 d.peerInit(nid) 254 } 255 256 type peerOperationType int32 257 258 const ( 259 peerOperationINIT peerOperationType = iota 260 peerOperationADD 261 peerOperationDELETE 262 peerOperationFLUSH 263 ) 264 265 type peerOperation struct { 266 opType peerOperationType 267 networkID string 268 endpointID string 269 peerIP net.IP 270 peerIPMask net.IPMask 271 peerMac net.HardwareAddr 272 vtepIP net.IP 273 l2Miss bool 274 l3Miss bool 275 localPeer bool 276 callerName string 277 } 278 279 func (d *driver) peerOpRoutine(ctx context.Context, ch chan *peerOperation) { 280 var err error 281 for { 282 select { 283 case <-ctx.Done(): 284 return 285 case op := <-ch: 286 switch op.opType { 287 case peerOperationINIT: 288 err = d.peerInitOp(op.networkID) 289 case peerOperationADD: 290 err = d.peerAddOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.l2Miss, op.l3Miss, true, op.localPeer) 291 case peerOperationDELETE: 292 err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.localPeer) 293 case peerOperationFLUSH: 294 err = d.peerFlushOp(op.networkID) 295 } 296 if err != nil { 297 logrus.Warnf("Peer operation failed:%s op:%v", err, op) 298 } 299 } 300 } 301 } 302 303 func (d *driver) peerInit(nid string) { 304 callerName := caller.Name(1) 305 d.peerOpCh <- &peerOperation{ 306 opType: peerOperationINIT, 307 networkID: nid, 308 callerName: callerName, 309 } 310 } 311 312 func (d *driver) peerInitOp(nid string) error { 313 return d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool { 314 // Local entries do not need to be added 315 if pEntry.isLocal { 316 return false 317 } 318 319 d.peerAddOp(nid, pEntry.eid, pKey.peerIP, pEntry.peerIPMask, pKey.peerMac, pEntry.vtep, false, false, false, pEntry.isLocal) 320 // return false to loop on all entries 321 return false 322 }) 323 } 324 325 func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 326 peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, localPeer bool) { 327 d.peerOpCh <- &peerOperation{ 328 opType: peerOperationADD, 329 networkID: nid, 330 endpointID: eid, 331 peerIP: peerIP, 332 peerIPMask: peerIPMask, 333 peerMac: peerMac, 334 vtepIP: vtep, 335 l2Miss: l2Miss, 336 l3Miss: l3Miss, 337 localPeer: localPeer, 338 callerName: caller.Name(1), 339 } 340 } 341 342 func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 343 peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, localPeer bool) error { 344 345 if err := validateID(nid, eid); err != nil { 346 return err 347 } 348 349 var dbEntries int 350 var inserted bool 351 if updateDB { 352 inserted, dbEntries = d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer) 353 if !inserted { 354 logrus.Warnf("Entry already present in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v", 355 nid, eid, peerIP, peerMac, localPeer, vtep) 356 } 357 } 358 359 // Local peers do not need any further configuration 360 if localPeer { 361 return nil 362 } 363 364 n := d.network(nid) 365 if n == nil { 366 return nil 367 } 368 369 sbox := n.sandbox() 370 if sbox == nil { 371 // We are hitting this case for all the events that are arriving before that the sandbox 372 // is being created. The peer got already added into the database and the sanbox init will 373 // call the peerDbUpdateSandbox that will configure all these peers from the database 374 return nil 375 } 376 377 IP := &net.IPNet{ 378 IP: peerIP, 379 Mask: peerIPMask, 380 } 381 382 s := n.getSubnetforIP(IP) 383 if s == nil { 384 return fmt.Errorf("couldn't find the subnet %q in network %q", IP.String(), n.id) 385 } 386 387 if err := n.obtainVxlanID(s); err != nil { 388 return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err) 389 } 390 391 if err := n.joinSandbox(s, false, false); err != nil { 392 return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err) 393 } 394 395 if err := d.checkEncryption(nid, vtep, n.vxlanID(s), false, true); err != nil { 396 logrus.Warn(err) 397 } 398 399 // Add neighbor entry for the peer IP 400 if err := sbox.AddNeighbor(peerIP, peerMac, l3Miss, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil { 401 if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 1 { 402 // We are in the transient case so only the first configuration is programmed into the kernel 403 // Upon deletion if the active configuration is deleted the next one from the database will be restored 404 // Note we are skipping also the next configuration 405 return nil 406 } 407 return fmt.Errorf("could not add neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) 408 } 409 410 // Add fdb entry to the bridge for the peer mac 411 if err := sbox.AddNeighbor(vtep, peerMac, l2Miss, sbox.NeighborOptions().LinkName(s.vxlanName), 412 sbox.NeighborOptions().Family(syscall.AF_BRIDGE)); err != nil { 413 return fmt.Errorf("could not add fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) 414 } 415 416 return nil 417 } 418 419 func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 420 peerMac net.HardwareAddr, vtep net.IP, localPeer bool) { 421 d.peerOpCh <- &peerOperation{ 422 opType: peerOperationDELETE, 423 networkID: nid, 424 endpointID: eid, 425 peerIP: peerIP, 426 peerIPMask: peerIPMask, 427 peerMac: peerMac, 428 vtepIP: vtep, 429 callerName: caller.Name(1), 430 localPeer: localPeer, 431 } 432 } 433 434 func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 435 peerMac net.HardwareAddr, vtep net.IP, localPeer bool) error { 436 437 if err := validateID(nid, eid); err != nil { 438 return err 439 } 440 441 deleted, dbEntries := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer) 442 if !deleted { 443 logrus.Warnf("Entry was not in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v", 444 nid, eid, peerIP, peerMac, localPeer, vtep) 445 } 446 447 n := d.network(nid) 448 if n == nil { 449 return nil 450 } 451 452 sbox := n.sandbox() 453 if sbox == nil { 454 return nil 455 } 456 457 if err := d.checkEncryption(nid, vtep, 0, localPeer, false); err != nil { 458 logrus.Warn(err) 459 } 460 461 // Local peers do not have any local configuration to delete 462 if !localPeer { 463 // Remove fdb entry to the bridge for the peer mac 464 if err := sbox.DeleteNeighbor(vtep, peerMac, true); err != nil { 465 if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 0 { 466 // We fall in here if there is a transient state and if the neighbor that is being deleted 467 // was never been configured into the kernel (we allow only 1 configuration at the time per <ip,mac> mapping) 468 return nil 469 } 470 return fmt.Errorf("could not delete fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) 471 } 472 473 // Delete neighbor entry for the peer IP 474 if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil { 475 return fmt.Errorf("could not delete neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) 476 } 477 } 478 479 if dbEntries == 0 { 480 return nil 481 } 482 483 // If there is still an entry into the database and the deletion went through without errors means that there is now no 484 // configuration active in the kernel. 485 // Restore one configuration for the <ip,mac> directly from the database, note that is guaranteed that there is one 486 peerKey, peerEntry, err := d.peerDbSearch(nid, peerIP) 487 if err != nil { 488 logrus.Errorf("peerDeleteOp unable to restore a configuration for nid:%s ip:%v mac:%v err:%s", nid, peerIP, peerMac, err) 489 return err 490 } 491 return d.peerAddOp(nid, peerEntry.eid, peerIP, peerEntry.peerIPMask, peerKey.peerMac, peerEntry.vtep, false, false, false, peerEntry.isLocal) 492 } 493 494 func (d *driver) peerFlush(nid string) { 495 d.peerOpCh <- &peerOperation{ 496 opType: peerOperationFLUSH, 497 networkID: nid, 498 callerName: caller.Name(1), 499 } 500 } 501 502 func (d *driver) peerFlushOp(nid string) error { 503 d.peerDb.Lock() 504 defer d.peerDb.Unlock() 505 _, ok := d.peerDb.mp[nid] 506 if !ok { 507 return fmt.Errorf("Unable to find the peerDB for nid:%s", nid) 508 } 509 delete(d.peerDb.mp, nid) 510 return nil 511 } 512 513 func (d *driver) pushLocalDb() { 514 d.peerDbWalk(func(nid string, pKey *peerKey, pEntry *peerEntry) bool { 515 if pEntry.isLocal { 516 d.pushLocalEndpointEvent("join", nid, pEntry.eid) 517 } 518 return false 519 }) 520 } 521 522 func (d *driver) peerDBUpdateSelf() { 523 d.peerDbWalk(func(nid string, pkey *peerKey, pEntry *peerEntry) bool { 524 if pEntry.isLocal { 525 pEntry.vtep = net.ParseIP(d.advertiseAddress) 526 } 527 return false 528 }) 529 }