// Package bundle provides multi-streaming transport with the functionality
// to dynamically (un)register receive endpoints, establish long-lived flows, and more.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package bundle

import (
	"fmt"
	"sync"
	ratomic "sync/atomic"

	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/transport"
)

const (
	closeFin = iota
	closeStop
)

type (
	// multiple streams to the same destination with round-robin selection
	stsdest []*transport.Stream
	robin   struct {
		stsdest stsdest
		i       atomic.Int64
	}
	bundle map[string]*robin // stream "bundle" indexed by node ID
)

type (
	Streams struct {
		client       transport.Client
		smap         *meta.Smap // current Smap
		smaplock     *sync.Mutex
		streams      ratomic.Pointer[bundle] // stream bundle
		trname       string
		network      string
		lid          string
		extra        transport.Extra
		rxNodeType   int // receiving nodes: [Targets, ..., AllNodes] enum above
		multiplier   int // optionally: multiple streams per destination (round-robin)
		manualResync bool
	}
	Stats map[string]*transport.Stats // by DaemonID

	Args struct {
		Extra        *transport.Extra // additional parameters
		Net          string           // one of cmn.KnownNetworks, empty defaults to cmn.NetIntraData
		Trname       string           // transport endpoint name
		Ntype        int              // core.Target (0) by default
		Multiplier   int              // so-many TCP connections per Rx endpoint, with round-robin
		ManualResync bool             // auto-resync by default
	}

	ErrDestinationMissing struct {
		streamStr string
		tname     string
		smapStr   string
	}
)

// interface guard
var _ meta.Slistener = (*Streams)(nil)

//
// public
//

func (sb *Streams) UsePDU() bool   { return sb.extra.UsePDU() }
func (sb *Streams) Trname() string { return sb.trname }

func New(cl transport.Client, args Args) (sb *Streams) {
	if args.Net == "" {
		args.Net = cmn.NetIntraData
	}
	sb = &Streams{
		smap:         &meta.Smap{}, // empty on purpose (see Resync)
		smaplock:     &sync.Mutex{},
		client:       cl,
		network:      args.Net,
		trname:       args.Trname,
		rxNodeType:   args.Ntype,
		multiplier:   args.Multiplier,
		manualResync: args.ManualResync,
	}
	debug.Assert(args.Extra != nil && args.Extra.Config != nil)
	sb.extra = *args.Extra
	if sb.multiplier == 0 {
		sb.multiplier = 1
	}
	if sb.extra.Config == nil {
		sb.extra.Config = cmn.GCO.Get()
	}
	if !sb.extra.Compressed() {
		sb.lid = fmt.Sprintf("sb[%s-%s-%s]", core.T.SID(), sb.network, sb.trname)
	} else {
		sb.lid = fmt.Sprintf("sb[%s-%s-%s[%s]]", core.T.SID(), sb.network, sb.trname,
			cos.ToSizeIEC(int64(sb.extra.Config.Transport.LZ4BlockMaxSize), 0))
	}

	// update streams when Smap changes
	sb.smaplock.Lock()
	sb.Resync()
	sb.smaplock.Unlock()

	// register this stream-bundle as Smap listener
	if !sb.manualResync {
		listeners := core.T.Sowner().Listeners()
		listeners.Reg(sb)
	}
	return
}

// Close closes all contained streams and unregisters the bundle from Smap listeners;
// graceful=true blocks until all pending objects get completed (for "completion", see transport/README.md)
func (sb *Streams) Close(gracefully bool) {
	if gracefully {
		sb.apply(closeFin)
	} else {
		sb.apply(closeStop)
	}
	if !sb.manualResync {
		listeners := core.T.Sowner().Listeners()
		listeners.Unreg(sb)
	}
}

// when (nodes == nil) transmit via all established streams in a bundle
// otherwise, restrict to the specified subset (nodes)
func (sb *Streams) Send(obj *transport.Obj, roc cos.ReadOpenCloser, nodes ...*meta.Snode) (err error) {
	debug.Assert(!transport.ReservedOpcode(obj.Hdr.Opcode))
	streams := sb.get()
	if len(streams) == 0 {
		err = fmt.Errorf("no streams %s => .../%s", core.T.Snode(), sb.trname)
	} else if nodes != nil && len(nodes) == 0 {
		err = fmt.Errorf("no destinations %s => .../%s", core.T.Snode(), sb.trname)
	} else if obj.IsUnsized() && sb.extra.SizePDU == 0 {
		err = fmt.Errorf("[%s] sending unsized object supported only with PDUs", obj.Hdr.Cname())
	}

	if err != nil {
		if cmn.Rom.FastV(5, cos.SmoduleTransport) {
			nlog.Warningln(err)
		}
		// compare w/ transport doCmpl()
		_doCmpl(obj, roc, err)
		return
	}
	if obj.Callback == nil {
		obj.Callback = sb.extra.Callback
	}
	if obj.IsHeaderOnly() {
		roc = nil
	}

	if nodes == nil {
		idx, cnt := 0, len(streams)
		obj.SetPrc(cnt)
		// Reader-reopening logic: since the streams in a bundle are mutually independent
		// and asynchronous, reader.Open() (aka reopen) is skipped for the 1st replica
		// that we put on the wire and is done for the 2nd, 3rd, etc. replicas.
		// In other words, for the N object replicas over the N bundled streams, the
		// original reader will get reopened (N-1) times.
		for sid, robin := range streams {
			if core.T.SID() == sid {
				continue
			}
			if err = sb.sendOne(obj, roc, robin, idx, cnt); err != nil {
				return
			}
			idx++
		}
	} else {
		// first, check streams vs destinations
		for _, di := range nodes {
			if _, ok := streams[di.ID()]; ok {
				continue
			}
			err = &ErrDestinationMissing{sb.String(), di.StringEx(), sb.smap.String()}
			_doCmpl(obj, roc, err) // ditto
			return
		}
		// second, do send. Same comment wrt reopening.
		cnt := len(nodes)
		obj.SetPrc(cnt)
		for idx, di := range nodes {
			robin := streams[di.ID()]
			if err = sb.sendOne(obj, roc, robin, idx, cnt); err != nil {
				return
			}
		}
	}
	return
}

func _doCmpl(obj *transport.Obj, roc cos.ReadOpenCloser, err error) {
	if roc != nil {
		cos.Close(roc)
	}
	if obj.Callback != nil {
		obj.Callback(&obj.Hdr, roc, obj.CmplArg, err)
	}
}

func (sb *Streams) String() string   { return sb.lid }
func (sb *Streams) Smap() *meta.Smap { return sb.smap }

// keep streams to => (clustered nodes as per rxNodeType) in sync at all times
func (sb *Streams) ListenSmapChanged() {
	smap := core.T.Sowner().Get()
	if smap.Version <= sb.smap.Version {
		return
	}

	sb.smaplock.Lock()
	sb.Resync()
	sb.smaplock.Unlock()
}

func (sb *Streams) GetStats() Stats {
	streams := sb.get()
	stats := make(Stats, len(streams))
	for id, robin := range streams {
		s := robin.stsdest[0]
		tstat := s.GetStats()
		stats[id] = &tstat
	}
	return stats
}

//
// private methods
//

func (sb *Streams) get() (bun bundle) {
	optr := sb.streams.Load()
	if optr != nil {
		bun = *optr
	}
	return
}

// one obj, one stream
func (sb *Streams) sendOne(obj *transport.Obj, roc cos.ReadOpenCloser, robin *robin, idx, cnt int) error {
	obj.Hdr.SID = core.T.SID()
	one := obj
	one.Reader = roc
	if cnt == 1 {
		goto snd
	}
	one = transport.AllocSend()
	*one = *obj
	if idx > 0 && roc != nil {
		reader, err := roc.Open()
		if err != nil { // reopen for every destination
			err := fmt.Errorf("%s failed to reopen %q reader: %v", sb, obj, err)
			debug.AssertNoErr(err) // must never happen
			return err
		}
		one.Reader = reader
	}
snd:
	i := 0
	if sb.multiplier > 1 {
		i = int(robin.i.Inc()) % len(robin.stsdest)
	}
	s := robin.stsdest[i]
	return s.Send(one)
}

func (sb *Streams) Abort() {
	streams := sb.get()
	for _, robin := range streams {
		for _, s := range robin.stsdest {
			s.Abort()
		}
	}
}

func (sb *Streams) apply(action int) {
	cos.Assert(action == closeFin || action == closeStop)
	var (
		streams = sb.get()
		wg      = &sync.WaitGroup{}
	)
	for _, robin := range streams {
		wg.Add(1)
		go func(stsdest stsdest, wg *sync.WaitGroup) {
			for _, s := range stsdest {
				if !s.IsTerminated() {
					if action == closeFin {
						s.Fin()
					} else {
						s.Stop()
					}
				}
			}
			wg.Done()
		}(robin.stsdest, wg)
	}
	wg.Wait()
}

// Resync streams asynchronously
// is a slowpath; is called under lock; NOTE: calls stream.Stop()
func (sb *Streams) Resync() {
	smap := core.T.Sowner().Get()
	if smap.Version <= sb.smap.Version {
		debug.Assertf(smap.Version == sb.smap.Version, "%s[%s]: %s vs %s", sb.trname, sb.lid, smap, sb.smap)
		return
	}

	var (
		oldm []meta.NodeMap
		newm []meta.NodeMap
		node = smap.GetNode(core.T.SID()) // upd flags
	)
	switch sb.rxNodeType {
	case core.Targets:
		oldm = []meta.NodeMap{sb.smap.Tmap}
		newm = []meta.NodeMap{smap.Tmap}
	case core.Proxies:
		oldm = []meta.NodeMap{sb.smap.Pmap}
		newm = []meta.NodeMap{smap.Pmap}
	case core.AllNodes:
		oldm = []meta.NodeMap{sb.smap.Tmap, sb.smap.Pmap}
		newm = []meta.NodeMap{smap.Tmap, smap.Pmap}
	default:
		debug.Assert(false)
	}
	if node == nil {
		// extremely unlikely
		debug.Assert(false, core.T.SID())
		newm = []meta.NodeMap{make(meta.NodeMap)}
	} else {
		core.T.Snode().Flags = node.Flags
	}

	added, removed := mdiff(oldm, newm)

	obundle := sb.get()
	l := len(added) - len(removed)
	if obundle != nil {
		l = max(len(obundle), len(obundle)+l)
	}
	nbundle := make(bundle, l)
	for id, robin := range obundle {
		nbundle[id] = robin
	}
	for id, si := range added {
		if id == core.T.SID() {
			continue
		}
		// not connecting to the peer that's in maintenance and already rebalanced-out
		if si.InMaintPostReb() {
			nlog.Infof("%s => %s[-/%s] per %s - skipping", sb, si.StringEx(), si.Fl2S(), smap)
			continue
		}

		dstURL := si.URL(sb.network) + transport.ObjURLPath(sb.trname) // direct destination URL
		nrobin := &robin{stsdest: make(stsdest, sb.multiplier)}
		for k := range sb.multiplier {
			ns := transport.NewObjStream(sb.client, dstURL, id /*dstID*/, &sb.extra)
			nrobin.stsdest[k] = ns
		}
		nbundle[id] = nrobin
	}
	for id := range removed {
		if id == core.T.SID() {
			continue
		}
		orobin := nbundle[id]
		for k := range sb.multiplier {
			os := orobin.stsdest[k]
			if !os.IsTerminated() {
				os.Stop() // the node is gone but the stream appears to be still active - stop it
			}
		}
		delete(nbundle, id)
	}
	sb.streams.Store(&nbundle)
	sb.smap = smap
}

// helper to find out NodeMap "delta" or "diff"
func mdiff(oldMaps, newMaps []meta.NodeMap) (added, removed meta.NodeMap) {
	for i, mold := range oldMaps {
		mnew := newMaps[i]
		for id, si := range mnew {
			if _, ok := mold[id]; !ok {
				if added == nil {
					added = make(meta.NodeMap, max(len(mnew)-len(mold), 1))
				}
				added[id] = si
			}
		}
	}
	for i, mold := range oldMaps {
		mnew := newMaps[i]
		for id, si := range mold {
			if _, ok := mnew[id]; !ok {
				if removed == nil {
					removed = make(meta.NodeMap, 1)
				}
				removed[id] = si
			}
		}
	}
	return
}

///////////////////////////
// ErrDestinationMissing //
///////////////////////////

func (e *ErrDestinationMissing) Error() string {
	return fmt.Sprintf("destination missing: stream (%s) => %s, %s", e.streamStr, e.tname, e.smapStr)
}

func IsErrDestinationMissing(e error) bool {
	_, ok := e.(*ErrDestinationMissing)
	return ok
}
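
// Usage note (an illustrative sketch): when Send() is called with an explicit subset of
// destination nodes, a missing stream for any of them fails the call up front with
// *ErrDestinationMissing and nothing is transmitted. Callers that treat this as transient,
// e.g. while a new Smap is still being resync-ed, can detect it as follows; the sb, obj,
// roc, and tsi variables are hypothetical.
//
//	if err := sb.Send(obj, roc, tsi); err != nil {
//		if bundle.IsErrDestinationMissing(err) {
//			// no stream to tsi under the bundle's current Smap - resync and/or retry
//		}
//	}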