github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/transport/bundle/stream_bundle.go (about)

     1  // Package bundle provides multi-streaming transport with the functionality
     2  // to dynamically (un)register receive endpoints, establish long-lived flows, and more.
     3  /*
     4   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     5   */
     6  package bundle
     7  
     8  import (
     9  	"fmt"
    10  	"sync"
    11  	ratomic "sync/atomic"
    12  
    13  	"github.com/NVIDIA/aistore/cmn"
    14  	"github.com/NVIDIA/aistore/cmn/atomic"
    15  	"github.com/NVIDIA/aistore/cmn/cos"
    16  	"github.com/NVIDIA/aistore/cmn/debug"
    17  	"github.com/NVIDIA/aistore/cmn/nlog"
    18  	"github.com/NVIDIA/aistore/core"
    19  	"github.com/NVIDIA/aistore/core/meta"
    20  	"github.com/NVIDIA/aistore/transport"
    21  )
    22  
// close actions accepted by (*Streams).apply:
// - closeFin:  call Fin() on every stream that is not yet terminated (used by Close(true))
// - closeStop: call Stop() on every stream that is not yet terminated (used by Close(false))
const (
	closeFin = iota
	closeStop
)
    27  
// Internal containers:
//   - stsdest: the streams opened to one destination node
//   - robin:   stsdest plus the counter `i` that sendOne increments to pick
//     the next stream (used only when the bundle's multiplier > 1)
//   - bundle:  node ID => robin; a new map is built and atomically published
//     by Resync rather than being modified in place
type (
	// multiple streams to the same destination with round-robin selection
	stsdest []*transport.Stream
	robin   struct {
		stsdest stsdest
		i       atomic.Int64
	}
	bundle map[string]*robin // stream "bundle" indexed by node ID
)
    37  
type (
	// Streams is a bundle of transport streams: `multiplier` streams per
	// destination node, kept in sync with the cluster map by Resync.
	// The current bundle is published via the atomic `streams` pointer and
	// read by Send, GetStats, Abort and apply without taking `smaplock`;
	// `smaplock` is held by New and ListenSmapChanged around Resync.
	// `lid` is the log/display ID built in New and returned by String().
	Streams struct {
		client       transport.Client
		smap         *meta.Smap // current Smap
		smaplock     *sync.Mutex
		streams      ratomic.Pointer[bundle] // stream bundle
		trname       string
		network      string
		lid          string
		extra        transport.Extra
		rxNodeType   int // receiving nodes: one of core.Targets, core.Proxies, core.AllNodes (see Resync)
		multiplier   int // optionally: multiple streams per destination (round-robin)
		manualResync bool
	}
	// Stats holds per-destination stream statistics as returned by GetStats.
	Stats map[string]*transport.Stats // by DaemonID

	// Args are the construction parameters for New.
	Args struct {
		Extra        *transport.Extra // additional parameters; New debug-asserts non-nil with Config set
		Net          string           // one of cmn.KnownNetworks, empty defaults to cmn.NetIntraData
		Trname       string           // transport endpoint name
		Ntype        int              // receiving node type: core.Targets (0) by default
		Multiplier   int              // so-many TCP connections per Rx endpoint, with round-robin
		ManualResync bool             // auto-resync by default
	}

	// ErrDestinationMissing is returned by Send when one of the explicitly
	// requested destination nodes has no entry in the current bundle.
	ErrDestinationMissing struct {
		streamStr string
		tname     string
		smapStr   string
	}
)
    69  
// interface guard: compile-time check that *Streams satisfies meta.Slistener
var _ meta.Slistener = (*Streams)(nil)
    72  
    73  //
    74  // public
    75  //
    76  
    77  func (sb *Streams) UsePDU() bool   { return sb.extra.UsePDU() }
    78  func (sb *Streams) Trname() string { return sb.trname }
    79  
    80  func New(cl transport.Client, args Args) (sb *Streams) {
    81  	if args.Net == "" {
    82  		args.Net = cmn.NetIntraData
    83  	}
    84  	sb = &Streams{
    85  		smap:         &meta.Smap{}, // empty on purpose (see Resync)
    86  		smaplock:     &sync.Mutex{},
    87  		client:       cl,
    88  		network:      args.Net,
    89  		trname:       args.Trname,
    90  		rxNodeType:   args.Ntype,
    91  		multiplier:   args.Multiplier,
    92  		manualResync: args.ManualResync,
    93  	}
    94  	debug.Assert(args.Extra != nil && args.Extra.Config != nil)
    95  	sb.extra = *args.Extra
    96  	if sb.multiplier == 0 {
    97  		sb.multiplier = 1
    98  	}
    99  	if sb.extra.Config == nil {
   100  		sb.extra.Config = cmn.GCO.Get()
   101  	}
   102  	if !sb.extra.Compressed() {
   103  		sb.lid = fmt.Sprintf("sb[%s-%s-%s]", core.T.SID(), sb.network, sb.trname)
   104  	} else {
   105  		sb.lid = fmt.Sprintf("sb[%s-%s-%s[%s]]", core.T.SID(), sb.network, sb.trname,
   106  			cos.ToSizeIEC(int64(sb.extra.Config.Transport.LZ4BlockMaxSize), 0))
   107  	}
   108  
   109  	// update streams when Smap changes
   110  	sb.smaplock.Lock()
   111  	sb.Resync()
   112  	sb.smaplock.Unlock()
   113  
   114  	// register this stream-bundle as Smap listener
   115  	if !sb.manualResync {
   116  		listeners := core.T.Sowner().Listeners()
   117  		listeners.Reg(sb)
   118  	}
   119  	return
   120  }
   121  
   122  // Close closes all contained streams and unregisters the bundle from Smap listeners;
   123  // graceful=true blocks until all pending objects get completed (for "completion", see transport/README.md)
   124  func (sb *Streams) Close(gracefully bool) {
   125  	if gracefully {
   126  		sb.apply(closeFin)
   127  	} else {
   128  		sb.apply(closeStop)
   129  	}
   130  	if !sb.manualResync {
   131  		listeners := core.T.Sowner().Listeners()
   132  		listeners.Unreg(sb)
   133  	}
   134  }
   135  
   136  // when (nodes == nil) transmit via all established streams in a bundle
   137  // otherwise, restrict to the specified subset (nodes)
   138  func (sb *Streams) Send(obj *transport.Obj, roc cos.ReadOpenCloser, nodes ...*meta.Snode) (err error) {
   139  	debug.Assert(!transport.ReservedOpcode(obj.Hdr.Opcode))
   140  	streams := sb.get()
   141  	if len(streams) == 0 {
   142  		err = fmt.Errorf("no streams %s => .../%s", core.T.Snode(), sb.trname)
   143  	} else if nodes != nil && len(nodes) == 0 {
   144  		err = fmt.Errorf("no destinations %s => .../%s", core.T.Snode(), sb.trname)
   145  	} else if obj.IsUnsized() && sb.extra.SizePDU == 0 {
   146  		err = fmt.Errorf("[%s] sending unsized object supported only with PDUs", obj.Hdr.Cname())
   147  	}
   148  
   149  	if err != nil {
   150  		if cmn.Rom.FastV(5, cos.SmoduleTransport) {
   151  			nlog.Warningln(err)
   152  		}
   153  		// compare w/ transport doCmpl()
   154  		_doCmpl(obj, roc, err)
   155  		return
   156  	}
   157  	if obj.Callback == nil {
   158  		obj.Callback = sb.extra.Callback
   159  	}
   160  	if obj.IsHeaderOnly() {
   161  		roc = nil
   162  	}
   163  
   164  	if nodes == nil {
   165  		idx, cnt := 0, len(streams)
   166  		obj.SetPrc(cnt)
   167  		// Reader-reopening logic: since the streams in a bundle are mutually independent
   168  		// and asynchronous, reader.Open() (aka reopen) is skipped for the 1st replica
   169  		// that we put on the wire and is done for the 2nd, 3rd, etc. replicas.
   170  		// In other words, for the N object replicas over the N bundled streams, the
   171  		// original reader will get reopened (N-1) times.
   172  		for sid, robin := range streams {
   173  			if core.T.SID() == sid {
   174  				continue
   175  			}
   176  			if err = sb.sendOne(obj, roc, robin, idx, cnt); err != nil {
   177  				return
   178  			}
   179  			idx++
   180  		}
   181  	} else {
   182  		// first, check streams vs destinations
   183  		for _, di := range nodes {
   184  			if _, ok := streams[di.ID()]; ok {
   185  				continue
   186  			}
   187  			err = &ErrDestinationMissing{sb.String(), di.StringEx(), sb.smap.String()}
   188  			_doCmpl(obj, roc, err) // ditto
   189  			return
   190  		}
   191  		// second, do send. Same comment wrt reopening.
   192  		cnt := len(nodes)
   193  		obj.SetPrc(cnt)
   194  		for idx, di := range nodes {
   195  			robin := streams[di.ID()]
   196  			if err = sb.sendOne(obj, roc, robin, idx, cnt); err != nil {
   197  				return
   198  			}
   199  		}
   200  	}
   201  	return
   202  }
   203  
   204  func _doCmpl(obj *transport.Obj, roc cos.ReadOpenCloser, err error) {
   205  	if roc != nil {
   206  		cos.Close(roc)
   207  	}
   208  	if obj.Callback != nil {
   209  		obj.Callback(&obj.Hdr, roc, obj.CmplArg, err)
   210  	}
   211  }
   212  
   213  func (sb *Streams) String() string   { return sb.lid }
   214  func (sb *Streams) Smap() *meta.Smap { return sb.smap }
   215  
   216  // keep streams to => (clustered nodes as per rxNodeType) in sync at all times
   217  func (sb *Streams) ListenSmapChanged() {
   218  	smap := core.T.Sowner().Get()
   219  	if smap.Version <= sb.smap.Version {
   220  		return
   221  	}
   222  
   223  	sb.smaplock.Lock()
   224  	sb.Resync()
   225  	sb.smaplock.Unlock()
   226  }
   227  
   228  func (sb *Streams) GetStats() Stats {
   229  	streams := sb.get()
   230  	stats := make(Stats, len(streams))
   231  	for id, robin := range streams {
   232  		s := robin.stsdest[0]
   233  		tstat := s.GetStats()
   234  		stats[id] = &tstat
   235  	}
   236  	return stats
   237  }
   238  
   239  //
   240  // private methods
   241  //
   242  
   243  func (sb *Streams) get() (bun bundle) {
   244  	optr := sb.streams.Load()
   245  	if optr != nil {
   246  		bun = *optr
   247  	}
   248  	return
   249  }
   250  
   251  // one obj, one stream
   252  func (sb *Streams) sendOne(obj *transport.Obj, roc cos.ReadOpenCloser, robin *robin, idx, cnt int) error {
   253  	obj.Hdr.SID = core.T.SID()
   254  	one := obj
   255  	one.Reader = roc
   256  	if cnt == 1 {
   257  		goto snd
   258  	}
   259  	one = transport.AllocSend()
   260  	*one = *obj
   261  	if idx > 0 && roc != nil {
   262  		reader, err := roc.Open()
   263  		if err != nil { // reopen for every destination
   264  			err := fmt.Errorf("%s failed to reopen %q reader: %v", sb, obj, err)
   265  			debug.AssertNoErr(err) // must never happen
   266  			return err
   267  		}
   268  		one.Reader = reader
   269  	}
   270  snd:
   271  	i := 0
   272  	if sb.multiplier > 1 {
   273  		i = int(robin.i.Inc()) % len(robin.stsdest)
   274  	}
   275  	s := robin.stsdest[i]
   276  	return s.Send(one)
   277  }
   278  
   279  func (sb *Streams) Abort() {
   280  	streams := sb.get()
   281  	for _, robin := range streams {
   282  		for _, s := range robin.stsdest {
   283  			s.Abort()
   284  		}
   285  	}
   286  }
   287  
   288  func (sb *Streams) apply(action int) {
   289  	cos.Assert(action == closeFin || action == closeStop)
   290  	var (
   291  		streams = sb.get()
   292  		wg      = &sync.WaitGroup{}
   293  	)
   294  	for _, robin := range streams {
   295  		wg.Add(1)
   296  		go func(stsdest stsdest, wg *sync.WaitGroup) {
   297  			for _, s := range stsdest {
   298  				if !s.IsTerminated() {
   299  					if action == closeFin {
   300  						s.Fin()
   301  					} else {
   302  						s.Stop()
   303  					}
   304  				}
   305  			}
   306  			wg.Done()
   307  		}(robin.stsdest, wg)
   308  	}
   309  	wg.Wait()
   310  }
   311  
   312  // Resync streams asynchronously
   313  // is a slowpath; is called under lock; NOTE: calls stream.Stop()
   314  func (sb *Streams) Resync() {
   315  	smap := core.T.Sowner().Get()
   316  	if smap.Version <= sb.smap.Version {
   317  		debug.Assertf(smap.Version == sb.smap.Version, "%s[%s]: %s vs %s", sb.trname, sb.lid, smap, sb.smap)
   318  		return
   319  	}
   320  
   321  	var (
   322  		oldm []meta.NodeMap
   323  		newm []meta.NodeMap
   324  		node = smap.GetNode(core.T.SID()) // upd flags
   325  	)
   326  	switch sb.rxNodeType {
   327  	case core.Targets:
   328  		oldm = []meta.NodeMap{sb.smap.Tmap}
   329  		newm = []meta.NodeMap{smap.Tmap}
   330  	case core.Proxies:
   331  		oldm = []meta.NodeMap{sb.smap.Pmap}
   332  		newm = []meta.NodeMap{smap.Pmap}
   333  	case core.AllNodes:
   334  		oldm = []meta.NodeMap{sb.smap.Tmap, sb.smap.Pmap}
   335  		newm = []meta.NodeMap{smap.Tmap, smap.Pmap}
   336  	default:
   337  		debug.Assert(false)
   338  	}
   339  	if node == nil {
   340  		// extremely unlikely
   341  		debug.Assert(false, core.T.SID())
   342  		newm = []meta.NodeMap{make(meta.NodeMap)}
   343  	} else {
   344  		core.T.Snode().Flags = node.Flags
   345  	}
   346  
   347  	added, removed := mdiff(oldm, newm)
   348  
   349  	obundle := sb.get()
   350  	l := len(added) - len(removed)
   351  	if obundle != nil {
   352  		l = max(len(obundle), len(obundle)+l)
   353  	}
   354  	nbundle := make(bundle, l)
   355  	for id, robin := range obundle {
   356  		nbundle[id] = robin
   357  	}
   358  	for id, si := range added {
   359  		if id == core.T.SID() {
   360  			continue
   361  		}
   362  		// not connecting to the peer that's in maintenance and already rebalanced-out
   363  		if si.InMaintPostReb() {
   364  			nlog.Infof("%s => %s[-/%s] per %s - skipping", sb, si.StringEx(), si.Fl2S(), smap)
   365  			continue
   366  		}
   367  
   368  		dstURL := si.URL(sb.network) + transport.ObjURLPath(sb.trname) // direct destination URL
   369  		nrobin := &robin{stsdest: make(stsdest, sb.multiplier)}
   370  		for k := range sb.multiplier {
   371  			ns := transport.NewObjStream(sb.client, dstURL, id /*dstID*/, &sb.extra)
   372  			nrobin.stsdest[k] = ns
   373  		}
   374  		nbundle[id] = nrobin
   375  	}
   376  	for id := range removed {
   377  		if id == core.T.SID() {
   378  			continue
   379  		}
   380  		orobin := nbundle[id]
   381  		for k := range sb.multiplier {
   382  			os := orobin.stsdest[k]
   383  			if !os.IsTerminated() {
   384  				os.Stop() // the node is gone but the stream appears to be still active - stop it
   385  			}
   386  		}
   387  		delete(nbundle, id)
   388  	}
   389  	sb.streams.Store(&nbundle)
   390  	sb.smap = smap
   391  }
   392  
   393  // helper to find out NodeMap "delta" or "diff"
   394  func mdiff(oldMaps, newMaps []meta.NodeMap) (added, removed meta.NodeMap) {
   395  	for i, mold := range oldMaps {
   396  		mnew := newMaps[i]
   397  		for id, si := range mnew {
   398  			if _, ok := mold[id]; !ok {
   399  				if added == nil {
   400  					added = make(meta.NodeMap, max(len(mnew)-len(mold), 1))
   401  				}
   402  				added[id] = si
   403  			}
   404  		}
   405  	}
   406  	for i, mold := range oldMaps {
   407  		mnew := newMaps[i]
   408  		for id, si := range mold {
   409  			if _, ok := mnew[id]; !ok {
   410  				if removed == nil {
   411  					removed = make(meta.NodeMap, 1)
   412  				}
   413  				removed[id] = si
   414  			}
   415  		}
   416  	}
   417  	return
   418  }
   419  
   420  ///////////////////////////
   421  // ErrDestinationMissing //
   422  ///////////////////////////
   423  
   424  func (e *ErrDestinationMissing) Error() string {
   425  	return fmt.Sprintf("destination missing: stream (%s) => %s, %s", e.streamStr, e.tname, e.smapStr)
   426  }
   427  
   428  func IsErrDestinationMissing(e error) bool {
   429  	_, ok := e.(*ErrDestinationMissing)
   430  	return ok
   431  }