github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/nl/listener.go (about)

     1  // Package nl provides interfaces for AIStore notifications
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package nl
     6  
     7  import (
     8  	"strconv"
     9  	"strings"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/NVIDIA/aistore/cmn"
    14  	"github.com/NVIDIA/aistore/cmn/atomic"
    15  	"github.com/NVIDIA/aistore/cmn/cos"
    16  	"github.com/NVIDIA/aistore/cmn/debug"
    17  	"github.com/NVIDIA/aistore/cmn/mono"
    18  	"github.com/NVIDIA/aistore/core/meta"
    19  	jsoniter "github.com/json-iterator/go"
    20  )
    21  
    22  type Listener interface {
    23  	Callback(nl Listener, ts int64)
    24  	UnmarshalStats(rawMsg []byte) (any, bool, bool, error)
    25  	Lock()
    26  	Unlock()
    27  	RLock()
    28  	RUnlock()
    29  	Notifiers() meta.NodeMap
    30  	Kind() string
    31  	Cause() string
    32  	Bcks() []*cmn.Bck
    33  	AddErr(error)
    34  	Err() error
    35  	ErrCnt() int
    36  	UUID() string
    37  	SetAborted()
    38  	Aborted() bool
    39  	Status() *Status
    40  	SetStats(daeID string, stats any)
    41  	NodeStats() *NodeStats
    42  	QueryArgs() cmn.HreqArgs
    43  	EndTime() int64
    44  	SetAddedTime()
    45  	AddedTime() int64
    46  	Finished() bool
    47  	Name() string
    48  	String() string
    49  	GetOwner() string
    50  	SetOwner(string)
    51  	LastUpdated(*meta.Snode) int64
    52  	ProgressInterval() time.Duration
    53  
    54  	// detailed ref-counting
    55  	ActiveNotifiers() meta.NodeMap
    56  	FinCount() int
    57  	ActiveCount() int
    58  	HasFinished(*meta.Snode) bool
    59  	MarkFinished(*meta.Snode)
    60  	NodesTardy(periodicNotifTime time.Duration) (nodes meta.NodeMap, tardy bool)
    61  }
    62  
    63  type (
    64  	Callback func(n Listener)
    65  
    66  	NodeStats struct {
    67  		sync.RWMutex
    68  		stats map[string]any // daeID => Stats (e.g. cmn.SnapExt)
    69  	}
    70  
    71  	ListenerBase struct {
    72  		mu     sync.RWMutex
    73  		Common struct {
    74  			UUID  string
    75  			Kind  string // async operation kind (see api/apc/actmsg.go)
    76  			Cause string // causal action (e.g. decommission => rebalance)
    77  			Owned string // "": not owned | equalIC: IC | otherwise, pid + IC
    78  			Bck   []*cmn.Bck
    79  		}
    80  		// construction
    81  		Srcs        meta.NodeMap     // all notifiers
    82  		ActiveSrcs  meta.NodeMap     // running notifiers
    83  		F           Callback         `json:"-"` // optional listening-side callback
    84  		Stats       *NodeStats       // [daeID => Stats (e.g. cmn.SnapExt)]
    85  		lastUpdated map[string]int64 // [daeID => last update time(nanoseconds)]
    86  		progress    time.Duration    // time interval to monitor the progress
    87  		addedTime   atomic.Int64     // Time when `nl` is added
    88  
    89  		// runtime
    90  		EndTimeX atomic.Int64 // timestamp when finished
    91  		AbortedX atomic.Bool  // sets if the xaction is Aborted
    92  		Errs     cos.Errs     // reported error and count
    93  	}
    94  
    95  	Status struct {
    96  		Kind     string `json:"kind"`     // xaction kind
    97  		UUID     string `json:"uuid"`     // xaction UUID
    98  		ErrMsg   string `json:"err"`      // error
    99  		EndTimeX int64  `json:"end_time"` // time xaction ended
   100  		AbortedX bool   `json:"aborted"`  // true if aborted
   101  	}
   102  	StatusVec []Status
   103  )
   104  
   105  //////////////////
   106  // ListenerBase //
   107  //////////////////
   108  
   109  func NewNLB(uuid, action, cause string, srcs meta.NodeMap, progress time.Duration, bck ...*cmn.Bck) *ListenerBase {
   110  	nlb := &ListenerBase{
   111  		Srcs:        srcs,
   112  		Stats:       NewNodeStats(len(srcs)),
   113  		progress:    progress,
   114  		lastUpdated: make(map[string]int64, len(srcs)),
   115  	}
   116  	nlb.Common.UUID = uuid
   117  	nlb.Common.Kind = action
   118  	nlb.Common.Cause = cause
   119  	nlb.Common.Bck = bck
   120  	nlb.ActiveSrcs = srcs.ActiveMap()
   121  	return nlb
   122  }
   123  
   124  func (nlb *ListenerBase) Lock()    { nlb.mu.Lock() }
   125  func (nlb *ListenerBase) Unlock()  { nlb.mu.Unlock() }
   126  func (nlb *ListenerBase) RLock()   { nlb.mu.RLock() }
   127  func (nlb *ListenerBase) RUnlock() { nlb.mu.RUnlock() }
   128  
   129  func (nlb *ListenerBase) Notifiers() meta.NodeMap         { return nlb.Srcs }
   130  func (nlb *ListenerBase) UUID() string                    { return nlb.Common.UUID }
   131  func (nlb *ListenerBase) Aborted() bool                   { return nlb.AbortedX.Load() }
   132  func (nlb *ListenerBase) SetAborted()                     { nlb.AbortedX.CAS(false, true) }
   133  func (nlb *ListenerBase) EndTime() int64                  { return nlb.EndTimeX.Load() }
   134  func (nlb *ListenerBase) Finished() bool                  { return nlb.EndTime() > 0 }
   135  func (nlb *ListenerBase) ProgressInterval() time.Duration { return nlb.progress }
   136  func (nlb *ListenerBase) NodeStats() *NodeStats           { return nlb.Stats }
   137  func (nlb *ListenerBase) GetOwner() string                { return nlb.Common.Owned }
   138  func (nlb *ListenerBase) SetOwner(o string)               { nlb.Common.Owned = o }
   139  func (nlb *ListenerBase) Kind() string                    { return nlb.Common.Kind }
   140  func (nlb *ListenerBase) Cause() string                   { return nlb.Common.Cause }
   141  func (nlb *ListenerBase) Bcks() []*cmn.Bck                { return nlb.Common.Bck }
   142  func (nlb *ListenerBase) AddedTime() int64                { return nlb.addedTime.Load() }
   143  func (nlb *ListenerBase) SetAddedTime()                   { nlb.addedTime.Store(mono.NanoTime()) }
   144  
   145  func (nlb *ListenerBase) ActiveNotifiers() meta.NodeMap { return nlb.ActiveSrcs }
   146  func (nlb *ListenerBase) ActiveCount() int              { return len(nlb.ActiveSrcs) }
   147  func (nlb *ListenerBase) FinCount() int                 { return len(nlb.Srcs) - nlb.ActiveCount() }
   148  
   149  func (nlb *ListenerBase) MarkFinished(node *meta.Snode) {
   150  	delete(nlb.ActiveSrcs, node.ID())
   151  }
   152  
   153  func (nlb *ListenerBase) HasFinished(node *meta.Snode) bool {
   154  	return !nlb.ActiveSrcs.Contains(node.ID())
   155  }
   156  
   157  // is called after all Notifiers will have notified OR on failure (err != nil)
   158  func (nlb *ListenerBase) Callback(nl Listener, ts int64) {
   159  	if nlb.EndTimeX.CAS(0, 1) {
   160  		nlb.EndTimeX.Store(ts)
   161  		if nlb.F != nil {
   162  			nlb.F(nl)
   163  		}
   164  	}
   165  }
   166  
   167  func (nlb *ListenerBase) AddErr(err error) { nlb.Errs.Add(err) }
   168  func (nlb *ListenerBase) ErrCnt() int      { return nlb.Errs.Cnt() }
   169  
   170  func (nlb *ListenerBase) Err() error {
   171  	if nlb.ErrCnt() == 0 {
   172  		return nil
   173  	}
   174  	return &nlb.Errs
   175  }
   176  
   177  func (nlb *ListenerBase) SetStats(daeID string, stats any) {
   178  	debug.AssertRWMutexLocked(&nlb.mu)
   179  
   180  	_, ok := nlb.Srcs[daeID]
   181  	debug.Assert(ok)
   182  	nlb.Stats.Store(daeID, stats)
   183  	if nlb.lastUpdated == nil {
   184  		nlb.lastUpdated = make(map[string]int64, len(nlb.Srcs))
   185  	}
   186  	nlb.lastUpdated[daeID] = mono.NanoTime()
   187  }
   188  
   189  func (nlb *ListenerBase) LastUpdated(si *meta.Snode) int64 {
   190  	if nlb.lastUpdated == nil {
   191  		return 0
   192  	}
   193  	return nlb.lastUpdated[si.ID()]
   194  }
   195  
   196  // under rlock
   197  func (nlb *ListenerBase) NodesTardy(periodicNotifTime time.Duration) (nodes meta.NodeMap, tardy bool) {
   198  	if nlb.ProgressInterval() != 0 {
   199  		periodicNotifTime = nlb.ProgressInterval()
   200  	}
   201  	nodes = make(meta.NodeMap, nlb.ActiveCount())
   202  	now := mono.NanoTime()
   203  	for _, si := range nlb.ActiveSrcs {
   204  		ts := nlb.LastUpdated(si)
   205  		diff := time.Duration(now - ts)
   206  		if _, ok := nlb.Stats.Load(si.ID()); ok && diff < periodicNotifTime {
   207  			continue
   208  		}
   209  		nodes.Add(si)
   210  		tardy = true
   211  	}
   212  	return
   213  }
   214  
   215  func (nlb *ListenerBase) Status() *Status {
   216  	return &Status{Kind: nlb.Kind(), UUID: nlb.UUID(), EndTimeX: nlb.EndTimeX.Load(), AbortedX: nlb.Aborted()}
   217  }
   218  
   219  func (nlb *ListenerBase) _name() *strings.Builder {
   220  	var sb strings.Builder
   221  	sb.WriteString("nl-")
   222  	sb.WriteString(nlb.Kind())
   223  	sb.WriteByte('[')
   224  	sb.WriteString(nlb.UUID())
   225  	sb.WriteByte(']')
   226  	return &sb
   227  }
   228  
   229  func (nlb *ListenerBase) Name() string {
   230  	sb := nlb._name()
   231  	return sb.String()
   232  }
   233  
   234  func (nlb *ListenerBase) String() string {
   235  	var (
   236  		tm, res  string
   237  		sb       = nlb._name()
   238  		finCount = nlb.FinCount()
   239  	)
   240  	if nlb.Cause() != "" {
   241  		sb.WriteString("-caused-by-")
   242  		sb.WriteString(nlb.Cause())
   243  	}
   244  	if bcks := nlb.Bcks(); len(bcks) > 0 {
   245  		sb.WriteByte('-')
   246  		sb.WriteString(bcks[0].String())
   247  		if len(bcks) > 1 {
   248  			sb.WriteByte('-')
   249  			sb.WriteString(bcks[1].String())
   250  		}
   251  	}
   252  	if tfin := nlb.EndTimeX.Load(); tfin > 0 {
   253  		if cnt := nlb.ErrCnt(); cnt > 0 {
   254  			res = "-" + nlb.Err().Error()
   255  		} else {
   256  			res = "-done"
   257  		}
   258  		tm = cos.FormatNanoTime(tfin, cos.StampMicro)
   259  		sb.WriteByte('-')
   260  		sb.WriteString(tm)
   261  		sb.WriteString(res)
   262  		return sb.String()
   263  	}
   264  	if finCount > 0 {
   265  		sb.WriteString("(cnt=")
   266  		sb.WriteString(strconv.Itoa(finCount))
   267  		sb.WriteByte('/')
   268  		sb.WriteString(strconv.Itoa(len(nlb.Srcs)))
   269  		sb.WriteByte(')')
   270  		return sb.String()
   271  	}
   272  	return sb.String()
   273  }
   274  
   275  ////////////
   276  // Status //
   277  ////////////
   278  
   279  func (ns *Status) Finished() bool { return ns.EndTimeX > 0 }
   280  func (ns *Status) Aborted() bool  { return ns.AbortedX }
   281  
   282  func (ns *Status) String() (s string) {
   283  	s = ns.Kind + "[" + ns.UUID + "]"
   284  	switch {
   285  	case ns.Aborted():
   286  		s += "-abrt"
   287  	case ns.Finished():
   288  		if ns.ErrMsg != "" {
   289  			s += "-" + ns.ErrMsg
   290  		} else {
   291  			s += "-done"
   292  		}
   293  	}
   294  	return
   295  }
   296  
   297  func (nsv StatusVec) String() (s string) {
   298  	for _, ns := range nsv {
   299  		s += ns.String() + ", "
   300  	}
   301  	return s[:max(0, len(s)-2)]
   302  }
   303  
   304  ///////////////
   305  // NodeStats //
   306  ///////////////
   307  
   308  func NewNodeStats(sizes ...int) *NodeStats {
   309  	size := 0
   310  	if len(sizes) > 0 {
   311  		size = sizes[0]
   312  	}
   313  	return &NodeStats{
   314  		stats: make(map[string]any, size),
   315  	}
   316  }
   317  
   318  func (ns *NodeStats) Store(key string, stats any) {
   319  	ns.Lock()
   320  	if ns.stats == nil {
   321  		ns.stats = make(map[string]any)
   322  	}
   323  	ns.stats[key] = stats
   324  	ns.Unlock()
   325  }
   326  
   327  func (ns *NodeStats) Range(f func(string, any) bool) {
   328  	ns.RLock()
   329  	defer ns.RUnlock()
   330  
   331  	for key, val := range ns.stats {
   332  		if !f(key, val) {
   333  			return
   334  		}
   335  	}
   336  }
   337  
   338  func (ns *NodeStats) Load(key string) (val any, ok bool) {
   339  	ns.RLock()
   340  	val, ok = ns.stats[key]
   341  	ns.RUnlock()
   342  	return
   343  }
   344  
   345  func (ns *NodeStats) Len() (l int) {
   346  	ns.RLock()
   347  	l = len(ns.stats)
   348  	ns.RUnlock()
   349  	return
   350  }
   351  
   352  func (ns *NodeStats) MarshalJSON() (data []byte, err error) {
   353  	ns.RLock()
   354  	data, err = jsoniter.Marshal(ns.stats)
   355  	ns.RUnlock()
   356  	return
   357  }
   358  
   359  func (ns *NodeStats) UnmarshalJSON(data []byte) (err error) {
   360  	if len(data) == 0 {
   361  		return nil
   362  	}
   363  	return jsoniter.Unmarshal(data, &ns.stats)
   364  }