// github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/nl/listener.go

// Package nl provides interfaces for AIStore notifications
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package nl

import (
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/mono"
	"github.com/NVIDIA/aistore/core/meta"
	jsoniter "github.com/json-iterator/go"
)

// Listener is the listening-side view of an asynchronous operation that a set
// of nodes ("notifiers") report on. ListenerBase (below) provides every method
// of this interface except UnmarshalStats, which is not defined in this file.
type Listener interface {
	// Callback is given the listener itself and a timestamp; in ListenerBase it
	// records ts as the end time (once) and then calls the optional F.
	Callback(nl Listener, ts int64)
	// UnmarshalStats decodes a raw notification payload.
	// NOTE(review): the meaning of the two bool results is not visible in this
	// file - confirm against the concrete implementations.
	UnmarshalStats(rawMsg []byte) (any, bool, bool, error)
	Lock()
	Unlock()
	RLock()
	RUnlock()
	Notifiers() meta.NodeMap
	Kind() string
	Cause() string
	Bcks() []*cmn.Bck
	AddErr(error)
	Err() error
	ErrCnt() int
	UUID() string
	SetAborted()
	Aborted() bool
	Status() *Status
	SetStats(daeID string, stats any)
	NodeStats() *NodeStats
	QueryArgs() cmn.HreqArgs
	EndTime() int64
	SetAddedTime()
	AddedTime() int64
	Finished() bool
	Name() string
	String() string
	GetOwner() string
	SetOwner(string)
	LastUpdated(*meta.Snode) int64
	ProgressInterval() time.Duration

	// detailed ref-counting
	ActiveNotifiers() meta.NodeMap
	FinCount() int
	ActiveCount() int
	HasFinished(*meta.Snode) bool
	MarkFinished(*meta.Snode)
	NodesTardy(periodicNotifTime time.Duration) (nodes meta.NodeMap, tardy bool)
}

type (
	// Callback is the signature of the optional function stored in
	// ListenerBase.F and invoked from ListenerBase.Callback.
	Callback func(n Listener)

	// NodeStats is a map of per-node stats guarded by the embedded RWMutex.
	NodeStats struct {
		sync.RWMutex
		stats map[string]any // daeID => Stats (e.g. cmn.SnapExt)
	}

	// ListenerBase holds the state shared by listeners: identity (Common),
	// the notifier sets, per-node stats, and the end/abort/error status.
	// mu is exposed through the Lock/Unlock/RLock/RUnlock methods.
	ListenerBase struct {
		mu     sync.RWMutex
		Common struct {
			UUID  string
			Kind  string // async operation kind (see api/apc/actmsg.go)
			Cause string // causal action (e.g. decommission => rebalance)
			Owned string // "": not owned | equalIC: IC | otherwise, pid + IC
			Bck   []*cmn.Bck
		}
		// construction
		Srcs        meta.NodeMap     // all notifiers
		ActiveSrcs  meta.NodeMap     // running notifiers
		F           Callback         `json:"-"` // optional listening-side callback
		Stats       *NodeStats       // [daeID => Stats (e.g. cmn.SnapExt)]
		lastUpdated map[string]int64 // [daeID => last update time(nanoseconds)]
		progress    time.Duration    // time interval to monitor the progress
		addedTime   atomic.Int64     // Time when `nl` is added

		// runtime
		EndTimeX atomic.Int64 // timestamp when finished
		AbortedX atomic.Bool  // sets if the xaction is Aborted
		Errs     cos.Errs     // reported error and count
	}

	// Status is the JSON-serializable summary of a listener
	// (see the field tags for the wire names).
	Status struct {
		Kind     string `json:"kind"`     // xaction kind
		UUID     string `json:"uuid"`     // xaction UUID
		ErrMsg   string `json:"err"`      // error
		EndTimeX int64  `json:"end_time"` // time xaction ended
		AbortedX bool   `json:"aborted"`  // true if aborted
	}
	// StatusVec is a list of Status values.
	StatusVec []Status
)

//////////////////
// ListenerBase //
//////////////////

// NewNLB builds a ListenerBase for the operation `uuid` of kind `action`
// (optionally caused by `cause`), with `srcs` as the full set of notifiers.
// Stats and the last-updated map are pre-sized to len(srcs); the active set
// is initialized from srcs.ActiveMap(). `progress` is stored as the progress
// interval and `bck` as the associated buckets.
func NewNLB(uuid, action, cause string, srcs meta.NodeMap, progress time.Duration, bck ...*cmn.Bck) *ListenerBase {
	nlb := &ListenerBase{
		Srcs:        srcs,
		Stats:       NewNodeStats(len(srcs)),
		progress:    progress,
		lastUpdated: make(map[string]int64, len(srcs)),
	}
	// Common is an anonymous struct, hence field-by-field assignment
	nlb.Common.UUID = uuid
	nlb.Common.Kind = action
	nlb.Common.Cause = cause
	nlb.Common.Bck = bck
	nlb.ActiveSrcs = srcs.ActiveMap()
	return nlb
}

// thin wrappers over the private RWMutex
func (nlb *ListenerBase) Lock()    { nlb.mu.Lock() }
func (nlb *ListenerBase) Unlock()  { nlb.mu.Unlock() }
func (nlb *ListenerBase) RLock()   { nlb.mu.RLock() }
func (nlb *ListenerBase) RUnlock() { nlb.mu.RUnlock() }

// plain accessors; Notifiers returns the map itself, not a copy
func (nlb *ListenerBase) Notifiers() meta.NodeMap { return nlb.Srcs }
func (nlb *ListenerBase) UUID() string            { return nlb.Common.UUID }
func (nlb *ListenerBase) Aborted() bool           { return nlb.AbortedX.Load() }
func (nlb *ListenerBase) SetAborted() { nlb.AbortedX.CAS(false, true) } 133 func (nlb *ListenerBase) EndTime() int64 { return nlb.EndTimeX.Load() } 134 func (nlb *ListenerBase) Finished() bool { return nlb.EndTime() > 0 } 135 func (nlb *ListenerBase) ProgressInterval() time.Duration { return nlb.progress } 136 func (nlb *ListenerBase) NodeStats() *NodeStats { return nlb.Stats } 137 func (nlb *ListenerBase) GetOwner() string { return nlb.Common.Owned } 138 func (nlb *ListenerBase) SetOwner(o string) { nlb.Common.Owned = o } 139 func (nlb *ListenerBase) Kind() string { return nlb.Common.Kind } 140 func (nlb *ListenerBase) Cause() string { return nlb.Common.Cause } 141 func (nlb *ListenerBase) Bcks() []*cmn.Bck { return nlb.Common.Bck } 142 func (nlb *ListenerBase) AddedTime() int64 { return nlb.addedTime.Load() } 143 func (nlb *ListenerBase) SetAddedTime() { nlb.addedTime.Store(mono.NanoTime()) } 144 145 func (nlb *ListenerBase) ActiveNotifiers() meta.NodeMap { return nlb.ActiveSrcs } 146 func (nlb *ListenerBase) ActiveCount() int { return len(nlb.ActiveSrcs) } 147 func (nlb *ListenerBase) FinCount() int { return len(nlb.Srcs) - nlb.ActiveCount() } 148 149 func (nlb *ListenerBase) MarkFinished(node *meta.Snode) { 150 delete(nlb.ActiveSrcs, node.ID()) 151 } 152 153 func (nlb *ListenerBase) HasFinished(node *meta.Snode) bool { 154 return !nlb.ActiveSrcs.Contains(node.ID()) 155 } 156 157 // is called after all Notifiers will have notified OR on failure (err != nil) 158 func (nlb *ListenerBase) Callback(nl Listener, ts int64) { 159 if nlb.EndTimeX.CAS(0, 1) { 160 nlb.EndTimeX.Store(ts) 161 if nlb.F != nil { 162 nlb.F(nl) 163 } 164 } 165 } 166 167 func (nlb *ListenerBase) AddErr(err error) { nlb.Errs.Add(err) } 168 func (nlb *ListenerBase) ErrCnt() int { return nlb.Errs.Cnt() } 169 170 func (nlb *ListenerBase) Err() error { 171 if nlb.ErrCnt() == 0 { 172 return nil 173 } 174 return &nlb.Errs 175 } 176 177 func (nlb *ListenerBase) SetStats(daeID string, stats any) { 178 
debug.AssertRWMutexLocked(&nlb.mu) 179 180 _, ok := nlb.Srcs[daeID] 181 debug.Assert(ok) 182 nlb.Stats.Store(daeID, stats) 183 if nlb.lastUpdated == nil { 184 nlb.lastUpdated = make(map[string]int64, len(nlb.Srcs)) 185 } 186 nlb.lastUpdated[daeID] = mono.NanoTime() 187 } 188 189 func (nlb *ListenerBase) LastUpdated(si *meta.Snode) int64 { 190 if nlb.lastUpdated == nil { 191 return 0 192 } 193 return nlb.lastUpdated[si.ID()] 194 } 195 196 // under rlock 197 func (nlb *ListenerBase) NodesTardy(periodicNotifTime time.Duration) (nodes meta.NodeMap, tardy bool) { 198 if nlb.ProgressInterval() != 0 { 199 periodicNotifTime = nlb.ProgressInterval() 200 } 201 nodes = make(meta.NodeMap, nlb.ActiveCount()) 202 now := mono.NanoTime() 203 for _, si := range nlb.ActiveSrcs { 204 ts := nlb.LastUpdated(si) 205 diff := time.Duration(now - ts) 206 if _, ok := nlb.Stats.Load(si.ID()); ok && diff < periodicNotifTime { 207 continue 208 } 209 nodes.Add(si) 210 tardy = true 211 } 212 return 213 } 214 215 func (nlb *ListenerBase) Status() *Status { 216 return &Status{Kind: nlb.Kind(), UUID: nlb.UUID(), EndTimeX: nlb.EndTimeX.Load(), AbortedX: nlb.Aborted()} 217 } 218 219 func (nlb *ListenerBase) _name() *strings.Builder { 220 var sb strings.Builder 221 sb.WriteString("nl-") 222 sb.WriteString(nlb.Kind()) 223 sb.WriteByte('[') 224 sb.WriteString(nlb.UUID()) 225 sb.WriteByte(']') 226 return &sb 227 } 228 229 func (nlb *ListenerBase) Name() string { 230 sb := nlb._name() 231 return sb.String() 232 } 233 234 func (nlb *ListenerBase) String() string { 235 var ( 236 tm, res string 237 sb = nlb._name() 238 finCount = nlb.FinCount() 239 ) 240 if nlb.Cause() != "" { 241 sb.WriteString("-caused-by-") 242 sb.WriteString(nlb.Cause()) 243 } 244 if bcks := nlb.Bcks(); len(bcks) > 0 { 245 sb.WriteByte('-') 246 sb.WriteString(bcks[0].String()) 247 if len(bcks) > 1 { 248 sb.WriteByte('-') 249 sb.WriteString(bcks[1].String()) 250 } 251 } 252 if tfin := nlb.EndTimeX.Load(); tfin > 0 { 253 if cnt := 
nlb.ErrCnt(); cnt > 0 { 254 res = "-" + nlb.Err().Error() 255 } else { 256 res = "-done" 257 } 258 tm = cos.FormatNanoTime(tfin, cos.StampMicro) 259 sb.WriteByte('-') 260 sb.WriteString(tm) 261 sb.WriteString(res) 262 return sb.String() 263 } 264 if finCount > 0 { 265 sb.WriteString("(cnt=") 266 sb.WriteString(strconv.Itoa(finCount)) 267 sb.WriteByte('/') 268 sb.WriteString(strconv.Itoa(len(nlb.Srcs))) 269 sb.WriteByte(')') 270 return sb.String() 271 } 272 return sb.String() 273 } 274 275 //////////// 276 // Status // 277 //////////// 278 279 func (ns *Status) Finished() bool { return ns.EndTimeX > 0 } 280 func (ns *Status) Aborted() bool { return ns.AbortedX } 281 282 func (ns *Status) String() (s string) { 283 s = ns.Kind + "[" + ns.UUID + "]" 284 switch { 285 case ns.Aborted(): 286 s += "-abrt" 287 case ns.Finished(): 288 if ns.ErrMsg != "" { 289 s += "-" + ns.ErrMsg 290 } else { 291 s += "-done" 292 } 293 } 294 return 295 } 296 297 func (nsv StatusVec) String() (s string) { 298 for _, ns := range nsv { 299 s += ns.String() + ", " 300 } 301 return s[:max(0, len(s)-2)] 302 } 303 304 /////////////// 305 // NodeStats // 306 /////////////// 307 308 func NewNodeStats(sizes ...int) *NodeStats { 309 size := 0 310 if len(sizes) > 0 { 311 size = sizes[0] 312 } 313 return &NodeStats{ 314 stats: make(map[string]any, size), 315 } 316 } 317 318 func (ns *NodeStats) Store(key string, stats any) { 319 ns.Lock() 320 if ns.stats == nil { 321 ns.stats = make(map[string]any) 322 } 323 ns.stats[key] = stats 324 ns.Unlock() 325 } 326 327 func (ns *NodeStats) Range(f func(string, any) bool) { 328 ns.RLock() 329 defer ns.RUnlock() 330 331 for key, val := range ns.stats { 332 if !f(key, val) { 333 return 334 } 335 } 336 } 337 338 func (ns *NodeStats) Load(key string) (val any, ok bool) { 339 ns.RLock() 340 val, ok = ns.stats[key] 341 ns.RUnlock() 342 return 343 } 344 345 func (ns *NodeStats) Len() (l int) { 346 ns.RLock() 347 l = len(ns.stats) 348 ns.RUnlock() 349 return 350 } 351 352 
func (ns *NodeStats) MarshalJSON() (data []byte, err error) { 353 ns.RLock() 354 data, err = jsoniter.Marshal(ns.stats) 355 ns.RUnlock() 356 return 357 } 358 359 func (ns *NodeStats) UnmarshalJSON(data []byte) (err error) { 360 if len(data) == 0 { 361 return nil 362 } 363 return jsoniter.Unmarshal(data, &ns.stats) 364 }