// Package xact provides core functionality for the AIStore eXtended Actions (xactions).
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package xact

import (
	"errors"
	"fmt"
	"sort"
	"strings"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/core"
)

// xaction scope enum: the values of `Descriptor.Scope`.
// Numbering starts at 1 (iota + 1), so the zero value of an int never
// equals any of the defined scopes.
const (
	ScopeG  = iota + 1 // cluster
	ScopeB             // bucket
	ScopeGB            // (one bucket) | (all buckets)
	ScopeT             // target
)

// string delimiters used with xaction IDs; `LeftID` and `RightID` are the
// brackets that `Cname` and `ParseCname` (this file) put around, and expect
// around, the xaction UUID in "kind[uuid]"
const (
	SepaID = ","

	LeftID  = "["
	RightID = "]"
)

// global waiting tunables
// (used in: `api.WaitForXactionIC` and `api.WaitForXactionNode`)
const (
	DefWaitTimeShort = time.Minute        // zero `ArgsMsg.Timeout` defaults to
	DefWaitTimeLong  = 7 * 24 * time.Hour // when `ArgsMsg.Timeout` is negative
	MaxProbingFreq   = 30 * time.Second   // as the name implies
	MinPollTime      = 2 * time.Second    // ditto
	MaxPollTime      = 2 * time.Minute    // can grow up to

	// number of consecutive 'idle' xaction states, with possible numeric
	// values translating as follows:
	// 1: fully rely on xact.IsIdle() logic with no extra checks whatsoever
	// 2: one additional IsIdle() call after MinPollTime
	// 3: two additional IsIdle() calls spaced at MinPollTime interval, and so on.
	NumConsecutiveIdle = 2
)

type (
	// ArgsMsg identifies an xaction (or a group of xactions) to act upon:
	// either xaction ID or Kind must be specified
	// is getting passed via ActMsg.Value w/ MorphMarshal extraction
	ArgsMsg struct {
		ID   string // xaction UUID
		Kind string // xaction kind _or_ name (see `xact.Table`)

		// optional parameters
		DaemonID    string        // node that runs this xaction
		Bck         cmn.Bck       // bucket
		Buckets     []cmn.Bck     // list of buckets (e.g., copy-bucket, lru-evict, etc.)
		Timeout     time.Duration // max time to wait
		Force       bool          // force
		OnlyRunning bool          // only for running xactions
	}

	// QueryMsg is a simplified JSON-tagged version of the above;
	// it has no Timeout and no Force, and its OnlyRunning is a pointer
	// (`(*QueryMsg).String` in this file treats nil the same as false)
	QueryMsg struct {
		OnlyRunning *bool     `json:"show_active"`
		Bck         cmn.Bck   `json:"bck"`
		ID          string    `json:"id"`
		Kind        string    `json:"kind"`
		DaemonID    string    `json:"node,omitempty"`
		Buckets     []cmn.Bck `json:"buckets,omitempty"`
	}

	// MultiSnap groups xaction snapshots per target
	// primarily: `api.QueryXactionSnaps`
	MultiSnap map[string][]*core.Snap // by target ID (tid)
)

type (
	// Descriptor holds the static properties of one xaction kind;
	// instances are the values of the `Table` map declared below
	Descriptor struct {
		DisplayName string          // as implied
		Access      apc.AccessAttrs // access permissions (see: apc.Access*)
		Scope       int             // ScopeG (global), etc. - the enum above
		Startable   bool            // true if user can start this xaction (e.g., via `api.StartXaction`)
		Metasync    bool            // true if this xaction changes (and metasyncs) cluster metadata
		RefreshCap  bool            // refresh capacity stats upon completion

		// see xreg for "limited coexistence"
		Rebalance      bool // moves data between nodes
		Resilver       bool // moves data between mountpaths
		ConflictRebRes bool // conflicts with rebalance/resilver
		AbortRebRes    bool // gets aborted upon rebalance/resilver - currently, all `ext`-ensions

		// xaction has an intermediate `idle` state whereby it "idles" between requests
		// (see related: xact/demand.go)
		Idles bool

		// xaction returns extended xaction-specific stats
		// (see related: `Snap.Ext` in core/xaction.go)
		ExtendedStats bool
	}
)

////////////////
// Descriptor //
////////////////

// `xact.Table` is a static, public, and global Kind=>[Xaction Descriptor] map that contains
// xaction kinds and static properties, such as `Startable`, `Scope`, etc.
// In particular, "startability" is narrowly defined as ability to start xaction
// via `api.StartXaction`
// (whereby copying bucket, for instance, requires a separate `api.CopyBucket`, etc.)
116 var Table = map[string]Descriptor{ 117 // bucket-less xactions that will typically have a 'cluster' scope (with resilver being a notable exception) 118 apc.ActElection: {DisplayName: "elect-primary", Scope: ScopeG, Startable: false}, 119 apc.ActRebalance: {Scope: ScopeG, Startable: true, Metasync: true, Rebalance: true}, 120 121 apc.ActETLInline: {Scope: ScopeG, Startable: false, AbortRebRes: true}, 122 123 // (one bucket) | (all buckets) 124 apc.ActLRU: {DisplayName: "lru-eviction", Scope: ScopeGB, Startable: true}, 125 apc.ActStoreCleanup: {DisplayName: "cleanup", Scope: ScopeGB, Startable: true}, 126 apc.ActSummaryBck: { 127 DisplayName: "summary", 128 Scope: ScopeGB, 129 Access: apc.AceObjLIST | apc.AceBckHEAD, 130 Startable: false, 131 Metasync: false, 132 }, 133 134 // single target (node) 135 apc.ActResilver: {Scope: ScopeT, Startable: true, Resilver: true}, 136 137 // on-demand EC and n-way replication 138 // (non-startable, triggered by PUT => erasure-coded or mirrored bucket) 139 apc.ActECGet: {Scope: ScopeB, Startable: false, Idles: true, ExtendedStats: true}, 140 apc.ActECPut: {Scope: ScopeB, Startable: false, RefreshCap: true, Idles: true, ExtendedStats: true}, 141 apc.ActECRespond: {Scope: ScopeB, Startable: false, Idles: true}, 142 apc.ActPutCopies: {Scope: ScopeB, Startable: false, RefreshCap: true, Idles: true}, 143 144 // 145 // on-demand multi-object (consider setting ConflictRebRes = true) 146 // 147 apc.ActArchive: {Scope: ScopeB, Access: apc.AccessRW, Startable: false, RefreshCap: true, Idles: true}, 148 apc.ActCopyObjects: { 149 DisplayName: "copy-objects", 150 Scope: ScopeB, 151 Access: apc.AccessRW, // apc.AceCreateBucket is checked as well but only if ais://dst doesn't exist 152 Startable: false, 153 RefreshCap: true, 154 Idles: true, 155 }, 156 apc.ActETLObjects: { 157 DisplayName: "etl-objects", 158 Scope: ScopeB, 159 Access: apc.AccessRW, // ditto 160 Startable: false, 161 RefreshCap: true, 162 Idles: true, 163 AbortRebRes: true, 
164 }, 165 166 apc.ActBlobDl: {Access: apc.AccessRW, Scope: ScopeB, Startable: true, AbortRebRes: true, RefreshCap: true}, 167 168 apc.ActDownload: {Access: apc.AccessRW, Scope: ScopeG, Startable: false, Idles: true, AbortRebRes: true}, 169 170 // in its own class 171 apc.ActDsort: { 172 DisplayName: "dsort", 173 Scope: ScopeB, 174 Access: apc.AccessRW, 175 Startable: false, 176 RefreshCap: true, 177 ConflictRebRes: true, 178 ExtendedStats: true, 179 AbortRebRes: true, 180 }, 181 182 // multi-object 183 apc.ActPromote: { 184 DisplayName: "promote-files", 185 Scope: ScopeB, 186 Access: apc.AcePromote, 187 Startable: false, 188 RefreshCap: true, 189 }, 190 apc.ActEvictObjects: { 191 DisplayName: "evict-objects", 192 Scope: ScopeB, 193 Access: apc.AceObjDELETE, 194 Startable: false, 195 RefreshCap: true, 196 }, 197 apc.ActDeleteObjects: { 198 DisplayName: "delete-objects", 199 Scope: ScopeB, 200 Access: apc.AceObjDELETE, 201 Startable: false, 202 RefreshCap: true, 203 }, 204 apc.ActPrefetchObjects: { 205 DisplayName: "prefetch-objects", 206 Scope: ScopeB, 207 Access: apc.AccessRW, 208 Startable: true, 209 RefreshCap: true, 210 }, 211 212 // entire bucket (storage svcs) 213 apc.ActECEncode: { 214 DisplayName: "ec-bucket", 215 Scope: ScopeB, 216 Access: apc.AccessRW, 217 Startable: true, 218 Metasync: true, 219 RefreshCap: true, 220 ConflictRebRes: true, 221 }, 222 apc.ActMakeNCopies: { 223 DisplayName: "mirror", 224 Scope: ScopeB, 225 Access: apc.AccessRW, 226 Startable: true, 227 Metasync: true, 228 RefreshCap: true, 229 }, 230 apc.ActMoveBck: { 231 DisplayName: "rename-bucket", 232 Scope: ScopeB, 233 Access: apc.AceMoveBucket, 234 Startable: false, // executing this one cannot be done via `api.StartXaction` 235 Metasync: true, 236 Rebalance: true, 237 ConflictRebRes: true, 238 }, 239 apc.ActCopyBck: { 240 DisplayName: "copy-bucket", 241 Scope: ScopeB, 242 Access: apc.AccessRW, // apc.AceCreateBucket ditto 243 Startable: false, // ditto 244 Metasync: true, 245 
RefreshCap: true, 246 ConflictRebRes: true, 247 }, 248 apc.ActETLBck: { 249 DisplayName: "etl-bucket", 250 Scope: ScopeB, 251 Access: apc.AccessRW, // ditto 252 Startable: false, // ditto 253 Metasync: true, 254 RefreshCap: true, 255 AbortRebRes: true, 256 }, 257 258 apc.ActList: {Scope: ScopeB, Access: apc.AceObjLIST, Startable: false, Metasync: false, Idles: true}, 259 260 // cache management, internal usage 261 apc.ActLoadLomCache: {DisplayName: "warm-up-metadata", Scope: ScopeB, Startable: true}, 262 apc.ActInvalListCache: {Scope: ScopeB, Access: apc.AceObjLIST, Startable: false}, 263 } 264 265 func IsValidKind(kind string) bool { 266 _, ok := Table[kind] 267 return ok 268 } 269 270 func GetDescriptor(kindOrName string) (string, Descriptor, error) { 271 kind, dtor := getDtor(kindOrName) 272 if dtor == nil { 273 return "", Descriptor{}, fmt.Errorf("not found xaction %q", kindOrName) 274 } 275 return kind, *dtor, nil 276 } 277 278 func GetKindName(kindOrName string) (kind, name string) { 279 if kindOrName == "" { 280 return 281 } 282 var dtor *Descriptor 283 kind, dtor = getDtor(kindOrName) 284 if dtor == nil { 285 return 286 } 287 name = dtor.DisplayName 288 if name == "" { 289 name = kind 290 } 291 return 292 } 293 294 func Cname(kind, uuid string) string { return kind + LeftID + uuid + RightID } 295 296 func ParseCname(cname string) (xactKind, xactID string, _ error) { 297 const efmt = "invalid name %q" 298 l := len(cname) 299 if l == 0 || cname[l-1] != RightID[0] { 300 return "", "", fmt.Errorf(efmt, cname) 301 } 302 i := strings.IndexByte(cname, LeftID[0]) 303 if i < 0 { 304 return "", "", fmt.Errorf(efmt, cname) 305 } 306 xactKind, xactID = cname[:i], cname[i+1:l-1] 307 return xactKind, xactID, nil 308 } 309 310 func IdlesBeforeFinishing(kindOrName string) bool { 311 _, dtor := getDtor(kindOrName) 312 debug.Assert(dtor != nil) 313 return dtor.Idles 314 } 315 316 func ListDisplayNames(onlyStartable bool) (names []string) { 317 names = make([]string, 0, 
len(Table)) 318 for kind, dtor := range Table { 319 if onlyStartable && !dtor.Startable { 320 continue 321 } 322 if dtor.DisplayName != "" { 323 names = append(names, dtor.DisplayName) 324 } else { 325 names = append(names, kind) 326 } 327 } 328 sort.Strings(names) 329 return 330 } 331 332 func IsSameScope(kindOrName string, scs ...int) bool { 333 _, dtor := getDtor(kindOrName) 334 if dtor == nil { 335 return false 336 } 337 scope, scope2 := scs[0], 0 338 if len(scs) > 1 { 339 scope2 = scs[1] 340 } 341 return dtor.Scope == scope || dtor.Scope == scope2 342 } 343 344 func getDtor(kindOrName string) (string, *Descriptor) { 345 if dtor, ok := Table[kindOrName]; ok { 346 return kindOrName, &dtor 347 } 348 for kind, dtor := range Table { 349 if dtor.DisplayName == kindOrName { 350 return kind, &dtor 351 } 352 } 353 return "", nil 354 } 355 356 ///////////// 357 // ArgsMsg // 358 ///////////// 359 360 func (args *ArgsMsg) String() (s string) { 361 if args.ID == "" { 362 s = "x-" + args.Kind 363 } else { 364 s = fmt.Sprintf("x-%s[%s]", args.Kind, args.ID) 365 } 366 if !args.Bck.IsEmpty() { 367 s += "-" + args.Bck.String() 368 } 369 if args.Timeout > 0 { 370 s += "-" + args.Timeout.String() 371 } 372 if args.DaemonID != "" { 373 s += "-node[" + args.DaemonID + "]" 374 } 375 return 376 } 377 378 ////////////// 379 // QueryMsg // 380 ////////////// 381 382 func (msg *QueryMsg) String() (s string) { 383 if msg.ID == "" { 384 s = "x-" + msg.Kind 385 } else { 386 s = fmt.Sprintf("x-%s[%s]", msg.Kind, msg.ID) 387 } 388 if !msg.Bck.IsEmpty() { 389 s += "-" + msg.Bck.String() 390 } 391 if msg.DaemonID != "" { 392 s += "-node[" + msg.DaemonID + "]" 393 } 394 if msg.OnlyRunning != nil && *msg.OnlyRunning { 395 s += "-only-running" 396 } 397 return 398 } 399 400 /////////////// 401 // MultiSnap // 402 /////////////// 403 404 // NOTE: when xaction UUID is not specified: require the same kind _and_ 405 // a single running uuid (otherwise, IsAborted() et al. 
can only produce ambiguous results) 406 func (xs MultiSnap) checkEmptyID(xid string) error { 407 var kind, uuid string 408 if xid != "" { 409 debug.Assert(IsValidUUID(xid), xid) 410 return nil 411 } 412 for _, snaps := range xs { 413 for _, xsnap := range snaps { 414 if kind == "" { 415 kind = xsnap.Kind 416 } else if kind != xsnap.Kind { 417 return fmt.Errorf("invalid multi-snap Kind: %q vs %q", kind, xsnap.Kind) 418 } 419 if xsnap.Running() { 420 if uuid == "" { 421 uuid = xsnap.ID 422 } else if uuid != xsnap.ID { 423 return fmt.Errorf("invalid multi-snap UUID: %q vs %q", uuid, xsnap.ID) 424 } 425 } 426 } 427 } 428 return nil 429 } 430 431 func (xs MultiSnap) GetUUIDs() []string { 432 uuids := make(cos.StrSet, 2) 433 for _, snaps := range xs { 434 for _, xsnap := range snaps { 435 uuids[xsnap.ID] = struct{}{} 436 } 437 } 438 return uuids.ToSlice() 439 } 440 441 func (xs MultiSnap) RunningTarget(xid string) (string /*tid*/, *core.Snap, error) { 442 if err := xs.checkEmptyID(xid); err != nil { 443 return "", nil, err 444 } 445 for tid, snaps := range xs { 446 for _, xsnap := range snaps { 447 if (xid == xsnap.ID || xid == "") && xsnap.Running() { 448 return tid, xsnap, nil 449 } 450 } 451 } 452 return "", nil, nil 453 } 454 455 func (xs MultiSnap) IsAborted(xid string) (bool, error) { 456 if err := xs.checkEmptyID(xid); err != nil { 457 return false, err 458 } 459 for _, snaps := range xs { 460 for _, xsnap := range snaps { 461 if (xid == xsnap.ID || xid == "") && xsnap.IsAborted() { 462 return true, nil 463 } 464 } 465 } 466 return false, nil 467 } 468 469 // (all targets, all xactions) 470 func (xs MultiSnap) IsIdle(xid string) (aborted, running, notstarted bool) { 471 if xid != "" { 472 debug.Assert(IsValidUUID(xid), xid) 473 return xs._get(xid) 474 } 475 uuids := xs.GetUUIDs() 476 for _, xid = range uuids { 477 a, r, ns := xs._get(xid) 478 aborted = aborted || a 479 notstarted = notstarted || ns 480 running = running || r 481 } 482 return aborted, running, 
notstarted 483 } 484 485 // (all targets, given xaction) 486 func (xs MultiSnap) _get(xid string) (aborted, running, notstarted bool) { 487 var nt, nr, ns, nf int 488 for _, snaps := range xs { 489 nt++ 490 for _, xsnap := range snaps { 491 if xid != xsnap.ID { 492 continue 493 } 494 nf++ 495 // (one target, one xaction) 496 switch { 497 case xsnap.IsAborted(): 498 return true, false, false 499 case !xsnap.Started(): 500 ns++ 501 case !xsnap.IsIdle(): 502 nr++ 503 } 504 break 505 } 506 } 507 running = nr > 0 508 notstarted = ns > 0 || nf == 0 509 return 510 } 511 512 func (xs MultiSnap) ObjCounts(xid string) (locObjs, outObjs, inObjs int64) { 513 if xid == "" { 514 uuids := xs.GetUUIDs() 515 debug.Assert(len(uuids) == 1, uuids) 516 xid = uuids[0] 517 } 518 for _, snaps := range xs { 519 for _, xsnap := range snaps { 520 if xid == xsnap.ID { 521 locObjs += xsnap.Stats.Objs 522 outObjs += xsnap.Stats.OutObjs 523 inObjs += xsnap.Stats.InObjs 524 } 525 } 526 } 527 return 528 } 529 530 func (xs MultiSnap) ByteCounts(xid string) (locBytes, outBytes, inBytes int64) { 531 if xid == "" { 532 uuids := xs.GetUUIDs() 533 debug.Assert(len(uuids) == 1, uuids) 534 xid = uuids[0] 535 } 536 for _, snaps := range xs { 537 for _, xsnap := range snaps { 538 if xid == xsnap.ID { 539 locBytes += xsnap.Stats.Bytes 540 outBytes += xsnap.Stats.OutBytes 541 inBytes += xsnap.Stats.InBytes 542 } 543 } 544 } 545 return 546 } 547 548 func (xs MultiSnap) TotalRunningTime(xid string) (time.Duration, error) { 549 debug.Assert(IsValidUUID(xid), xid) 550 var ( 551 start, end time.Time 552 found, running bool 553 ) 554 for _, snaps := range xs { 555 for _, xsnap := range snaps { 556 if xid == xsnap.ID { 557 found = true 558 running = running || xsnap.Running() 559 if !xsnap.StartTime.IsZero() { 560 if start.IsZero() || xsnap.StartTime.Before(start) { 561 start = xsnap.StartTime 562 } 563 } 564 if !xsnap.EndTime.IsZero() && xsnap.EndTime.After(end) { 565 end = xsnap.EndTime 566 } 567 } 568 } 569 } 570 
if !found { 571 return 0, errors.New("xaction [" + xid + "] not found") 572 } 573 if running { 574 end = time.Now() 575 } 576 return end.Sub(start), nil 577 }