// github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/space/lru.go

// Package space provides storage cleanup and eviction functionality (the latter based on the
// least recently used cache replacement). It also serves as a built-in garbage-collection
// mechanism for orphaned workfiles.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package space

import (
	"container/heap"
	"fmt"
	"sort"
	"sync"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/fs/mpather"
	"github.com/NVIDIA/aistore/ios"
	"github.com/NVIDIA/aistore/stats"
	"github.com/NVIDIA/aistore/xact"
	"github.com/NVIDIA/aistore/xact/xreg"
)

// LRU-driven eviction is based on configurable watermarks: config.Space.LowWM and
// config.Space.HighWM (section "space" in the cluster config).
//
// When and if the high watermark is exceeded, the AIS target starts gradually evicting
// objects from its stable storage: oldest first, access-time wise.
//
// LRU is implemented as an eXtended action (xaction, see xact/README.md) that gets
// triggered when/if used local capacity exceeds the high watermark (config.Space.HighWM).
// LRU then runs automatically. To reduce its impact on the live workload, LRU throttles
// itself in accordance with the current storage target's utilization (see xaction_throttle.go).
//
// There's only one API that this module provides to the rest of the code:
//   - RunLRU - to initiate a new LRU extended action on the local target
//
// All other methods are private to this module and are used only internally.

// tunables
const (
	minEvictThresh = 10 * cos.MiB  // to run or not to run
	capCheckThresh = 256 * cos.MiB // capacity checking threshold (in re: periodic throttle)
)

type (
	IniLRU struct {
		Xaction             *XactLRU
		Config              *cmn.Config
		StatsT              stats.Tracker
		Buckets             []cmn.Bck // list of buckets to run LRU
		GetFSUsedPercentage func(path string) (usedPercentage int64, ok bool)
		GetFSStats          func(path string) (blocks, bavail uint64, bsize int64, err error)
		WG                  *sync.WaitGroup
		Force               bool // ignore LRU prop when set to true
	}
	XactLRU struct {
		xact.Base
	}
)
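
// For illustration only - a hedged, self-contained restatement of the watermark
// arithmetic that evictSize (below) performs per mountpath: once used capacity
// exceeds HighWM, the amount to free is whatever it takes to get back down to
// LowWM. The name and signature are illustrative, not part of the package API.
func exampleBytesToFree(blocks, bavail uint64, bsize, lwm, hwm int64) int64 {
	used := blocks - bavail        // blocks in use
	usedPct := used * 100 / blocks // used capacity, in percent
	if usedPct < uint64(hwm) {
		return 0 // below the high watermark - nothing to evict
	}
	lwmBlocks := blocks * uint64(lwm) / 100
	return int64(used-lwmBlocks) * bsize // evict down to the low watermark
}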
// private
type (
	// minHeap keeps LOMs sorted by access time, with the oldest on top of the heap.
	minHeap []*core.LOM

	// parent (contains mpath joggers)
	lruP struct {
		wg      sync.WaitGroup
		joggers map[string]*lruJ
		ini     IniLRU
	}

	// lruJ represents a single LRU context and a single /jogger/
	// that traverses and evicts a single given mountpath.
	lruJ struct {
		// runtime
		curSize   int64
		totalSize int64 // size to free up: (used size) - (lowWM size)
		newest    int64
		heap      *minHeap
		bck       cmn.Bck
		now       int64
		// init-time
		p       *lruP
		ini     *IniLRU
		stopCh  chan struct{}
		joggers map[string]*lruJ
		mi      *fs.Mountpath
		config  *cmn.Config
		// runtime
		throttle    bool
		allowDelObj bool
	}
	lruFactory struct {
		xreg.RenewBase
		xctn *XactLRU
	}
	TestFactory = lruFactory // unit tests only
)

// interface guard
var (
	_ xreg.Renewable = (*lruFactory)(nil)
	_ core.Xact      = (*XactLRU)(nil)
)

////////////////
// lruFactory //
////////////////

func (*lruFactory) New(args xreg.Args, _ *meta.Bck) xreg.Renewable {
	return &lruFactory{RenewBase: xreg.RenewBase{Args: args}}
}

func (p *lruFactory) Start() error {
	p.xctn = &XactLRU{}
	p.xctn.InitBase(p.UUID(), apc.ActLRU, nil)
	return nil
}

func (*lruFactory) Kind() string     { return apc.ActLRU }
func (p *lruFactory) Get() core.Xact { return p.xctn }

func (*lruFactory) WhenPrevIsRunning(prevEntry xreg.Renewable) (wpr xreg.WPR, err error) {
	return xreg.WprUse, cmn.NewErrXactUsePrev(prevEntry.Get().String())
}

func RunLRU(ini *IniLRU) {
	var (
		xlru           = ini.Xaction
		config         = cmn.GCO.Get()
		availablePaths = fs.GetAvail()
		num            = len(availablePaths)
		joggers        = make(map[string]*lruJ, num)
		parent         = &lruP{joggers: joggers, ini: *ini}
	)
	defer func() {
		if ini.WG != nil {
			ini.WG.Done()
		}
	}()
	if num == 0 {
		xlru.AddErr(cmn.ErrNoMountpaths, 0)
		xlru.Finish()
		return
	}
	for mpath, mi := range availablePaths {
		h := make(minHeap, 0, 64)
		joggers[mpath] = &lruJ{
			heap:   &h,
			stopCh: make(chan struct{}, 1),
			mi:     mi,
			config: config,
			ini:    &parent.ini,
			p:      parent,
		}
	}
	providers := apc.Providers.ToSlice()

	for _, j := range joggers {
		parent.wg.Add(1)
		j.joggers = joggers
		go j.run(providers)
	}
	cs := fs.Cap()
	nlog.Infof("%s started, dont-evict-time %v, %s", xlru, config.LRU.DontEvictTime, cs.String())
	if ini.WG != nil {
		ini.WG.Done()
		ini.WG = nil
	}
	parent.wg.Wait()

	for _, j := range joggers {
		j.stop()
	}
	xlru.Finish()
	cs = fs.Cap()
	nlog.Infof("%s finished, %s", xlru, cs.String())
}

func (*XactLRU) Run(*sync.WaitGroup) { debug.Assert(false) }

func (r *XactLRU) Snap() (snap *core.Snap) {
	snap = &core.Snap{}
	r.ToSnap(snap)

	snap.IdleX = r.IsIdle()
	return
}

//////////////////////
// mountpath jogger //
//////////////////////

func (j *lruJ) String() string {
	return fmt.Sprintf("%s: jog-%s", j.ini.Xaction, j.mi)
}

func (j *lruJ) stop() { j.stopCh <- struct{}{} }

func (j *lruJ) run(providers []string) {
	var err error
	defer j.p.wg.Done()
	// compute the size (in bytes) to free up
	if err = j.evictSize(); err != nil {
		goto ex
	}
	if j.totalSize < minEvictThresh {
		nlog.Infof("%s: used cap below threshold, nothing to do", j)
		return
	}
	if len(j.ini.Buckets) != 0 {
		nlog.Infof("%s: freeing-up %s", j, cos.ToSizeIEC(j.totalSize, 2))
		err = j.jogBcks(j.ini.Buckets, j.ini.Force)
	} else {
		err = j.jog(providers)
	}
ex:
	if err == nil || cmn.IsErrBucketNought(err) || cmn.IsErrObjNought(err) {
		return
	}
	nlog.Errorln(j.String()+":", "exited with err:", err)
}
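
// For illustration only: a minimal sketch of how a caller might drive RunLRU.
// The renewal entry point and the ios helpers named here are assumptions -
// the actual wiring lives in the target's housekeeping code:
//
//	wg := &sync.WaitGroup{}
//	wg.Add(1)
//	go RunLRU(&IniLRU{
//		Xaction:             xlru,          // previously renewed *XactLRU (assumption)
//		Config:              cmn.GCO.Get(),
//		StatsT:              statsTracker,  // placeholder stats.Tracker
//		GetFSUsedPercentage: ios.GetFSUsedPercentage, // assumption
//		GetFSStats:          ios.GetFSStats,          // assumption
//		WG:                  wg,
//	})
//	wg.Wait() // RunLRU signals WG once all mountpath joggers have been started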
func (j *lruJ) jog(providers []string) (err error) {
	nlog.Infoln(j.String()+":", "freeing-up", cos.ToSizeIEC(j.totalSize, 2))
	for _, provider := range providers { // for each provider (NOTE: ordering is random)
		var (
			bcks []cmn.Bck
			opts = fs.WalkOpts{
				Mi:  j.mi,
				Bck: cmn.Bck{Provider: provider, Ns: cmn.NsGlobal},
			}
		)
		if bcks, err = fs.AllMpathBcks(&opts); err != nil {
			return
		}
		if err = j.jogBcks(bcks, false); err != nil {
			return
		}
	}
	return
}

func (j *lruJ) jogBcks(bcks []cmn.Bck, force bool) (err error) {
	if len(bcks) == 0 {
		return
	}
	if len(bcks) > 1 {
		j.sortBsize(bcks)
	}
	for _, bck := range bcks { // for each bucket under a given provider
		var size int64
		j.bck = bck
		if j.allowDelObj, err = j.allow(); err != nil {
			nlog.Errorf("%s: %v - skipping %s (Hint: run 'ais storage cleanup' to clean up)", j, err, bck)
			err = nil
			continue
		}
		j.allowDelObj = j.allowDelObj || force
		if size, err = j.jogBck(); err != nil {
			return
		}
		if size < cos.KiB {
			continue
		}
		// recompute size-to-evict
		if err = j.evictSize(); err != nil {
			return
		}
		if j.totalSize < cos.KiB {
			return
		}
	}
	return
}

func (j *lruJ) jogBck() (size int64, err error) {
	// 1. init per-bucket min-heap (and reuse the slice)
	h := (*j.heap)[:0]
	j.heap = &h
	heap.Init(j.heap)

	// 2. collect
	opts := &fs.WalkOpts{
		Mi:       j.mi,
		Bck:      j.bck,
		CTs:      []string{fs.ObjectType},
		Callback: j.walk,
		Sorted:   false,
	}
	j.now = time.Now().UnixNano()
	if err = fs.Walk(opts); err != nil {
		return
	}
	// 3. evict
	size, err = j.evict()
	return
}

func (j *lruJ) visitLOM(parsedFQN *fs.ParsedFQN) {
	if !j.allowDelObj {
		return
	}
	lom := core.AllocLOM(parsedFQN.ObjName)
	if pushed := j._visit(lom); !pushed {
		core.FreeLOM(lom)
	}
}

func (j *lruJ) _visit(lom *core.LOM) (pushed bool) {
	if err := lom.InitBck(&j.bck); err != nil {
		return
	}
	if err := lom.Load(false /*cache it*/, false /*locked*/); err != nil {
		return
	}
	if lom.AtimeUnix()+int64(j.config.LRU.DontEvictTime) > j.now {
		return
	}
	if lom.HasCopies() && lom.IsCopy() {
		return
	}
	// do nothing if the heap's curSize >= totalSize and
	// the file is more recent than the heap's newest
	if j.curSize >= j.totalSize && lom.AtimeUnix() > j.newest {
		return
	}
	heap.Push(j.heap, lom)
	j.curSize += lom.SizeBytes()
	if lom.AtimeUnix() > j.newest {
		j.newest = lom.AtimeUnix()
	}
	return true
}

func (j *lruJ) walk(fqn string, de fs.DirEntry) error {
	var parsed fs.ParsedFQN
	if de.IsDir() {
		return nil
	}
	if err := j.yieldTerm(); err != nil {
		return err
	}
	if _, err := core.ResolveFQN(fqn, &parsed); err != nil {
		return nil
	}
	if parsed.ContentType == fs.ObjectType {
		j.visitLOM(&parsed)
	}

	return nil
}
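
// For illustration only: the pruning logic in _visit above. Once the heap
// already holds at least totalSize bytes, any candidate newer than the
// newest object in the heap could only be evicted after everything already
// heaped - i.e., never within this pass - so it is skipped. A worked example
// with hypothetical numbers:
//
//	totalSize = 100MiB (to free), curSize = 120MiB (already heaped)
//	newest    = atime 10:00 (most recent object in the heap)
//
//	candidate, atime 10:05 => skipped (newer than newest; heap is full enough)
//	candidate, atime 09:00 => pushed  (older objects improve eviction order)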
func (j *lruJ) evict() (size int64, err error) {
	var (
		fevicted, bevicted int64
		capCheck           int64
		h                  = j.heap
		xlru               = j.ini.Xaction
	)

	// evict (sic!) and house-keep
	for h.Len() > 0 && j.totalSize > 0 {
		lom := heap.Pop(h).(*core.LOM)
		if !j.evictObj(lom) {
			core.FreeLOM(lom)
			continue
		}
		objSize := lom.SizeBytes(true /*not loaded*/)
		core.FreeLOM(lom)
		bevicted += objSize
		size += objSize
		fevicted++
		if capCheck, err = j.postRemove(capCheck, objSize); err != nil {
			return
		}
	}
	j.ini.StatsT.Add(stats.LruEvictSize, bevicted)
	j.ini.StatsT.Add(stats.LruEvictCount, fevicted)
	xlru.ObjsAdd(int(fevicted), bevicted)
	return
}

func (j *lruJ) postRemove(prev, size int64) (capCheck int64, err error) {
	j.totalSize -= size
	capCheck = prev + size
	if err = j.yieldTerm(); err != nil {
		return
	}
	if capCheck < capCheckThresh {
		return
	}
	// init, recompute, and throttle - once per capCheckThresh
	capCheck = 0
	j.throttle = false
	j.allowDelObj, _ = j.allow()
	j.config = cmn.GCO.Get()
	j.now = time.Now().UnixNano()
	usedPct, ok := j.ini.GetFSUsedPercentage(j.mi.Path)
	if ok && usedPct < j.config.Space.HighWM {
		err = j._throttle(usedPct)
	}
	return
}

func (j *lruJ) _throttle(usedPct int64) (err error) {
	if j.mi.IsIdle(j.config) {
		return
	}
	// throttle self
	ratioCapacity := cos.Ratio(j.config.Space.HighWM, j.config.Space.LowWM, usedPct)
	curr := fs.GetMpathUtil(j.mi.Path)
	ratioUtilization := cos.Ratio(j.config.Disk.DiskUtilHighWM, j.config.Disk.DiskUtilLowWM, curr)
	if ratioUtilization > ratioCapacity {
		if usedPct < (j.config.Space.LowWM+j.config.Space.HighWM)/2 {
			j.throttle = true
		}
		time.Sleep(mpather.ThrottleMaxDur)
		err = j.yieldTerm()
	}
	return
}

// remove local copies that "belong" to different LRU joggers
// (space accounting may be temporarily imprecise)
func (j *lruJ) evictObj(lom *core.LOM) bool {
	lom.Lock(true)
	err := lom.Remove()
	lom.Unlock(true)
	if err != nil {
		nlog.Errorf("%s: failed to evict %s: %v", j, lom, err)
		return false
	}
	if cmn.Rom.FastV(5, cos.SmoduleSpace) {
		nlog.Infof("%s: evicted %s, size=%d", j, lom, lom.SizeBytes(true /*not loaded*/))
	}
	return true
}

func (j *lruJ) evictSize() (err error) {
	lwm, hwm := j.config.Space.LowWM, j.config.Space.HighWM
	blocks, bavail, bsize, err := j.ini.GetFSStats(j.mi.Path)
	if err != nil {
		return err
	}
	used := blocks - bavail
	usedPct := used * 100 / blocks
	if usedPct < uint64(hwm) {
		return
	}
	lwmBlocks := blocks * uint64(lwm) / 100
	j.totalSize = int64(used-lwmBlocks) * bsize
	return
}

func (j *lruJ) yieldTerm() error {
	xlru := j.ini.Xaction
	select {
	case errCause := <-xlru.ChanAbort():
		return cmn.NewErrAborted(xlru.Name(), "", errCause)
	case <-j.stopCh:
		return cmn.NewErrAborted(xlru.Name(), "", nil)
	default:
		if j.throttle {
			time.Sleep(mpather.ThrottleMinDur)
		}
	}
	if xlru.Finished() {
		return cmn.NewErrAborted(xlru.Name(), "", nil)
	}
	return nil
}
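
// For illustration only - a hedged restatement of the self-throttling decision
// in _throttle above. Capacity "pressure" and disk-utilization "pressure" are
// each normalized against their respective [low, high] watermark intervals via
// cos.Ratio, then compared; utilization outpacing capacity means this jogger
// should slow down. E.g., assuming a linear 0-100 mapping: LowWM=75, HighWM=90,
// usedPct=80 gives ~33, while DiskUtilLowWM=20, DiskUtilHighWM=80, diskUtil=65
// gives 75 - and 75 > 33 means throttle. The function name is illustrative.
func exampleShouldThrottle(config *cmn.Config, usedPct, diskUtil int64) bool {
	ratioCapacity := cos.Ratio(config.Space.HighWM, config.Space.LowWM, usedPct)
	ratioUtilization := cos.Ratio(config.Disk.DiskUtilHighWM, config.Disk.DiskUtilLowWM, diskUtil)
	return ratioUtilization > ratioCapacity
}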
// sort buckets by size, largest first
func (j *lruJ) sortBsize(bcks []cmn.Bck) {
	sized := make([]struct {
		b cmn.Bck
		v uint64
	}, len(bcks))
	for i := range bcks {
		path := j.mi.MakePathCT(&bcks[i], fs.ObjectType)
		sized[i].b = bcks[i]
		sized[i].v, _ = ios.DirSizeOnDisk(path, false /*withNonDirPrefix*/)
	}
	sort.Slice(sized, func(i, j int) bool {
		return sized[i].v > sized[j].v
	})
	for i := range bcks {
		bcks[i] = sized[i].b
	}
}

func (j *lruJ) allow() (ok bool, err error) {
	var (
		bowner = core.T.Bowner()
		b      = meta.CloneBck(&j.bck)
	)
	if err = b.Init(bowner); err != nil {
		return
	}
	ok = b.Props.LRU.Enabled && b.Allow(apc.AceObjDELETE) == nil
	return
}

//////////////
// min-heap //
//////////////

func (h minHeap) Len() int           { return len(h) }
func (h minHeap) Less(i, j int) bool { return h[i].Atime().Before(h[j].Atime()) }
func (h minHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
func (h *minHeap) Push(x any)        { *h = append(*h, x.(*core.LOM)) }
func (h *minHeap) Pop() any {
	old := *h
	n := len(old)
	fi := old[n-1]
	*h = old[0 : n-1]
	return fi
}
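
// For illustration only: minHeap implements container/heap's heap.Interface,
// so pushing LOMs and popping yields the least-recently-accessed object first.
// In this hedged sketch, lom1 and lom2 are hypothetical *core.LOM values whose
// atimes have already been loaded:
//
//	h := make(minHeap, 0, 8)
//	heap.Init(&h)
//	heap.Push(&h, lom1)                // atime 10:00
//	heap.Push(&h, lom2)                // atime 09:00
//	oldest := heap.Pop(&h).(*core.LOM) // lom2: the oldest atime wins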