
     1  // Package space provides storage cleanup and eviction functionality (the latter based on the
     2  // least recently used cache replacement). It also serves as a built-in garbage-collection
     3  // mechanism for orphaned workfiles.
     4  /*
     5   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     6   */
     7  package space
     9  import (
    10  	"container/heap"
    11  	"fmt"
    12  	"sort"
    13  	"sync"
    14  	"time"
    16  	""
    17  	""
    18  	""
    19  	""
    20  	""
    21  	""
    22  	""
    23  	""
    24  	""
    25  	""
    26  	""
    27  	""
    28  	""
    29  )
    31  // LRU-driven eviction is based on configurable watermarks: config.Space.LowWM and
    32  // config.Space.HighWM (section "space" in the cluster config).
    33  //
    34  // When and if exceeded, AIS target will start gradually evicting objects from its
    35  // stable storage: oldest first access-time wise.
    36  //
// LRU is implemented as an eXtended Action (xaction, see xact/) that gets
    38  // triggered when/if a used local capacity exceeds high watermark (config.Space.HighWM). LRU then
    39  // runs automatically. In order to reduce its impact on the live workload, LRU throttles itself
    40  // in accordance with the current storage-target's utilization (see xaction_throttle.go).
    41  //
    42  // There's only one API that this module provides to the rest of the code:
//   - RunLRU - to initiate a new LRU extended action on the local target
    44  // All other methods are private to this module and are used only internally.
// tunables
const (
	// minEvictThresh: a jogger whose computed size-to-evict is below this value
	// logs "nothing to do" and exits (see lruJ.run).
	minEvictThresh = 10 * cos.MiB  // to run or not to run
	// capCheckThresh: number of evicted bytes after which a jogger re-reads the
	// config, re-checks used capacity, and possibly throttles (see lruJ.postRemove).
	capCheckThresh = 256 * cos.MiB // capacity checking threshold (in re: periodic throttle)
)
type (
	// IniLRU carries everything RunLRU needs to execute one LRU run.
	IniLRU struct {
		// Xaction is the LRU xaction that gets finished (and checked for abort) by the run.
		Xaction             *XactLRU
		Config              *cmn.Config
		// StatsT receives eviction counters (stats.LruEvictSize, stats.LruEvictCount).
		StatsT              stats.Tracker
		// Buckets, when non-empty, restricts the run to these buckets;
		// otherwise joggers visit all buckets of every provider.
		Buckets             []cmn.Bck // list of buckets to run LRU
		// GetFSUsedPercentage is consulted periodically while evicting (see lruJ.postRemove).
		GetFSUsedPercentage func(path string) (usedPercentage int64, ok bool)
		// GetFSStats is used to compute the number of bytes to evict (see lruJ.evictSize).
		GetFSStats          func(path string) (blocks, bavail uint64, bsize int64, err error)
		// WG, when non-nil, is signaled (Done) by RunLRU.
		WG                  *sync.WaitGroup
		// Force applies only to the explicitly listed Buckets (see lruJ.run).
		Force               bool // Ignore LRU prop when set to be true.
	}
	// XactLRU is the LRU xaction; it embeds xact.Base and adds no state of its own.
	XactLRU struct {
		xact.Base
	}
)
// private
type (
	// minHeap keeps LOMs ordered by access time with the oldest on top of the heap.
	minHeap []*core.LOM
	// parent (contains mpath joggers)
	lruP struct {
		// wg is incremented once per jogger in RunLRU; each jogger calls Done when it exits.
		wg      sync.WaitGroup
		joggers map[string]*lruJ
		// ini is a copy of the caller's IniLRU taken in RunLRU.
		ini     IniLRU
	}
	// lruJ represents a single LRU context and a single /jogger/
	// that traverses and evicts a single given mountpath.
	lruJ struct {
		// runtime
		// curSize accumulates sizes of the LOMs pushed onto the heap (see _visit).
		curSize   int64
		totalSize int64 // difference between lowWM size and used size
		// newest is the most recent access time among the LOMs pushed so far.
		newest    int64
		heap      *minHeap
		// bck is the bucket currently being processed (set in jogBcks).
		bck       cmn.Bck
		// NOTE(review): presumably the "current time" (UnixNano) reference used for
		// the dont-evict-time check - confirm against jogBck/_visit/postRemove.
		now       int64
		// init-time
		p       *lruP
		ini     *IniLRU
		// stopCh is buffered (capacity 1); see stop() and yieldTerm().
		stopCh  chan struct{}
		// joggers references the map of all joggers (assigned in RunLRU).
		joggers map[string]*lruJ
		mi      *fs.Mountpath
		config  *cmn.Config
		// runtime
		// throttle, when set, makes yieldTerm sleep for mpather.ThrottleMinDur.
		throttle    bool
		// allowDelObj gates visitLOM: when false, objects of the current bucket are not collected.
		allowDelObj bool
	}
	// lruFactory implements xreg.Renewable for the LRU xaction.
	lruFactory struct {
		xreg.RenewBase
		xctn *XactLRU
	}
	TestFactory = lruFactory // unit tests only
)
// interface guard: compile-time checks that the types below
// satisfy the respective interfaces
var (
	_ xreg.Renewable = (*lruFactory)(nil)
	_ core.Xact      = (*XactLRU)(nil)
)
   114  ////////////////
   115  // lruFactory //
   116  ////////////////
   118  func (*lruFactory) New(args xreg.Args, _ *meta.Bck) xreg.Renewable {
   119  	return &lruFactory{RenewBase: xreg.RenewBase{Args: args}}
   120  }
   122  func (p *lruFactory) Start() error {
   123  	p.xctn = &XactLRU{}
   124  	p.xctn.InitBase(p.UUID(), apc.ActLRU, nil)
   125  	return nil
   126  }
   128  func (*lruFactory) Kind() string     { return apc.ActLRU }
   129  func (p *lruFactory) Get() core.Xact { return p.xctn }
   131  func (*lruFactory) WhenPrevIsRunning(prevEntry xreg.Renewable) (wpr xreg.WPR, err error) {
   132  	return xreg.WprUse, cmn.NewErrXactUsePrev(prevEntry.Get().String())
   133  }
   135  func RunLRU(ini *IniLRU) {
   136  	var (
   137  		xlru           = ini.Xaction
   138  		config         = cmn.GCO.Get()
   139  		availablePaths = fs.GetAvail()
   140  		num            = len(availablePaths)
   141  		joggers        = make(map[string]*lruJ, num)
   142  		parent         = &lruP{joggers: joggers, ini: *ini}
   143  	)
   144  	defer func() {
   145  		if ini.WG != nil {
   146  			ini.WG.Done()
   147  		}
   148  	}()
   149  	if num == 0 {
   150  		xlru.AddErr(cmn.ErrNoMountpaths, 0)
   151  		xlru.Finish()
   152  		return
   153  	}
   154  	for mpath, mi := range availablePaths {
   155  		h := make(minHeap, 0, 64)
   156  		joggers[mpath] = &lruJ{
   157  			heap:   &h,
   158  			stopCh: make(chan struct{}, 1),
   159  			mi:     mi,
   160  			config: config,
   161  			ini:    &parent.ini,
   162  			p:      parent,
   163  		}
   164  	}
   165  	providers := apc.Providers.ToSlice()
   167  	for _, j := range joggers {
   168  		parent.wg.Add(1)
   169  		j.joggers = joggers
   170  		go
   171  	}
   172  	cs := fs.Cap()
   173  	nlog.Infof("%s started, dont-evict-time %v, %s", xlru, config.LRU.DontEvictTime, cs.String())
   174  	if ini.WG != nil {
   175  		ini.WG.Done()
   176  		ini.WG = nil
   177  	}
   178  	parent.wg.Wait()
   180  	for _, j := range joggers {
   181  		j.stop()
   182  	}
   183  	xlru.Finish()
   184  	cs = fs.Cap()
   185  	nlog.Infof("%s finished, %s", xlru, cs.String())
   186  }
   188  func (*XactLRU) Run(*sync.WaitGroup) { debug.Assert(false) }
   190  func (r *XactLRU) Snap() (snap *core.Snap) {
   191  	snap = &core.Snap{}
   192  	r.ToSnap(snap)
   194  	snap.IdleX = r.IsIdle()
   195  	return
   196  }
   198  //////////////////////
   199  // mountpath jogger //
   200  //////////////////////
   202  func (j *lruJ) String() string {
   203  	return fmt.Sprintf("%s: jog-%s", j.ini.Xaction, j.mi)
   204  }
   206  func (j *lruJ) stop() { j.stopCh <- struct{}{} }
   208  func (j *lruJ) run(providers []string) {
   209  	var err error
   210  	defer j.p.wg.Done()
   211  	// compute the size (bytes) to free up
   212  	if err = j.evictSize(); err != nil {
   213  		goto ex
   214  	}
   215  	if j.totalSize < minEvictThresh {
   216  		nlog.Infof("%s: used cap below threshold, nothing to do", j)
   217  		return
   218  	}
   219  	if len(j.ini.Buckets) != 0 {
   220  		nlog.Infof("%s: freeing-up %s", j, cos.ToSizeIEC(j.totalSize, 2))
   221  		err = j.jogBcks(j.ini.Buckets, j.ini.Force)
   222  	} else {
   223  		err = j.jog(providers)
   224  	}
   225  ex:
   226  	if err == nil || cmn.IsErrBucketNought(err) || cmn.IsErrObjNought(err) {
   227  		return
   228  	}
   229  	nlog.Errorln(j.String()+":", "exited with err:", err)
   230  }
   232  func (j *lruJ) jog(providers []string) (err error) {
   233  	nlog.Infoln(j.String()+":", "freeing-up", cos.ToSizeIEC(j.totalSize, 2))
   234  	for _, provider := range providers { // for each provider (NOTE: ordering is random)
   235  		var (
   236  			bcks []cmn.Bck
   237  			opts = fs.WalkOpts{
   238  				Mi:  j.mi,
   239  				Bck: cmn.Bck{Provider: provider, Ns: cmn.NsGlobal},
   240  			}
   241  		)
   242  		if bcks, err = fs.AllMpathBcks(&opts); err != nil {
   243  			return
   244  		}
   245  		if err = j.jogBcks(bcks, false); err != nil {
   246  			return
   247  		}
   248  	}
   249  	return
   250  }
   252  func (j *lruJ) jogBcks(bcks []cmn.Bck, force bool) (err error) {
   253  	if len(bcks) == 0 {
   254  		return
   255  	}
   256  	if len(bcks) > 1 {
   257  		j.sortBsize(bcks)
   258  	}
   259  	for _, bck := range bcks { // for each bucket under a given provider
   260  		var size int64
   261  		j.bck = bck
   262  		if j.allowDelObj, err = j.allow(); err != nil {
   263  			nlog.Errorf("%s: %v - skipping %s (Hint: run 'ais storage cleanup' to cleanup)", j, err, bck)
   264  			err = nil
   265  			continue
   266  		}
   267  		j.allowDelObj = j.allowDelObj || force
   268  		if size, err = j.jogBck(); err != nil {
   269  			return
   270  		}
   271  		if size < cos.KiB {
   272  			continue
   273  		}
   274  		// recompute size-to-evict
   275  		if err = j.evictSize(); err != nil {
   276  			return
   277  		}
   278  		if j.totalSize < cos.KiB {
   279  			return
   280  		}
   281  	}
   282  	return
   283  }
   285  func (j *lruJ) jogBck() (size int64, err error) {
   286  	// 1. init per-bucket min-heap (and reuse the slice)
   287  	h := (*j.heap)[:0]
   288  	j.heap = &h
   289  	heap.Init(j.heap)
   291  	// 2. collect
   292  	opts := &fs.WalkOpts{
   293  		Mi:       j.mi,
   294  		Bck:      j.bck,
   295  		CTs:      []string{fs.ObjectType},
   296  		Callback: j.walk,
   297  		Sorted:   false,
   298  	}
   299 = time.Now().UnixNano()
   300  	if err = fs.Walk(opts); err != nil {
   301  		return
   302  	}
   303  	// 3. evict
   304  	size, err = j.evict()
   305  	return
   306  }
   308  func (j *lruJ) visitLOM(parsedFQN *fs.ParsedFQN) {
   309  	if !j.allowDelObj {
   310  		return
   311  	}
   312  	lom := core.AllocLOM(parsedFQN.ObjName)
   313  	if pushed := j._visit(lom); !pushed {
   314  		core.FreeLOM(lom)
   315  	}
   316  }
   318  func (j *lruJ) _visit(lom *core.LOM) (pushed bool) {
   319  	if err := lom.InitBck(&j.bck); err != nil {
   320  		return
   321  	}
   322  	if err := lom.Load(false /*cache it*/, false /*locked*/); err != nil {
   323  		return
   324  	}
   325  	if lom.AtimeUnix()+int64(j.config.LRU.DontEvictTime) > {
   326  		return
   327  	}
   328  	if lom.HasCopies() && lom.IsCopy() {
   329  		return
   330  	}
   331  	// do nothing if the heap's curSize >= totalSize and
   332  	// the file is more recent then the the heap's newest.
   333  	if j.curSize >= j.totalSize && lom.AtimeUnix() > j.newest {
   334  		return
   335  	}
   336  	heap.Push(j.heap, lom)
   337  	j.curSize += lom.SizeBytes()
   338  	if lom.AtimeUnix() > j.newest {
   339  		j.newest = lom.AtimeUnix()
   340  	}
   341  	return true
   342  }
   344  func (j *lruJ) walk(fqn string, de fs.DirEntry) error {
   345  	var parsed fs.ParsedFQN
   346  	if de.IsDir() {
   347  		return nil
   348  	}
   349  	if err := j.yieldTerm(); err != nil {
   350  		return err
   351  	}
   352  	if _, err := core.ResolveFQN(fqn, &parsed); err != nil {
   353  		return nil
   354  	}
   355  	if parsed.ContentType == fs.ObjectType {
   356  		j.visitLOM(&parsed)
   357  	}
   359  	return nil
   360  }
   362  func (j *lruJ) evict() (size int64, err error) {
   363  	var (
   364  		fevicted, bevicted int64
   365  		capCheck           int64
   366  		h                  = j.heap
   367  		xlru               = j.ini.Xaction
   368  	)
   370  	// evict(sic!) and house-keep
   371  	for h.Len() > 0 && j.totalSize > 0 {
   372  		lom := heap.Pop(h).(*core.LOM)
   373  		if !j.evictObj(lom) {
   374  			core.FreeLOM(lom)
   375  			continue
   376  		}
   377  		objSize := lom.SizeBytes(true /*not loaded*/)
   378  		core.FreeLOM(lom)
   379  		bevicted += objSize
   380  		size += objSize
   381  		fevicted++
   382  		if capCheck, err = j.postRemove(capCheck, objSize); err != nil {
   383  			return
   384  		}
   385  	}
   386  	j.ini.StatsT.Add(stats.LruEvictSize, bevicted)
   387  	j.ini.StatsT.Add(stats.LruEvictCount, fevicted)
   388  	xlru.ObjsAdd(int(fevicted), bevicted)
   389  	return
   390  }
   392  func (j *lruJ) postRemove(prev, size int64) (capCheck int64, err error) {
   393  	j.totalSize -= size
   394  	capCheck = prev + size
   395  	if err = j.yieldTerm(); err != nil {
   396  		return
   397  	}
   398  	if capCheck < capCheckThresh {
   399  		return
   400  	}
   401  	// init, recompute, and throttle - once per capCheckThresh
   402  	capCheck = 0
   403  	j.throttle = false
   404  	j.allowDelObj, _ = j.allow()
   405  	j.config = cmn.GCO.Get()
   406 = time.Now().UnixNano()
   407  	usedPct, ok := j.ini.GetFSUsedPercentage(j.mi.Path)
   408  	if ok && usedPct < j.config.Space.HighWM {
   409  		err = j._throttle(usedPct)
   410  	}
   411  	return
   412  }
   414  func (j *lruJ) _throttle(usedPct int64) (err error) {
   415  	if j.mi.IsIdle(j.config) {
   416  		return
   417  	}
   418  	// throttle self
   419  	ratioCapacity := cos.Ratio(j.config.Space.HighWM, j.config.Space.LowWM, usedPct)
   420  	curr := fs.GetMpathUtil(j.mi.Path)
   421  	ratioUtilization := cos.Ratio(j.config.Disk.DiskUtilHighWM, j.config.Disk.DiskUtilLowWM, curr)
   422  	if ratioUtilization > ratioCapacity {
   423  		if usedPct < (j.config.Space.LowWM+j.config.Space.HighWM)/2 {
   424  			j.throttle = true
   425  		}
   426  		time.Sleep(mpather.ThrottleMaxDur)
   427  		err = j.yieldTerm()
   428  	}
   429  	return
   430  }
   432  // remove local copies that "belong" to different LRU joggers (space accounting may be temporarily not precise)
   433  func (j *lruJ) evictObj(lom *core.LOM) bool {
   434  	lom.Lock(true)
   435  	err := lom.Remove()
   436  	lom.Unlock(true)
   437  	if err != nil {
   438  		nlog.Errorf("%s: failed to evict %s: %v", j, lom, err)
   439  		return false
   440  	}
   441  	if cmn.Rom.FastV(5, cos.SmoduleSpace) {
   442  		nlog.Infof("%s: evicted %s, size=%d", j, lom, lom.SizeBytes(true /*not loaded*/))
   443  	}
   444  	return true
   445  }
   447  func (j *lruJ) evictSize() (err error) {
   448  	lwm, hwm := j.config.Space.LowWM, j.config.Space.HighWM
   449  	blocks, bavail, bsize, err := j.ini.GetFSStats(j.mi.Path)
   450  	if err != nil {
   451  		return err
   452  	}
   453  	used := blocks - bavail
   454  	usedPct := used * 100 / blocks
   455  	if usedPct < uint64(hwm) {
   456  		return
   457  	}
   458  	lwmBlocks := blocks * uint64(lwm) / 100
   459  	j.totalSize = int64(used-lwmBlocks) * bsize
   460  	return
   461  }
   463  func (j *lruJ) yieldTerm() error {
   464  	xlru := j.ini.Xaction
   465  	select {
   466  	case errCause := <-xlru.ChanAbort():
   467  		return cmn.NewErrAborted(xlru.Name(), "", errCause)
   468  	case <-j.stopCh:
   469  		return cmn.NewErrAborted(xlru.Name(), "", nil)
   470  	default:
   471  		if j.throttle {
   472  			time.Sleep(mpather.ThrottleMinDur)
   473  		}
   474  		break
   475  	}
   476  	if xlru.Finished() {
   477  		return cmn.NewErrAborted(xlru.Name(), "", nil)
   478  	}
   479  	return nil
   480  }
   482  // sort buckets by size
   483  func (j *lruJ) sortBsize(bcks []cmn.Bck) {
   484  	sized := make([]struct {
   485  		b cmn.Bck
   486  		v uint64
   487  	}, len(bcks))
   488  	for i := range bcks {
   489  		path := j.mi.MakePathCT(&bcks[i], fs.ObjectType)
   490  		sized[i].b = bcks[i]
   491  		sized[i].v, _ = ios.DirSizeOnDisk(path, false /*withNonDirPrefix*/)
   492  	}
   493  	sort.Slice(sized, func(i, j int) bool {
   494  		return sized[i].v > sized[j].v
   495  	})
   496  	for i := range bcks {
   497  		bcks[i] = sized[i].b
   498  	}
   499  }
   501  func (j *lruJ) allow() (ok bool, err error) {
   502  	var (
   503  		bowner = core.T.Bowner()
   504  		b      = meta.CloneBck(&j.bck)
   505  	)
   506  	if err = b.Init(bowner); err != nil {
   507  		return
   508  	}
   509  	ok = b.Props.LRU.Enabled && b.Allow(apc.AceObjDELETE) == nil
   510  	return
   511  }
   513  //////////////
   514  // min-heap //
   515  //////////////
   517  func (h minHeap) Len() int           { return len(h) }
   518  func (h minHeap) Less(i, j int) bool { return h[i].Atime().Before(h[j].Atime()) }
   519  func (h minHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
   520  func (h *minHeap) Push(x any)        { *h = append(*h, x.(*core.LOM)) }
   521  func (h *minHeap) Pop() any {
   522  	old := *h
   523  	n := len(old)
   524  	fi := old[n-1]
   525  	*h = old[0 : n-1]
   526  	return fi
   527  }