github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/fs/mpather/jogger.go (about)

     1  // Package mpather provides per-mountpath concepts.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package mpather
     6  
     7  import (
     8  	"context"
     9  	"fmt"
    10  	"path/filepath"
    11  	"runtime"
    12  	"strings"
    13  	"time"
    14  
    15  	"github.com/NVIDIA/aistore/cmn"
    16  	"github.com/NVIDIA/aistore/cmn/atomic"
    17  	"github.com/NVIDIA/aistore/cmn/cos"
    18  	"github.com/NVIDIA/aistore/cmn/debug"
    19  	"github.com/NVIDIA/aistore/cmn/nlog"
    20  	"github.com/NVIDIA/aistore/core"
    21  	"github.com/NVIDIA/aistore/core/meta"
    22  	"github.com/NVIDIA/aistore/fs"
    23  	"github.com/NVIDIA/aistore/memsys"
    24  	"golang.org/x/sync/errgroup"
    25  )
    26  
    27  // walk all or selected buckets, one at a time
    28  
const (
	// throttleNumObjects is the unit of self-throttling: with `JgroupOpts.Throttle` set,
	// a jogger calls throttle() once per this many visited (non-directory) entries
	// and merely yields the processor in between.
	throttleNumObjects = 64
)
    32  
// LoadType selects whether - and how - a jogger loads each object (LOM)
// prior to calling the `VisitObj` callback (see `JgroupOpts.DoLoad`).
type LoadType int

const (
	noLoad     LoadType = iota // zero value: visit the object without loading it
	LoadUnsafe                 // load via lom.LoadUnsafe()
	Load                       // load via lom.Load(false, false)
)
    40  
// Throttling (sleep) durations.
// In this file only ThrottleMinDur is used (see jogger.throttle);
// the other two are exported, presumably for use by other packages.
const (
	ThrottleMinDur = time.Millisecond
	ThrottleAvgDur = time.Millisecond * 10
	ThrottleMaxDur = time.Millisecond * 100
)
    46  
type (
	// JgroupOpts configures a jogger group (see NewJoggerGroup) and every jogger in it.
	JgroupOpts struct {
		onFinish              func()                                // set by NewJoggerGroup; called by each jogger at the very end of its run
		VisitObj              func(lom *core.LOM, buf []byte) error // callback for entries of the object content type
		VisitCT               func(ct *core.CT, buf []byte) error   // callback for entries of any other content type
		Slab                  *memsys.Slab                          // optional: when set, the buffers passed to the callbacks are allocated from (and freed to) this slab
		Bck                   cmn.Bck                               // a single bucket to walk, or a query to match buckets; must be empty when `Buckets` is used
		Buckets               cmn.Bcks                              // selected buckets to walk, one at a time
		Prefix                string                                // when non-empty: visit only those objects whose path starts with join(bucket dir, Prefix)
		CTs                   []string                              // passed through to fs.WalkOpts.CTs
		DoLoad                LoadType                              // if specified, lom.Load(lock type)
		Parallel              int                                   // num parallel calls (> 1 makes a jogger run its callbacks in goroutines)
		IncludeCopy           bool                                  // visit copies (aka replicas); requires DoLoad
		PerBucket             bool                                  // num joggers = (num mountpaths) x (num buckets)
		SkipGloballyMisplaced bool                                  // skip globally misplaced
		Throttle              bool                                  // true: pace itself depending on disk utilization
	}

	// Jgroup runs one jogger per mountpath (or per mountpath and bucket, see `PerBucket`).
	// Each jogger walks the bucket(s) and calls the provided callback on every encountered
	// object. When a jogger encounters an error it stops and informs the other joggers
	// about the error (so that they stop too).
	Jgroup struct {
		wg          *errgroup.Group    // runs jogger.run for every jogger in the group
		joggers     map[string]*jogger // keyed by mountpath path or, when `PerBucket`, by "path|bucket-uname"
		finishedCh  cos.StopCh         // when all joggers are done
		finishedCnt atomic.Uint32      // number of joggers that have finished so far
	}

	// jogger runs on a given mountpath and executes fs.Walk, which in turn calls
	// the jogger's callback (jog) for every visited entry.
	jogger struct {
		ctx       context.Context  // group context; for parallel joggers, the context derived in newJogger
		syncGroup *joggerSyncGroup // non-nil only when opts.Parallel > 1
		opts      *JgroupOpts
		mi        *fs.Mountpath
		bdir      string // mi.MakePath(bck)
		objPrefix string // fully-qualified prefix, as in: join(bdir, opts.Prefix)
		config    *cmn.Config
		stopCh    cos.StopCh // closed by abort()
		bufs      [][]byte   // allocated from opts.Slab (if any): one buffer or, when parallel, opts.Parallel buffers
		num       int64      // number of visited entries; counted only when opts.Throttle is set
	}

	// joggerSyncGroup is the state a jogger needs to run its callbacks in goroutines.
	joggerSyncGroup struct {
		sema   chan int // Holds the positions (indices) of the currently free buffers; a goroutine takes one and returns it when done.
		group  *errgroup.Group
		cancel context.CancelFunc
	}
)
    96  
    97  func NewJoggerGroup(opts *JgroupOpts, config *cmn.Config, mpath string) *Jgroup {
    98  	var (
    99  		joggers map[string]*jogger
   100  		avail   = fs.GetAvail()
   101  		wg, ctx = errgroup.WithContext(context.Background())
   102  	)
   103  	debug.Assert(!opts.IncludeCopy || (opts.IncludeCopy && opts.DoLoad > noLoad))
   104  
   105  	jg := &Jgroup{wg: wg}
   106  	opts.onFinish = jg.markFinished
   107  
   108  	switch {
   109  	case mpath != "":
   110  		joggers = make(map[string]*jogger, 1)
   111  		if mi, ok := avail[mpath]; ok {
   112  			joggers[mi.Path] = newJogger(ctx, opts, mi, config)
   113  		}
   114  	case opts.PerBucket:
   115  		debug.Assert(len(opts.Buckets) > 1)
   116  		joggers = make(map[string]*jogger, len(avail)*len(opts.Buckets))
   117  		for _, bck := range opts.Buckets {
   118  			nopts := *opts
   119  			nopts.Buckets = nil
   120  			nopts.Bck = bck
   121  			uname := bck.MakeUname("")
   122  			for _, mi := range avail {
   123  				joggers[mi.Path+"|"+uname] = newJogger(ctx, &nopts, mi, config)
   124  			}
   125  		}
   126  	default:
   127  		joggers = make(map[string]*jogger, len(avail))
   128  		for _, mi := range avail {
   129  			joggers[mi.Path] = newJogger(ctx, opts, mi, config)
   130  		}
   131  	}
   132  
   133  	// this jogger group is a no-op (unlikely)
   134  	if len(joggers) == 0 {
   135  		_, disabled := fs.Get()
   136  		nlog.Errorf("%v: avail=%v, disabled=%v, selected=%q", cmn.ErrNoMountpaths, avail, disabled, mpath)
   137  	}
   138  
   139  	jg.joggers = joggers
   140  	jg.finishedCh.Init()
   141  
   142  	return jg
   143  }
   144  
   145  func (jg *Jgroup) Num() int { return len(jg.joggers) }
   146  
   147  func (jg *Jgroup) Run() {
   148  	for _, jogger := range jg.joggers {
   149  		jg.wg.Go(jogger.run)
   150  	}
   151  }
   152  
   153  func (jg *Jgroup) Stop() error {
   154  	for _, jogger := range jg.joggers {
   155  		jogger.abort()
   156  	}
   157  	return jg.wg.Wait()
   158  }
   159  
   160  func (jg *Jgroup) ListenFinished() <-chan struct{} {
   161  	return jg.finishedCh.Listen()
   162  }
   163  
   164  func (jg *Jgroup) markFinished() {
   165  	if n := jg.finishedCnt.Inc(); n == uint32(len(jg.joggers)) {
   166  		jg.finishedCh.Close()
   167  	}
   168  }
   169  
   170  func newJogger(ctx context.Context, opts *JgroupOpts, mi *fs.Mountpath, config *cmn.Config) (j *jogger) {
   171  	var syncGroup *joggerSyncGroup
   172  	if opts.Parallel > 1 {
   173  		var (
   174  			group  *errgroup.Group
   175  			cancel context.CancelFunc
   176  		)
   177  		ctx, cancel = context.WithCancel(ctx)
   178  		group, ctx = errgroup.WithContext(ctx)
   179  		syncGroup = &joggerSyncGroup{
   180  			sema:   make(chan int, opts.Parallel),
   181  			group:  group,
   182  			cancel: cancel,
   183  		}
   184  		for i := range opts.Parallel {
   185  			syncGroup.sema <- i
   186  		}
   187  	}
   188  	j = &jogger{
   189  		ctx:       ctx,
   190  		opts:      opts,
   191  		mi:        mi,
   192  		config:    config,
   193  		syncGroup: syncGroup,
   194  	}
   195  	if opts.Prefix != "" {
   196  		j.bdir = mi.MakePathCT(&j.opts.Bck, fs.ObjectType) // this mountpath's bucket dir that contains objects
   197  		j.objPrefix = filepath.Join(j.bdir, opts.Prefix)
   198  	}
   199  	j.stopCh.Init()
   200  	return
   201  }
   202  
   203  func (j *jogger) run() (err error) {
   204  	if j.opts.Slab != nil {
   205  		if j.opts.Parallel <= 1 {
   206  			j.bufs = [][]byte{j.opts.Slab.Alloc()}
   207  		} else {
   208  			j.bufs = make([][]byte, j.opts.Parallel)
   209  			for i := range j.opts.Parallel {
   210  				j.bufs[i] = j.opts.Slab.Alloc()
   211  			}
   212  		}
   213  	}
   214  
   215  	// 3 running options
   216  	switch {
   217  	case len(j.opts.Buckets) > 0:
   218  		debug.Assert(j.opts.Bck.IsEmpty())
   219  		err = j.runSelected()
   220  	case j.opts.Bck.IsQuery():
   221  		err = j.runQbck(cmn.QueryBcks(j.opts.Bck))
   222  	default:
   223  		_, err = j.runBck(&j.opts.Bck)
   224  	}
   225  
   226  	// cleanup
   227  	if j.opts.Slab != nil {
   228  		for _, buf := range j.bufs {
   229  			j.opts.Slab.Free(buf)
   230  		}
   231  	}
   232  	j.opts.onFinish()
   233  	return
   234  }
   235  
   236  // run selected buckets, one at a time
   237  func (j *jogger) runSelected() error {
   238  	var errs cos.Errs
   239  	for i := range j.opts.Buckets {
   240  		aborted, err := j.runBck(&j.opts.Buckets[i])
   241  		if err != nil {
   242  			errs.Add(err)
   243  		}
   244  		if aborted {
   245  			return &errs
   246  		}
   247  	}
   248  	return nil
   249  }
   250  
   251  // run matching, one at a time
   252  func (j *jogger) runQbck(qbck cmn.QueryBcks) (err error) {
   253  	var (
   254  		bmd      = core.T.Bowner().Get()
   255  		provider *string
   256  		ns       *cmn.Ns
   257  		errs     cos.Errs
   258  	)
   259  	if qbck.Provider != "" {
   260  		provider = &qbck.Provider
   261  	}
   262  	if !qbck.Ns.IsGlobal() {
   263  		ns = &qbck.Ns
   264  	}
   265  	bmd.Range(provider, ns, func(bck *meta.Bck) bool {
   266  		aborted, errV := j.runBck(bck.Bucket())
   267  		if err != nil {
   268  			errs.Add(errV)
   269  			err = &errs
   270  		}
   271  		return aborted
   272  	})
   273  	return
   274  }
   275  
   276  // run single (see also: `PerBucket` above)
   277  func (j *jogger) runBck(bck *cmn.Bck) (aborted bool, err error) {
   278  	opts := &fs.WalkOpts{
   279  		Mi:       j.mi,
   280  		CTs:      j.opts.CTs,
   281  		Callback: j.jog,
   282  		Sorted:   false,
   283  	}
   284  	opts.Bck.Copy(bck)
   285  
   286  	err = fs.Walk(opts)
   287  	if j.syncGroup != nil {
   288  		// If callbacks are executed in goroutines, fs.Walk can stop before the callbacks return.
   289  		// We have to wait for them and check if there was any error.
   290  		if err == nil {
   291  			err = j.syncGroup.waitForAsyncTasks()
   292  		} else {
   293  			j.syncGroup.abortAsyncTasks()
   294  		}
   295  	}
   296  
   297  	if err != nil {
   298  		if cmn.IsErrAborted(err) {
   299  			nlog.Infof("%s stopping traversal: %v", j, err)
   300  			return true, nil
   301  		}
   302  		return false, err
   303  	}
   304  	return false, nil
   305  }
   306  
   307  func (j *jogger) jog(fqn string, de fs.DirEntry) error {
   308  	if j.objPrefix != "" && strings.HasPrefix(fqn, j.bdir) {
   309  		if de.IsDir() {
   310  			if !cmn.DirHasOrIsPrefix(fqn, j.objPrefix) {
   311  				return filepath.SkipDir
   312  			}
   313  		} else if !strings.HasPrefix(fqn, j.objPrefix) {
   314  			return nil
   315  		}
   316  	}
   317  	if de.IsDir() {
   318  		return nil
   319  	}
   320  
   321  	if err := j.checkStopped(); err != nil {
   322  		return err
   323  	}
   324  
   325  	var bufPosition int
   326  	if j.syncGroup == nil {
   327  		if err := j.visitFQN(fqn, j.getBuf(0)); err != nil {
   328  			return err
   329  		}
   330  	} else {
   331  		select {
   332  		case bufPosition = <-j.syncGroup.sema:
   333  			break
   334  		case <-j.ctx.Done():
   335  			return j.ctx.Err()
   336  		}
   337  
   338  		j.syncGroup.group.Go(func() error {
   339  			defer func() {
   340  				// NOTE: There is no need to select j.ctx.Done() as put to this chanel is immediate.
   341  				j.syncGroup.sema <- bufPosition
   342  			}()
   343  			return j.visitFQN(fqn, j.getBuf(bufPosition))
   344  		})
   345  	}
   346  
   347  	if j.opts.Throttle {
   348  		j.num++
   349  		if (j.num % throttleNumObjects) == 0 {
   350  			j.throttle()
   351  		} else {
   352  			runtime.Gosched()
   353  		}
   354  	}
   355  	return nil
   356  }
   357  
   358  func (j *jogger) visitFQN(fqn string, buf []byte) error {
   359  	ct, err := core.NewCTFromFQN(fqn, core.T.Bowner())
   360  	if err != nil {
   361  		return err
   362  	}
   363  
   364  	if j.opts.SkipGloballyMisplaced {
   365  		smap := core.T.Sowner().Get()
   366  		tsi, err := smap.HrwHash2T(ct.Digest())
   367  		if err != nil {
   368  			return err
   369  		}
   370  		if tsi.ID() != core.T.SID() {
   371  			return nil
   372  		}
   373  	}
   374  
   375  	switch ct.ContentType() {
   376  	case fs.ObjectType:
   377  		lom := core.AllocLOM("")
   378  		lom.InitCT(ct)
   379  		err := j.visitObj(lom, buf)
   380  		// NOTE: j.opts.visitObj() callback implementations must either finish
   381  		// synchronously or pass lom.LIF to another goroutine
   382  		core.FreeLOM(lom)
   383  		return err
   384  	default:
   385  		if err := j.visitCT(ct, buf); err != nil {
   386  			return err
   387  		}
   388  	}
   389  	return nil
   390  }
   391  
   392  func (j *jogger) visitObj(lom *core.LOM, buf []byte) (err error) {
   393  	switch j.opts.DoLoad {
   394  	case noLoad:
   395  		goto visit
   396  	case LoadUnsafe:
   397  		err = lom.LoadUnsafe()
   398  	case Load:
   399  		err = lom.Load(false, false)
   400  	default:
   401  		debug.Assert(false, "invalid 'opts.DoLoad'", j.opts.DoLoad)
   402  	}
   403  	if err != nil {
   404  		return
   405  	}
   406  	if !j.opts.IncludeCopy && lom.IsCopy() {
   407  		return nil
   408  	}
   409  visit:
   410  	return j.opts.VisitObj(lom, buf)
   411  }
   412  
   413  func (j *jogger) visitCT(ct *core.CT, buf []byte) error { return j.opts.VisitCT(ct, buf) }
   414  
   415  func (j *jogger) getBuf(position int) []byte {
   416  	if j.bufs == nil {
   417  		return nil
   418  	}
   419  	return j.bufs[position]
   420  }
   421  
   422  func (j *jogger) checkStopped() error {
   423  	select {
   424  	case <-j.ctx.Done(): // Some other worker has exited with error and canceled context.
   425  		return j.ctx.Err()
   426  	case <-j.stopCh.Listen(): // Worker has been aborted.
   427  		return cmn.NewErrAborted(j.String(), "mpath-jog", nil)
   428  	default:
   429  		return nil
   430  	}
   431  }
   432  
   433  func (sg *joggerSyncGroup) waitForAsyncTasks() error {
   434  	return sg.group.Wait()
   435  }
   436  
   437  func (sg *joggerSyncGroup) abortAsyncTasks() error {
   438  	sg.cancel()
   439  	return sg.waitForAsyncTasks()
   440  }
   441  
   442  func (j *jogger) throttle() {
   443  	curUtil := fs.GetMpathUtil(j.mi.Path)
   444  	if curUtil >= j.config.Disk.DiskUtilHighWM {
   445  		time.Sleep(ThrottleMinDur)
   446  	}
   447  }
   448  
   449  func (j *jogger) abort()         { j.stopCh.Close() }
   450  func (j *jogger) String() string { return fmt.Sprintf("jogger [%s/%s]", j.mi, j.opts.Bck) }