github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/fspathrgrp.go (about)

     1  // Package ais provides core functionality for the AIStore object storage.
     2  /*
     3   * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package ais
     6  
     7  import (
     8  	"fmt"
     9  	"sync"
    10  
    11  	"github.com/NVIDIA/aistore/api/apc"
    12  	"github.com/NVIDIA/aistore/cmn"
    13  	"github.com/NVIDIA/aistore/cmn/cos"
    14  	"github.com/NVIDIA/aistore/cmn/debug"
    15  	"github.com/NVIDIA/aistore/cmn/nlog"
    16  	"github.com/NVIDIA/aistore/core"
    17  	"github.com/NVIDIA/aistore/ext/dsort"
    18  	"github.com/NVIDIA/aistore/fs"
    19  	"github.com/NVIDIA/aistore/ios"
    20  	"github.com/NVIDIA/aistore/res"
    21  	"github.com/NVIDIA/aistore/stats"
    22  	"github.com/NVIDIA/aistore/volume"
    23  	"github.com/NVIDIA/aistore/xact/xreg"
    24  	"github.com/NVIDIA/aistore/xact/xs"
    25  )
    26  
// fsprungroup coordinates target-side mountpath lifecycle changes
// (attach/enable and detach/disable) and the follow-up work they trigger:
// config persistence, resilvering, and xaction renewal.
type fsprungroup struct {
	t      *target // owning target; used for SID, stats, resilver, and (re)join
	newVol bool    // set once at init; presumably "volume newly created at startup" — TODO confirm at call site
}
    31  
    32  func (g *fsprungroup) init(t *target, newVol bool) {
    33  	g.t = t
    34  	g.newVol = newVol
    35  }
    36  
    37  //
    38  // add | re-enable
    39  //
    40  
    41  // enableMpath enables mountpath and notifies necessary runners about the
    42  // change if mountpath actually was enabled.
    43  func (g *fsprungroup) enableMpath(mpath string) (enabledMi *fs.Mountpath, err error) {
    44  	enabledMi, err = fs.EnableMpath(mpath, g.t.SID(), g.redistributeMD)
    45  	if err != nil || enabledMi == nil {
    46  		return
    47  	}
    48  	g._postAdd(apc.ActMountpathEnable, enabledMi)
    49  	return
    50  }
    51  
    52  // attachMpath adds mountpath and notifies necessary runners about the change
    53  // if the mountpath was actually added.
    54  func (g *fsprungroup) attachMpath(mpath string, label ios.Label) (addedMi *fs.Mountpath, err error) {
    55  	addedMi, err = fs.AddMpath(g.t.SID(), mpath, label, g.redistributeMD)
    56  	if err != nil || addedMi == nil {
    57  		return
    58  	}
    59  
    60  	g._postAdd(apc.ActMountpathAttach, addedMi)
    61  	return
    62  }
    63  
    64  func (g *fsprungroup) _postAdd(action string, mi *fs.Mountpath) {
    65  	// NOTE:
    66  	// - currently, dsort doesn't handle (add/enable/disable/detach mountpath) at runtime
    67  	// - consider integrating via `xreg.LimitedCoexistence`
    68  	// - review all xact.IsMountpath(kind) == true
    69  	dsort.Managers.AbortAll(fmt.Errorf("%q %s", action, mi))
    70  
    71  	fspathsConfigAddDel(mi.Path, true /*add*/)
    72  	go func() {
    73  		if cmn.GCO.Get().Resilver.Enabled {
    74  			g.t.runResilver(res.Args{}, nil /*wg*/)
    75  		}
    76  		xreg.RenewMakeNCopies(cos.GenUUID(), action)
    77  	}()
    78  
    79  	g.checkEnable(action, mi)
    80  
    81  	tstats := g.t.statsT.(*stats.Trunner)
    82  	for _, disk := range mi.Disks {
    83  		tstats.RegDiskMetrics(g.t.si, disk)
    84  	}
    85  }
    86  
    87  //
    88  // remove | disable
    89  //
    90  
    91  // disableMpath disables mountpath and notifies necessary runners about the
    92  // change if mountpath actually was disabled.
    93  func (g *fsprungroup) disableMpath(mpath string, dontResilver bool) (*fs.Mountpath, error) {
    94  	return g.doDD(apc.ActMountpathDisable, fs.FlagBeingDisabled, mpath, dontResilver)
    95  }
    96  
    97  // detachMpath removes mountpath and notifies necessary runners about the
    98  // change if the mountpath was actually removed.
    99  func (g *fsprungroup) detachMpath(mpath string, dontResilver bool) (*fs.Mountpath, error) {
   100  	return g.doDD(apc.ActMountpathDetach, fs.FlagBeingDetached, mpath, dontResilver)
   101  }
   102  
   103  func (g *fsprungroup) doDD(action string, flags uint64, mpath string, dontResilver bool) (*fs.Mountpath, error) {
   104  	rmi, numAvail, noResil, err := fs.BeginDD(action, flags, mpath)
   105  	if err != nil || rmi == nil {
   106  		return nil, err
   107  	}
   108  
   109  	// NOTE: above
   110  	dsort.Managers.AbortAll(fmt.Errorf("%q %s", action, rmi))
   111  
   112  	if numAvail == 0 {
   113  		s := fmt.Sprintf("%s: lost (via %q) the last available mountpath %q", g.t.si, action, rmi)
   114  		g.postDD(rmi, action, nil /*xaction*/, nil /*error*/) // go ahead to disable/detach
   115  		g.t.disable(s)
   116  		return rmi, nil
   117  	}
   118  
   119  	core.UncacheMountpath(rmi)
   120  
   121  	if noResil || dontResilver || !cmn.GCO.Get().Resilver.Enabled {
   122  		nlog.Infof("%s: %q %s: no resilvering (%t, %t, %t)", g.t, action, rmi,
   123  			noResil, !dontResilver, cmn.GCO.Get().Resilver.Enabled)
   124  		g.postDD(rmi, action, nil /*xaction*/, nil /*error*/) // ditto (compare with the one below)
   125  		return rmi, nil
   126  	}
   127  
   128  	prevActive := g.t.res.IsActive(1 /*interval-of-inactivity multiplier*/)
   129  	if prevActive {
   130  		nlog.Infof("%s: %q %s: starting to resilver when previous (resilvering) is active", g.t, action, rmi)
   131  	} else {
   132  		nlog.Infof("%s: %q %s: starting to resilver", g.t, action, rmi)
   133  	}
   134  	args := res.Args{
   135  		Rmi:             rmi,
   136  		Action:          action,
   137  		PostDD:          g.postDD,    // callback when done
   138  		SingleRmiJogger: !prevActive, // NOTE: optimization for the special/common case
   139  	}
   140  	wg := &sync.WaitGroup{}
   141  	wg.Add(1)
   142  	go g.t.runResilver(args, wg)
   143  	wg.Wait()
   144  
   145  	return rmi, nil
   146  }
   147  
// postDD is the common epilogue of a disable/detach operation - called either
// directly by doDD (when resilvering is skipped) or as the res.Args.PostDD
// callback upon resilver completion. Steps:
//  1. on abort/error: keep or clear the mountpath's DD state and return;
//  2. otherwise, commit this mountpath's removal (detach) or disabling,
//     and persist the updated fspaths config;
//  3. finally, commit any other mountpaths still flagged waiting-DD from
//     previously aborted, overlapping operations.
func (g *fsprungroup) postDD(rmi *fs.Mountpath, action string, xres *xs.Resilver, err error) {
	// 1. handle error
	if err == nil && xres != nil {
		err = xres.AbortErr()
	}
	if err != nil {
		// unwrap the abort cause, if any
		if errCause := cmn.AsErrAborted(err); errCause != nil {
			err = errCause
		}
		if err == cmn.ErrXactUserAbort {
			// user-initiated abort: drop the being-disabled/detached state
			nlog.Errorf("[post-dd interrupted - clearing the state] %s: %q %s %s: %v",
				g.t.si, action, rmi, xres, err)
			rmi.ClearDD()
		} else {
			// any other abort: keep the DD state so a later operation can
			// commit it (see step 3 of a subsequent postDD call)
			nlog.Errorf("[post-dd interrupted - keeping the state] %s: %q %s %s: %v",
				g.t.si, action, rmi, xres, err)
		}
		return
	}

	// 2. this action: commit removal (detach) or disabling; both variants
	//    redistribute local metadata via the g.redistributeMD callback
	if action == apc.ActMountpathDetach {
		_, err = fs.Remove(rmi.Path, g.redistributeMD)
	} else {
		debug.Assert(action == apc.ActMountpathDisable)
		_, err = fs.Disable(rmi.Path, g.redistributeMD)
	}
	if err != nil {
		nlog.Errorln(err)
		return
	}
	fspathsConfigAddDel(rmi.Path, false /*add*/)
	nlog.Infof("%s: %s %q %s done", g.t, rmi, action, xres)

	// 3. the case of multiple overlapping detach _or_ disable operations
	//    (ie., commit previously aborted xs.Resilver, if any)
	availablePaths := fs.GetAvail()
	for _, mi := range availablePaths {
		if !mi.IsAnySet(fs.FlagWaitingDD) {
			continue
		}
		// TODO: assumption that `action` is the same for all
		if action == apc.ActMountpathDetach {
			_, err = fs.Remove(mi.Path, g.redistributeMD)
		} else {
			debug.Assert(action == apc.ActMountpathDisable)
			_, err = fs.Disable(mi.Path, g.redistributeMD)
		}
		if err != nil {
			nlog.Errorln(err)
			return
		}
		fspathsConfigAddDel(mi.Path, false /*add*/)
		nlog.Infof("%s: %s %s %s was previously aborted and now done", g.t, action, mi, xres)
	}
}
   204  
   205  // store updated fspaths locally as part of the 'OverrideConfigFname'
   206  // and commit new version of the config
   207  func fspathsConfigAddDel(mpath string, add bool) {
   208  	if cmn.Rom.TestingEnv() { // since testing fspaths are counted, not enumerated
   209  		return
   210  	}
   211  	config := cmn.GCO.BeginUpdate()
   212  	localConfig := &config.LocalConfig
   213  	if add {
   214  		localConfig.AddPath(mpath)
   215  	} else {
   216  		localConfig.DelPath(mpath)
   217  	}
   218  	if err := localConfig.FSP.Validate(config); err != nil {
   219  		debug.AssertNoErr(err)
   220  		cmn.GCO.DiscardUpdate()
   221  		nlog.Errorln(err)
   222  		return
   223  	}
   224  	// do
   225  	fspathsSave(config)
   226  }
   227  
   228  func fspathsSave(config *cmn.Config) {
   229  	toUpdate := &cmn.ConfigToSet{FSP: &config.LocalConfig.FSP}
   230  	overrideConfig := cmn.GCO.SetLocalFSPaths(toUpdate)
   231  	if err := cmn.SaveOverrideConfig(config.ConfigDir, overrideConfig); err != nil {
   232  		debug.AssertNoErr(err)
   233  		cmn.GCO.DiscardUpdate()
   234  		nlog.Errorln(err)
   235  		return
   236  	}
   237  	cmn.GCO.CommitUpdate(config)
   238  }
   239  
   240  // NOTE: executes under mfs lock; all errors here are FATAL
   241  func (g *fsprungroup) redistributeMD() {
   242  	if !hasEnoughBMDCopies() {
   243  		bo := g.t.owner.bmd
   244  		if err := bo.persist(bo.get(), nil); err != nil {
   245  			cos.ExitLog(err)
   246  		}
   247  	}
   248  
   249  	if !hasEnoughEtlMDCopies() {
   250  		eo := g.t.owner.etl
   251  		if err := eo.persist(eo.get(), nil); err != nil {
   252  			cos.ExitLog(err)
   253  		}
   254  	}
   255  
   256  	if _, err := volume.NewFromMPI(g.t.SID()); err != nil {
   257  		cos.ExitLog(err)
   258  	}
   259  }
   260  
   261  func (g *fsprungroup) checkEnable(action string, mi *fs.Mountpath) {
   262  	availablePaths := fs.GetAvail()
   263  	if len(availablePaths) > 1 {
   264  		nlog.Infoln(action, mi.String())
   265  	} else {
   266  		nlog.Infoln(action, "the first mountpath", mi.String())
   267  		if err := g.t.enable(); err != nil {
   268  			nlog.Errorf("Failed to re-join %s (self): %v", g.t, err) // (FATAL, unlikely)
   269  		}
   270  	}
   271  }