// Package ais provides core functionality for the AIStore object storage.
/*
 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
 */
package ais

import (
	"fmt"
	"sync"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/ext/dsort"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/ios"
	"github.com/NVIDIA/aistore/res"
	"github.com/NVIDIA/aistore/stats"
	"github.com/NVIDIA/aistore/volume"
	"github.com/NVIDIA/aistore/xact/xreg"
	"github.com/NVIDIA/aistore/xact/xs"
)

// fsprungroup handles mountpath add/enable/disable/detach events on behalf of
// its owning target: it updates the persisted fspaths config and notifies the
// runners that must react to the change (resilver, make-n-copies, stats, dsort).
type fsprungroup struct {
	t      *target // owning target
	newVol bool    // set at init time; presumably "volume was newly created" - not read in this file
}

// init wires the group to its target; called once at startup.
func (g *fsprungroup) init(t *target, newVol bool) {
	g.t = t
	g.newVol = newVol
}

//
// add | re-enable
//

// enableMpath enables mountpath and notifies necessary runners about the
// change if mountpath actually was enabled.
// A (nil, nil) result from fs.EnableMpath means nothing was enabled - no-op.
func (g *fsprungroup) enableMpath(mpath string) (enabledMi *fs.Mountpath, err error) {
	enabledMi, err = fs.EnableMpath(mpath, g.t.SID(), g.redistributeMD)
	if err != nil || enabledMi == nil {
		return
	}
	g._postAdd(apc.ActMountpathEnable, enabledMi)
	return
}

// attachMpath adds mountpath and notifies necessary runners about the change
// if the mountpath was actually added.
54 func (g *fsprungroup) attachMpath(mpath string, label ios.Label) (addedMi *fs.Mountpath, err error) { 55 addedMi, err = fs.AddMpath(g.t.SID(), mpath, label, g.redistributeMD) 56 if err != nil || addedMi == nil { 57 return 58 } 59 60 g._postAdd(apc.ActMountpathAttach, addedMi) 61 return 62 } 63 64 func (g *fsprungroup) _postAdd(action string, mi *fs.Mountpath) { 65 // NOTE: 66 // - currently, dsort doesn't handle (add/enable/disable/detach mountpath) at runtime 67 // - consider integrating via `xreg.LimitedCoexistence` 68 // - review all xact.IsMountpath(kind) == true 69 dsort.Managers.AbortAll(fmt.Errorf("%q %s", action, mi)) 70 71 fspathsConfigAddDel(mi.Path, true /*add*/) 72 go func() { 73 if cmn.GCO.Get().Resilver.Enabled { 74 g.t.runResilver(res.Args{}, nil /*wg*/) 75 } 76 xreg.RenewMakeNCopies(cos.GenUUID(), action) 77 }() 78 79 g.checkEnable(action, mi) 80 81 tstats := g.t.statsT.(*stats.Trunner) 82 for _, disk := range mi.Disks { 83 tstats.RegDiskMetrics(g.t.si, disk) 84 } 85 } 86 87 // 88 // remove | disable 89 // 90 91 // disableMpath disables mountpath and notifies necessary runners about the 92 // change if mountpath actually was disabled. 93 func (g *fsprungroup) disableMpath(mpath string, dontResilver bool) (*fs.Mountpath, error) { 94 return g.doDD(apc.ActMountpathDisable, fs.FlagBeingDisabled, mpath, dontResilver) 95 } 96 97 // detachMpath removes mountpath and notifies necessary runners about the 98 // change if the mountpath was actually removed. 
func (g *fsprungroup) detachMpath(mpath string, dontResilver bool) (*fs.Mountpath, error) {
	return g.doDD(apc.ActMountpathDetach, fs.FlagBeingDetached, mpath, dontResilver)
}

// doDD is the common "disable or detach" (DD) path: it marks the mountpath via
// fs.BeginDD and then either commits immediately (via postDD) or starts a
// resilver that calls postDD as its completion callback.
func (g *fsprungroup) doDD(action string, flags uint64, mpath string, dontResilver bool) (*fs.Mountpath, error) {
	rmi, numAvail, noResil, err := fs.BeginDD(action, flags, mpath)
	if err != nil || rmi == nil {
		// (nil, nil): nothing to disable/detach
		return nil, err
	}

	// NOTE: above - dsort can't handle mountpath events at runtime (see _postAdd)
	dsort.Managers.AbortAll(fmt.Errorf("%q %s", action, rmi))

	if numAvail == 0 {
		// losing the last available mountpath: commit the DD and disable this target
		s := fmt.Sprintf("%s: lost (via %q) the last available mountpath %q", g.t.si, action, rmi)
		g.postDD(rmi, action, nil /*xaction*/, nil /*error*/) // go ahead to disable/detach
		g.t.disable(s)
		return rmi, nil
	}

	core.UncacheMountpath(rmi)

	if noResil || dontResilver || !cmn.GCO.Get().Resilver.Enabled {
		// NOTE(review): logs !dontResilver while the condition uses dontResilver -
		// the three %t values have mixed polarity vs the condition; confirm intended
		nlog.Infof("%s: %q %s: no resilvering (%t, %t, %t)", g.t, action, rmi,
			noResil, !dontResilver, cmn.GCO.Get().Resilver.Enabled)
		g.postDD(rmi, action, nil /*xaction*/, nil /*error*/) // ditto (compare with the one below)
		return rmi, nil
	}

	prevActive := g.t.res.IsActive(1 /*interval-of-inactivity multiplier*/)
	if prevActive {
		nlog.Infof("%s: %q %s: starting to resilver when previous (resilvering) is active", g.t, action, rmi)
	} else {
		nlog.Infof("%s: %q %s: starting to resilver", g.t, action, rmi)
	}
	args := res.Args{
		Rmi:             rmi,
		Action:          action,
		PostDD:          g.postDD,    // callback when done
		SingleRmiJogger: !prevActive, // NOTE: optimization for the special/common case
	}
	// wait only for the resilver to start (runResilver signals the wg), not to finish
	wg := &sync.WaitGroup{}
	wg.Add(1)
	go g.t.runResilver(args, wg)
	wg.Wait()

	return rmi, nil
}

// postDD commits a disable/detach: invoked either directly by doDD (no-resilver
// cases) or as the resilver's completion callback. On a user abort it clears the
// in-progress DD state; on any other abort it keeps the state for a later retry.
func (g *fsprungroup) postDD(rmi *fs.Mountpath, action string, xres *xs.Resilver, err error) {
	// 1. handle error
	if err == nil && xres != nil {
		err = xres.AbortErr()
	}
	if err != nil {
		if errCause := cmn.AsErrAborted(err); errCause != nil {
			err = errCause
		}
		// NOTE(review): sentinel compared with `==` (not errors.Is) - relies on
		// AsErrAborted above returning the bare sentinel; confirm
		if err == cmn.ErrXactUserAbort {
			nlog.Errorf("[post-dd interrupted - clearing the state] %s: %q %s %s: %v",
				g.t.si, action, rmi, xres, err)
			rmi.ClearDD()
		} else {
			nlog.Errorf("[post-dd interrupted - keeping the state] %s: %q %s %s: %v",
				g.t.si, action, rmi, xres, err)
		}
		return
	}

	// 2. this action
	if action == apc.ActMountpathDetach {
		_, err = fs.Remove(rmi.Path, g.redistributeMD)
	} else {
		debug.Assert(action == apc.ActMountpathDisable)
		_, err = fs.Disable(rmi.Path, g.redistributeMD)
	}
	if err != nil {
		nlog.Errorln(err)
		return
	}
	fspathsConfigAddDel(rmi.Path, false /*add*/)
	nlog.Infof("%s: %s %q %s done", g.t, rmi, action, xres)

	// 3. the case of multiple overlapping detach _or_ disable operations
	// (ie., commit previously aborted xs.Resilver, if any)
	availablePaths := fs.GetAvail()
	for _, mi := range availablePaths {
		if !mi.IsAnySet(fs.FlagWaitingDD) {
			continue
		}
		// TODO: assumption that `action` is the same for all
		if action == apc.ActMountpathDetach {
			_, err = fs.Remove(mi.Path, g.redistributeMD)
		} else {
			debug.Assert(action == apc.ActMountpathDisable)
			_, err = fs.Disable(mi.Path, g.redistributeMD)
		}
		if err != nil {
			nlog.Errorln(err)
			return
		}
		fspathsConfigAddDel(mi.Path, false /*add*/)
		nlog.Infof("%s: %s %s %s was previously aborted and now done", g.t, action, mi, xres)
	}
}

// store updated fspaths locally as part of the 'OverrideConfigFname'
// and commit new version of the config
func fspathsConfigAddDel(mpath string, add bool) {
	if cmn.Rom.TestingEnv() { // since testing fspaths are counted, not enumerated
		return
	}
	// begin config transaction: any error path below must DiscardUpdate
	config := cmn.GCO.BeginUpdate()
	localConfig := &config.LocalConfig
	if add {
		localConfig.AddPath(mpath)
	} else {
		localConfig.DelPath(mpath)
	}
	if err := localConfig.FSP.Validate(config); err != nil {
		debug.AssertNoErr(err)
		cmn.GCO.DiscardUpdate()
		nlog.Errorln(err)
		return
	}
	// do
	fspathsSave(config)
}

// fspathsSave persists the override config and commits the transaction started
// by fspathsConfigAddDel (or discards it on save failure).
func fspathsSave(config *cmn.Config) {
	toUpdate := &cmn.ConfigToSet{FSP: &config.LocalConfig.FSP}
	overrideConfig := cmn.GCO.SetLocalFSPaths(toUpdate)
	if err := cmn.SaveOverrideConfig(config.ConfigDir, overrideConfig); err != nil {
		debug.AssertNoErr(err)
		cmn.GCO.DiscardUpdate()
		nlog.Errorln(err)
		return
	}
	cmn.GCO.CommitUpdate(config)
}

// redistributeMD re-persists BMD and EtlMD when not enough copies remain, and
// rebuilds the volume from the updated mountpath set.
// NOTE: executes under mfs lock; all errors here are FATAL
func (g *fsprungroup) redistributeMD() {
	if !hasEnoughBMDCopies() {
		bo := g.t.owner.bmd
		if err := bo.persist(bo.get(), nil); err != nil {
			cos.ExitLog(err)
		}
	}

	if !hasEnoughEtlMDCopies() {
		eo := g.t.owner.etl
		if err := eo.persist(eo.get(), nil); err != nil {
			cos.ExitLog(err)
		}
	}

	if _, err := volume.NewFromMPI(g.t.SID()); err != nil {
		cos.ExitLog(err)
	}
}

// checkEnable re-joins (enables) the target when the just-added mountpath is
// the first and only available one; otherwise it just logs the event.
func (g *fsprungroup) checkEnable(action string, mi *fs.Mountpath) {
	availablePaths := fs.GetAvail()
	if len(availablePaths) > 1 {
		nlog.Infoln(action, mi.String())
	} else {
		nlog.Infoln(action, "the first mountpath", mi.String())
		if err := g.t.enable(); err != nil {
			nlog.Errorf("Failed to re-join %s (self): %v", g.t, err) // (FATAL, unlikely)
		}
	}
}