github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/reb/ec.go

// Package reb provides global cluster-wide rebalance upon adding/removing storage nodes.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package reb

import (
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sync"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/ec"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/transport"
)

// High-level overview of how EC rebalance works:
// 1. EC traverses only metafile (%mt) directories, one jogger per mountpath.
// 2. A jogger skips a metafile if:
//    - its `FullReplica` is not the local target ID, or
//    - its `FullReplica` is the local target ID and HRW selects the local target
//      (i.e., the CT is already in the right place)
// 3. Otherwise, the jogger computes the correct target via HRW and moves the CT there.
// 4. A target, on receiving a CT:
// 4.1. Preparation:
//      - updates the metadata: fixes the `Daemons` and `FullReplica` fields
// 4.2. If it already has another CT of the same object and generation:
//      - moves the local CT to a working directory
// 4.3. Saves the received CT and its metafile.
// 4.4. If anything was moved to the working directory at step 4.2:
//      - selects a target that has no valid CT of the object
//      - moves the local CT to the selected target
// 4.5. Finalization:
//      - broadcasts the new metadata to all targets listed in the `Daemons` field so
//        they can update their metafiles. Targets do not overwrite their metafiles
//        with the new one; they update only the `Daemons` and `FullReplica` fields.

func (reb *Reb) runECjoggers() {
	var (
		wg             = &sync.WaitGroup{}
		availablePaths = fs.GetAvail()
		cfg            = cmn.GCO.Get()
		b              = reb.xctn().Bck()
	)
	for _, mi := range availablePaths {
		bck := cmn.Bck{Provider: apc.AIS}
		if b != nil {
			bck = cmn.Bck{Name: b.Name, Provider: apc.AIS, Ns: b.Ns}
		}
		wg.Add(1)
		go reb.jogEC(mi, &bck, wg)
	}
	for _, provider := range cfg.Backend.Providers {
		for _, mi := range availablePaths {
			bck := cmn.Bck{Provider: provider.Name}
			if b != nil {
				bck = cmn.Bck{Name: b.Name, Provider: provider.Name, Ns: b.Ns}
			}
			wg.Add(1)
			go reb.jogEC(mi, &bck, wg)
		}
	}
	wg.Wait()
}

// mountpath walker - walks through files in the /meta/ directory
func (reb *Reb) jogEC(mi *fs.Mountpath, bck *cmn.Bck, wg *sync.WaitGroup) {
	defer wg.Done()
	opts := &fs.WalkOpts{
		Mi:       mi,
		CTs:      []string{fs.ECMetaType},
		Callback: reb.walkEC,
		Sorted:   false,
	}
	opts.Bck.Copy(bck)
	if err := fs.Walk(opts); err != nil {
		xreb := reb.xctn()
		if xreb.IsAborted() || xreb.Finished() {
			nlog.Infof("aborting traversal")
		} else {
			nlog.Warningf("failed to traverse, err: %v", err)
		}
	}
}
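// For illustration only: the per-metafile decision from steps 2-3 of the overview,
// condensed from the checks that walkEC (below) performs for every metafile it visits.
// Here `md` is the loaded ec.Metadata, `ct` the corresponding CT, `smap` the cluster map:
//
//	if md.FullReplica != core.T.SID() {
//		return nil // not the "main" target for this object - skip
//	}
//	tsi, err := smap.HrwHash2T(ct.Digest())
//	if err != nil || tsi.ID() == core.T.SID() {
//		return err // the CT is already where HRW wants it - skip
//	}
//	return reb.sendFromDisk(ct, md, tsi) // otherwise, move the CT to its HRW target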
// Sends a local CT along with its EC metadata to the default target.
// The CT is on a local drive and not loaded into SGL. Just read and send.
func (reb *Reb) sendFromDisk(ct *core.CT, meta *ec.Metadata, target *meta.Snode, workFQN ...string) (err error) {
	var (
		lom    *core.LOM
		roc    cos.ReadOpenCloser
		fqn    = ct.FQN()
		action = uint32(rebActRebCT)
	)
	debug.Assert(meta != nil)
	if len(workFQN) != 0 {
		fqn = workFQN[0]
		action = rebActMoveCT
	}
	// TODO: unify acquiring a reader for LOM and CT
	if ct.ContentType() == fs.ObjectType {
		lom = core.AllocLOM(ct.ObjectName())
		if err = lom.InitBck(ct.Bck().Bucket()); err != nil {
			core.FreeLOM(lom)
			return
		}
		lom.Lock(false)
		if err = lom.Load(false /*cache it*/, true /*locked*/); err != nil {
			lom.Unlock(false)
			core.FreeLOM(lom)
			return
		}
	} else {
		lom = nil // sending slice; TODO: rlock
	}

	// open
	if lom != nil {
		defer core.FreeLOM(lom)
		roc, err = lom.NewDeferROC()
	} else {
		roc, err = cos.NewFileHandle(fqn)
	}
	if err != nil {
		return
	}

	// transmit
	ntfn := stageNtfn{daemonID: core.T.SID(), stage: rebStageTraverse, rebID: reb.rebID.Load(), md: meta, action: action}
	o := transport.AllocSend()
	o.Hdr = transport.ObjHdr{ObjName: ct.ObjectName(), ObjAttrs: cmn.ObjAttrs{Size: meta.Size}}
	o.Hdr.Bck.Copy(ct.Bck().Bucket())
	if lom != nil {
		o.Hdr.ObjAttrs.CopyFrom(lom.ObjAttrs(), false /*skip cksum*/)
	}
	if meta.SliceID != 0 {
		o.Hdr.ObjAttrs.Size = ec.SliceSize(meta.Size, meta.Data)
	}
	reb.onAir.Inc()
	o.Hdr.Opaque = ntfn.NewPack(rebMsgEC)
	o.Callback = reb.transportECCB
	if err = reb.dm.Send(o, roc, target); err != nil {
		err = fmt.Errorf("failed to send slices to nodes [%s..]: %v", target.ID(), err)
		return
	}
	xreb := reb.xctn()
	xreb.OutObjsAdd(1, o.Hdr.ObjAttrs.Size)
	return
}

func (reb *Reb) transportECCB(_ *transport.ObjHdr, _ io.ReadCloser, _ any, _ error) {
	reb.onAir.Dec()
}

// Saves the received CT to a local drive if needed, i.e. when:
// 1. a full object/replica is received, or
// 2. a CT is received and this target is not the default target (which means
//    the CT came from the default target after EC had been rebuilt)
func (reb *Reb) saveCTToDisk(ntfn *stageNtfn, hdr *transport.ObjHdr, data io.Reader) error {
	cos.Assert(ntfn.md != nil)
	var (
		err error
		bck = meta.CloneBck(&hdr.Bck)
	)
	if err := bck.Init(core.T.Bowner()); err != nil {
		return err
	}
	md := ntfn.md.NewPack()
	if ntfn.md.SliceID != 0 {
		args := &ec.WriteArgs{Reader: data, MD: md, Xact: reb.xctn()}
		err = ec.WriteSliceAndMeta(hdr, args)
	} else {
		var lom *core.LOM
		lom, err = core.AllocLomFromHdr(hdr)
		if err == nil {
			args := &ec.WriteArgs{Reader: data, MD: md, Cksum: hdr.ObjAttrs.Cksum, Xact: reb.xctn()}
			err = ec.WriteReplicaAndMeta(lom, args)
		}
		core.FreeLOM(lom)
	}
	return err
}

// Used when a slice conflict is detected: a target receives a new slice while it
// already has a slice of the same generation with a different ID.
func (*Reb) renameAsWorkFile(ct *core.CT) (string, error) {
	fqn := ct.Make(fs.WorkfileType)
	// Using os.Rename is safe as both the CT and the workfile are on the same mountpath
	if err := os.Rename(ct.FQN(), fqn); err != nil {
		return "", err
	}
	return fqn, nil
}
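// For reference, a sketch of the two ways sendFromDisk is invoked (the second call
// is made by the receive-side handler, which lives outside this file); `moveTo` and
// `workFQN` are assumed to come from renameLocalCT further below:
//
//	// regular EC rebalance: move the CT to its HRW target (action rebActRebCT)
//	err := reb.sendFromDisk(ct, md, hrwTarget)
//
//	// slice-conflict resolution: ship the previously renamed local CT to a free
//	// target (passing workFQN switches the action to rebActMoveCT)
//	err := reb.sendFromDisk(ct, md, moveTo, workFQN)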
// Finds a target that has either an obsolete slice or no slice of the object.
// Used to resolve a conflict: this target is the "main" one (it has a full
// replica) but it also stores a slice of the object. So, the existing slice
// goes to any other _free_ target.
func (reb *Reb) findEmptyTarget(md *ec.Metadata, ct *core.CT, sender string) (*meta.Snode, error) {
	var (
		sliceCnt     = md.Data + md.Parity + 2
		smap         = reb.smap.Load()
		hrwList, err = smap.HrwTargetList(ct.Bck().MakeUname(ct.ObjectName()), sliceCnt)
	)
	if err != nil {
		return nil, err
	}
	for _, tsi := range hrwList {
		if tsi.ID() == sender || tsi.ID() == core.T.SID() {
			continue
		}
		remoteMD, err := ec.RequestECMeta(ct.Bucket(), ct.ObjectName(), tsi, core.T.DataClient())
		if remoteMD != nil && remoteMD.Generation < md.Generation {
			return tsi, nil
		}
		if remoteMD != nil && remoteMD.Generation == md.Generation {
			_, ok := md.Daemons[tsi.ID()]
			if !ok {
				// ct.ObjectName()[remoteMD.SliceID] not found (new slice md.SliceID)
				return tsi, nil
			}
		}
		if err != nil && cos.IsNotExist(err, 0) {
			return tsi, nil
		}
		if err != nil {
			nlog.Errorf("Failed to read metadata from %s: %v", tsi.StringEx(), err)
		}
	}
	return nil, errors.New("no _free_ targets")
}

// Checks whether this target has metadata for the received CT
func detectLocalCT(req *stageNtfn, ct *core.CT) (*ec.Metadata, error) {
	if req.action == rebActMoveCT {
		// internal CT move after a slice conflict - always save
		return nil, nil
	}
	if _, ok := req.md.Daemons[core.T.SID()]; !ok {
		return nil, nil
	}
	mdCT, err := core.NewCTFromBO(ct.Bck().Bucket(), ct.ObjectName(), core.T.Bowner(), fs.ECMetaType)
	if err != nil {
		return nil, err
	}
	locMD, err := ec.LoadMetadata(mdCT.FQN())
	if err != nil && os.IsNotExist(err) {
		err = nil
	}
	return locMD, err
}

// When a target receives a slice and already has a slice with a different ID:
// - move the local slice to a workfile directory
// - return the Snode that must receive the local slice, and the workfile path
// - the caller saves the received CT to local drives and then sends the workfile
func (reb *Reb) renameLocalCT(req *stageNtfn, ct *core.CT, md *ec.Metadata) (
	workFQN string, moveTo *meta.Snode, err error) {
	if md == nil || req.action == rebActMoveCT {
		return
	}
	if md.SliceID == 0 || md.SliceID == req.md.SliceID || req.md.Generation != md.Generation {
		return
	}
	if workFQN, err = reb.renameAsWorkFile(ct); err != nil {
		return
	}
	if moveTo, err = reb.findEmptyTarget(md, ct, req.daemonID); err != nil {
		if errMv := os.Rename(workFQN, ct.FQN()); errMv != nil {
			nlog.Errorf("Error restoring slice: %v", errMv)
		}
	}
	return
}
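// A sketch of how a receiving target may chain the helpers above, following their
// doc comments (the actual receive handler lives outside this file); `req`, `ct`,
// `hdr`, and `data` stand for the received stage notification, its CT, the transport
// header, and the payload:
//
//	md, err := detectLocalCT(req, ct) // do we already store a CT of this object?
//	if err != nil {
//		return err
//	}
//	workFQN, moveTo, err := reb.renameLocalCT(req, ct, md) // resolve a slice conflict, if any
//	if err != nil {
//		return err
//	}
//	if err := reb.saveCTToDisk(req, hdr, data); err != nil {
//		return err
//	}
//	if moveTo != nil {
//		return reb.sendFromDisk(ct, md, moveTo, workFQN) // ship the conflicting local CT away
//	}
//	return nil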
func (reb *Reb) walkEC(fqn string, de fs.DirEntry) error {
	xreb := reb.xctn()
	if err := xreb.AbortErr(); err != nil {
		// notify `dir.Walk` to stop iterations
		nlog.Infoln(xreb.Name(), "walk-ec aborted", err)
		return err
	}

	if de.IsDir() {
		return nil
	}

	ct, err := core.NewCTFromFQN(fqn, core.T.Bowner())
	if err != nil {
		return nil
	}
	// do not touch directories for buckets with EC disabled (for now)
	if !ct.Bck().Props.EC.Enabled {
		return filepath.SkipDir
	}

	md, err := ec.LoadMetadata(fqn)
	if err != nil {
		nlog.Warningf("failed to load %q metadata: %v", fqn, err)
		return nil
	}

	// Skip a CT if this target is not the 'main' one
	if md.FullReplica != core.T.SID() {
		return nil
	}

	smap := reb.smap.Load()
	hrwTarget, err := smap.HrwHash2T(ct.Digest())
	if err != nil || hrwTarget.ID() == core.T.SID() {
		return err
	}

	// check that both the slice/replica and the metafile exist
	isReplica := md.SliceID == 0
	var fileFQN string
	if isReplica {
		fileFQN = ct.Make(fs.ObjectType)
	} else {
		fileFQN = ct.Make(fs.ECSliceType)
	}
	if err := cos.Stat(fileFQN); err != nil {
		nlog.Warningf("%s no CT for metadata[%d]: %s", core.T, md.SliceID, fileFQN)
		return nil
	}

	ct, err = core.NewCTFromFQN(fileFQN, core.T.Bowner())
	if err != nil {
		return nil
	}
	return reb.sendFromDisk(ct, md, hrwTarget)
}
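// Putting it together (sender side), as implemented in this file:
//
//	reb.runECjoggers()           // one jogEC goroutine per mountpath (and per configured backend provider)
//	  -> reb.jogEC(mi, bck, wg)  // fs.Walk over %mt (fs.ECMetaType) with reb.walkEC as the callback
//	    -> reb.walkEC(fqn, de)   // load ec.Metadata, apply the skip rules, pick the HRW target
//	      -> reb.sendFromDisk(ct, md, hrwTarget)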