github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/reb/recv.go (about) 1 // Package reb provides global cluster-wide rebalance upon adding/removing storage nodes. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package reb 6 7 import ( 8 "bytes" 9 "fmt" 10 "io" 11 "os" 12 13 "github.com/NVIDIA/aistore/cmn" 14 "github.com/NVIDIA/aistore/cmn/cos" 15 "github.com/NVIDIA/aistore/cmn/debug" 16 "github.com/NVIDIA/aistore/cmn/nlog" 17 "github.com/NVIDIA/aistore/core" 18 "github.com/NVIDIA/aistore/core/meta" 19 "github.com/NVIDIA/aistore/ec" 20 "github.com/NVIDIA/aistore/fs" 21 "github.com/NVIDIA/aistore/transport" 22 ) 23 24 // TODO: currently, cannot return errors from the receive handlers, here and elsewhere 25 // (see `_regRecv` for "static lifecycle") 26 27 func (reb *Reb) _recvErr(err error) error { 28 if err == nil { 29 return err 30 } 31 if xreb := reb.xctn(); xreb != nil { 32 xreb.Abort(err) 33 } 34 return nil 35 } 36 37 func (reb *Reb) recvObj(hdr *transport.ObjHdr, objReader io.Reader, err error) error { 38 defer transport.DrainAndFreeReader(objReader) 39 if err != nil { 40 nlog.Errorln(err) 41 return err 42 } 43 44 smap, err := reb._waitForSmap() 45 if err != nil { 46 return reb._recvErr(err) 47 } 48 unpacker := cos.NewUnpacker(hdr.Opaque) 49 act, err := unpacker.ReadByte() 50 if err != nil { 51 nlog.Errorf("Failed to read message type: %v", err) 52 return reb._recvErr(err) 53 } 54 if act == rebMsgRegular { 55 err := reb.recvObjRegular(hdr, smap, unpacker, objReader) 56 return reb._recvErr(err) 57 } 58 debug.Assertf(act == rebMsgEC, "act=%d", act) 59 err = reb.recvECData(hdr, unpacker, objReader) 60 return reb._recvErr(err) 61 } 62 63 func (reb *Reb) recvAck(hdr *transport.ObjHdr, _ io.Reader, err error) error { 64 if err != nil { 65 nlog.Errorln(err) 66 return err 67 } 68 69 unpacker := cos.NewUnpacker(hdr.Opaque) 70 act, err := unpacker.ReadByte() 71 if err != nil { 72 err = fmt.Errorf("failed to read message type: %v", err) 73 return reb._recvErr(err) 74 } 75 if act == rebMsgEC { 76 err := reb.recvECAck(hdr, unpacker) 77 return reb._recvErr(err) 78 } 79 debug.Assertf(act == rebMsgRegular, "act=%d", act) 80 err = reb.recvRegularAck(hdr, unpacker) 81 return reb._recvErr(err) 82 } 83 84 func (reb *Reb) recvStageNtfn(hdr *transport.ObjHdr, _ io.Reader, errRx error) error { 85 if errRx != nil { 86 nlog.Errorf("%s: %v", core.T, errRx) 87 return errRx 88 } 89 ntfn, err := reb.decodeStageNtfn(hdr.Opaque) 90 if err != nil { 91 return reb._recvErr(err) 92 } 93 94 var ( 95 rebID = reb.RebID() 96 rsmap = reb.smap.Load() 97 otherStage = stages[ntfn.stage] 98 xreb = reb.xctn() 99 ) 100 if xreb == nil { 101 if reb.stages.stage.Load() != rebStageInactive { 102 nlog.Errorf("%s: nil rebalancing xaction", reb.logHdr(rebID, rsmap)) 103 } 104 return nil 105 } 106 if xreb.IsAborted() { 107 return nil 108 } 109 110 // TODO: see "static lifecycle" comment above 111 112 // eq 113 if rebID == ntfn.rebID { 114 reb.stages.setStage(ntfn.daemonID, ntfn.stage) 115 if ntfn.stage == rebStageAbort { 116 err := fmt.Errorf("abort stage notification from %s(%s)", meta.Tname(ntfn.daemonID), otherStage) 117 xreb.Abort(cmn.NewErrAborted(xreb.Name(), reb.logHdr(rebID, rsmap), err)) 118 } 119 return nil 120 } 121 // other's old 122 if rebID > ntfn.rebID { 123 nlog.Warningf("%s: stage notification from %s(%s): %s", reb.logHdr(rebID, rsmap), 124 meta.Tname(ntfn.daemonID), otherStage, reb.warnID(ntfn.rebID, ntfn.daemonID)) 125 return nil 126 } 127 128 xreb.Abort(cmn.NewErrAborted(xreb.Name(), reb.logHdr(rebID, rsmap), err)) 129 return nil 130 } 131 132 // 133 // regular (non-EC) receive 134 // 135 136 func (reb *Reb) recvObjRegular(hdr *transport.ObjHdr, smap *meta.Smap, unpacker *cos.ByteUnpack, objReader io.Reader) error { 137 ack := ®ularAck{} 138 if err := unpacker.ReadAny(ack); err != nil { 139 nlog.Errorf("Failed to parse ACK: %v", err) 140 return err 141 } 142 if ack.rebID != reb.RebID() { 143 nlog.Warningf("received %s: %s", hdr.Cname(), reb.warnID(ack.rebID, ack.daemonID)) 144 return nil 145 } 146 tsid := ack.daemonID // the sender 147 // Rx 148 lom := core.AllocLOM(hdr.ObjName) 149 defer core.FreeLOM(lom) 150 if err := lom.InitBck(&hdr.Bck); err != nil { 151 nlog.Errorln(err) 152 return nil 153 } 154 if stage := reb.stages.stage.Load(); stage >= rebStageFin { 155 reb.laterx.Store(true) 156 if stage > rebStageFin && cmn.Rom.FastV(4, cos.SmoduleReb) { 157 nlog.Infof("Warning: %s: post stage-fin receive from %s %s (stage %s)", 158 core.T.Snode(), meta.Tname(tsid), lom, stages[stage]) 159 } 160 } else if stage < rebStageTraverse { 161 nlog.Errorf("%s: early receive from %s %s (stage %s)", core.T, meta.Tname(tsid), lom, stages[stage]) 162 } 163 lom.CopyAttrs(&hdr.ObjAttrs, true /*skip-checksum*/) // see "PUT is a no-op" 164 xreb := reb.xctn() 165 if xreb.IsAborted() { 166 return nil 167 } 168 params := core.AllocPutParams() 169 { 170 params.WorkTag = fs.WorkfilePut 171 params.Reader = io.NopCloser(objReader) 172 params.OWT = cmn.OwtRebalance 173 params.Cksum = hdr.ObjAttrs.Cksum 174 params.Atime = lom.Atime() 175 params.Xact = xreb 176 } 177 erp := core.T.PutObject(lom, params) 178 core.FreePutParams(params) 179 if erp != nil { 180 nlog.Errorln(erp) 181 return erp 182 } 183 // stats 184 xreb.InObjsAdd(1, hdr.ObjAttrs.Size) 185 186 // ACK 187 tsi := smap.GetTarget(tsid) 188 if tsi == nil { 189 err := fmt.Errorf("%s is not in the %s", meta.Tname(tsid), smap) 190 nlog.Errorln(err) 191 return err 192 } 193 if stage := reb.stages.stage.Load(); stage < rebStageFinStreams && stage != rebStageInactive { 194 ack := ®ularAck{rebID: reb.RebID(), daemonID: core.T.SID()} 195 hdr.Opaque = ack.NewPack() 196 hdr.ObjAttrs.Size = 0 197 if err := reb.dm.ACK(hdr, nil, tsi); err != nil { 198 nlog.Errorln(err) 199 return err 200 } 201 } 202 return nil 203 } 204 205 func (reb *Reb) recvRegularAck(hdr *transport.ObjHdr, unpacker *cos.ByteUnpack) error { 206 ack := ®ularAck{} 207 if err := unpacker.ReadAny(ack); err != nil { 208 nlog.Errorf("Failed to parse ACK: %v", err) 209 return err 210 } 211 if ack.rebID != reb.rebID.Load() { 212 nlog.Warningf("ACK from %s: %s", ack.daemonID, reb.warnID(ack.rebID, ack.daemonID)) 213 return nil 214 } 215 216 lom := core.AllocLOM(hdr.ObjName) 217 if err := lom.InitBck(&hdr.Bck); err != nil { 218 core.FreeLOM(lom) 219 nlog.Errorln(err) 220 return nil 221 } 222 223 // No immediate file deletion: let LRU cleanup the "misplaced" object 224 // TODO: mark the object "Deleted" 225 226 reb.delLomAck(lom, ack.rebID, true /*free pending (orig) transmitted LOM*/) 227 core.FreeLOM(lom) 228 return nil 229 } 230 231 // 232 // EC receive 233 // 234 235 func (*Reb) recvECAck(hdr *transport.ObjHdr, unpacker *cos.ByteUnpack) (err error) { 236 ack := &ecAck{} 237 err = unpacker.ReadAny(ack) 238 if err != nil { 239 nlog.Errorf("Failed to unmarshal EC ACK for %s: %v", hdr.Cname(), err) 240 } 241 return 242 } 243 244 // Receive MD update. Handling includes partially updating local information: 245 // only the list of daemons and the _main_ target. 246 func receiveMD(req *stageNtfn, hdr *transport.ObjHdr) error { 247 ctMeta, err := core.NewCTFromBO(&hdr.Bck, hdr.ObjName, core.T.Bowner(), fs.ECMetaType) 248 if err != nil { 249 return err 250 } 251 md, err := ec.LoadMetadata(ctMeta.FQN()) 252 if err != nil { 253 if os.IsNotExist(err) { 254 err = nil 255 } 256 return err 257 } 258 if md.Generation != req.md.Generation { 259 return nil 260 } 261 md.FullReplica = req.md.FullReplica 262 md.Daemons = req.md.Daemons 263 mdBytes := md.NewPack() 264 265 return ctMeta.Write(bytes.NewReader(mdBytes), -1) 266 } 267 268 func (reb *Reb) receiveCT(req *stageNtfn, hdr *transport.ObjHdr, reader io.Reader) error { 269 ct, err := core.NewCTFromBO(&hdr.Bck, hdr.ObjName, core.T.Bowner(), fs.ECSliceType) 270 if err != nil { 271 return err 272 } 273 md, err := detectLocalCT(req, ct) 274 if err != nil { 275 nlog.Errorf("%s: %v", ct.FQN(), err) 276 return err 277 } 278 // Fix the metadata: update CT locations 279 delete(req.md.Daemons, req.daemonID) 280 if md != nil && req.md.Generation < md.Generation { 281 // Local CT is newer - do not save anything 282 return nil 283 } 284 // Check for slice conflict 285 workFQN, moveTo, err := reb.renameLocalCT(req, ct, md) 286 if err != nil { 287 return err 288 } 289 req.md.FullReplica = core.T.SID() 290 req.md.Daemons[core.T.SID()] = uint16(req.md.SliceID) 291 if moveTo != nil { 292 req.md.Daemons[moveTo.ID()] = uint16(md.SliceID) 293 } 294 // Save received CT to local drives 295 err = reb.saveCTToDisk(req, hdr, reader) 296 if err != nil { 297 if errRm := os.Remove(ct.FQN()); errRm != nil { 298 nlog.Errorf("Failed to remove %s: %v", ct.FQN(), errRm) 299 } 300 if moveTo != nil { 301 if errMv := os.Rename(workFQN, ct.FQN()); errMv != nil { 302 nlog.Errorf("Error restoring slice: %v", errMv) 303 } 304 } 305 return err 306 } 307 // Send local slice 308 if moveTo != nil { 309 req.md.SliceID = md.SliceID 310 if err = reb.sendFromDisk(ct, req.md, moveTo, workFQN); err != nil { 311 nlog.Errorf("Failed to move slice to %s: %v", moveTo, err) 312 } 313 } 314 // Broadcast updated MD 315 ntfnMD := stageNtfn{daemonID: core.T.SID(), stage: rebStageTraverse, rebID: reb.rebID.Load(), md: req.md, action: rebActUpdateMD} 316 nodes := req.md.RemoteTargets() 317 for _, tsi := range nodes { 318 if moveTo != nil && moveTo.ID() == tsi.ID() { 319 continue 320 } 321 reb.onAir.Inc() 322 xreb := reb.xctn() 323 if xreb.IsAborted() { 324 break 325 } 326 o := transport.AllocSend() 327 o.Hdr = transport.ObjHdr{ObjName: ct.ObjectName(), ObjAttrs: cmn.ObjAttrs{Size: 0}} 328 o.Hdr.Bck.Copy(ct.Bck().Bucket()) 329 o.Hdr.Opaque = ntfnMD.NewPack(rebMsgEC) 330 o.Callback = reb.transportECCB 331 if errSend := reb.dm.Send(o, nil, tsi); errSend != nil && err == nil { 332 err = fmt.Errorf("failed to send updated metafile: %v", err) 333 } 334 } 335 return err 336 } 337 338 // receiving EC CT 339 func (reb *Reb) recvECData(hdr *transport.ObjHdr, unpacker *cos.ByteUnpack, reader io.Reader) error { 340 req := &stageNtfn{} 341 err := unpacker.ReadAny(req) 342 if err != nil { 343 nlog.Errorf("invalid stage notification %s: %v", hdr.ObjName, err) 344 return err 345 } 346 if req.rebID != reb.rebID.Load() { 347 nlog.Warningf("%s: not yet started or already finished rebalancing (%d, %d)", 348 core.T.Snode(), req.rebID, reb.rebID.Load()) 349 return nil 350 } 351 if req.action == rebActUpdateMD { 352 err := receiveMD(req, hdr) 353 if err != nil { 354 nlog.Errorf("failed to receive MD for %s: %v", hdr.Cname(), err) 355 nlog.Errorf("Warning: (g%d, %s) ignoring, proceeding anyway...", req.rebID, core.T) // TODO: revisit 356 } 357 return nil 358 } 359 if err := reb.receiveCT(req, hdr, reader); err != nil { 360 nlog.Errorf("failed to receive CT for %s: %v", hdr.Cname(), err) 361 return err 362 } 363 return nil 364 }