github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ec/getjogger.go

// Package ec provides erasure coding (EC) based data protection for AIStore.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package ec

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
	"sync"
	"time"

	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/memsys"
	"github.com/NVIDIA/aistore/transport"
	"github.com/klauspost/reedsolomon"
)

type (
	// Mountpath getJogger: processes GET requests to one mountpath
	getJogger struct {
		parent *XactGet
		client *http.Client
		mpath  string // Mountpath that the jogger manages

		workCh chan *request // Channel to request TOP priority operation (restore)
		stopCh cos.StopCh    // Jogger management channel: to stop it
	}
	restoreCtx struct {
		lom      *core.LOM            // replica
		meta     *Metadata            // restored object's EC metafile
		nodes    map[string]*Metadata // EC metafiles downloaded from other targets
		slices   []*slice             // slices downloaded from other targets
		idToNode map[int]string       // existing sliceID <-> target
		toDisk   bool                 // use memory or disk for temporary files
	}
)

var (
	restoreCtxPool  sync.Pool
	emptyRestoreCtx restoreCtx
)

func allocRestoreCtx() (ctx *restoreCtx) {
	if v := restoreCtxPool.Get(); v != nil {
		ctx = v.(*restoreCtx)
	} else {
		ctx = &restoreCtx{}
	}
	return
}

func freeRestoreCtx(ctx *restoreCtx) {
	*ctx = emptyRestoreCtx
	restoreCtxPool.Put(ctx)
}

func (c *getJogger) newCtx(req *request) (*restoreCtx, error) {
	lom, err := req.LIF.LOM()
	if err != nil {
		return nil, err
	}
	ctx := allocRestoreCtx()
	ctx.toDisk = useDisk(0 /*size of the original object is unknown*/, c.parent.config)
	ctx.lom = lom
	err = lom.Load(true /*cache it*/, false /*locked*/)
	if os.IsNotExist(err) {
		err = nil
	}
	return ctx, err
}

func (*getJogger) freeCtx(ctx *restoreCtx) {
	core.FreeLOM(ctx.lom)
	freeRestoreCtx(ctx)
}

func (c *getJogger) run() {
	nlog.Infof("started EC for mountpath: %s, bucket %s", c.mpath, c.parent.bck)

	for {
		select {
		case req := <-c.workCh:
			c.parent.stats.updateWaitTime(time.Since(req.tm))
			req.tm = time.Now()
			c.parent.IncPending()
			c.ec(req)
			c.parent.DecPending()
			freeReq(req)
		case <-c.stopCh.Listen():
			return
		}
	}
}

func (c *getJogger) stop() {
	nlog.Infof("stopping EC for mountpath: %s, bucket: %s", c.mpath, c.parent.bck)
	c.stopCh.Close()
}

// Finalize the EC restore: report an error to the caller, do housekeeping.
func (*getJogger) finalizeReq(req *request, err error) {
	if err != nil {
		nlog.Errorf("Error restoring %s: %v", req.LIF.Uname, err)
	}
	if req.ErrCh != nil {
		if err != nil {
			req.ErrCh <- err
		}
		close(req.ErrCh)
	}
}

func (c *getJogger) ec(req *request) {
	debug.Assert(req.Action == ActRestore)
	ctx, err := c.newCtx(req)
	if ctx == nil {
		debug.Assert(err != nil)
		return
	}
	if err == nil {
		err = c.restore(ctx)
		c.parent.stats.updateDecodeTime(time.Since(req.tm), err != nil)
	}
	if err == nil {
		c.parent.stats.updateObjTime(time.Since(req.putTime))
		err = ctx.lom.Persist()
	}
	c.freeCtx(ctx)
	c.finalizeReq(req, err)
}

// The final step of the replica restoration process: the main target detects which
// nodes do not have replicas and then runs the respective replications.
// * reader - replica content to send to remote targets
func (c *getJogger) copyMissingReplicas(ctx *restoreCtx, reader cos.ReadOpenCloser) error {
	if err := ctx.lom.Load(false /*cache it*/, false /*locked*/); err != nil {
		return err
	}
	smap := core.T.Sowner().Get()
	targets, err := smap.HrwTargetList(ctx.lom.Uname(), ctx.meta.Parity+1)
	if err != nil {
		return err
	}

	// Fill the list of daemonIDs that do not have a replica
	daemons := make([]string, 0, len(targets))
	for _, target := range targets {
		if target.ID() == core.T.SID() {
			continue
		}

		if _, ok := ctx.nodes[target.ID()]; !ok {
			daemons = append(daemons, target.ID())
		}
	}

	// If any target lost its replica, send the replica to it and free the allocated
	// memory on completion. Otherwise, free the allocated memory and return immediately.
	if len(daemons) == 0 {
		freeObject(reader)
		return nil
	}

	var srcReader cos.ReadOpenCloser
	switch r := reader.(type) {
	case *memsys.SGL:
		srcReader = memsys.NewReader(r)
	case *cos.FileHandle:
		srcReader, err = cos.NewFileHandle(ctx.lom.FQN)
	default:
		debug.FailTypeCast(reader)
		err = fmt.Errorf("unsupported reader type: %T", reader)
	}

	if err != nil {
		return err
	}

	// _ io.ReadCloser: pass copyMissingReplicas' reader argument (memsys.SGL type)
	// instead of the callback's reader argument (memsys.Reader type) to freeObject.
	// Reason: memsys.Reader does not provide access to the internal memsys.SGL that must be freed.
	cb := func(_ *transport.ObjHdr, _ io.ReadCloser, _ any, err error) {
		if err != nil {
			nlog.Errorf("%s failed to send %s to %v: %v", core.T, ctx.lom, daemons, err)
		}
		freeObject(reader)
	}
	src := &dataSource{
		reader:   srcReader,
		size:     ctx.lom.SizeBytes(),
		metadata: ctx.meta,
		reqType:  reqPut,
	}
	return c.parent.writeRemote(daemons, ctx.lom, src, cb)
}

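// Note (illustrative): replicated objects (meta.IsCopy) have no slices - the object is
// stored whole on meta.Parity+1 HRW-selected targets. With Parity = 2, for example, three
// targets keep full copies; a lost local copy is repaired by reading any surviving replica
// (restoreReplicatedFrom{Memory,Disk} below) and re-sending it, via copyMissingReplicas
// above, to whichever of those targets currently has no copy.
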
func (c *getJogger) restoreReplicatedFromMemory(ctx *restoreCtx) error {
	var (
		writer *memsys.SGL
	)
	// Try to read the replica from targets one by one until it is downloaded
	for node := range ctx.nodes {
		uname := unique(node, ctx.lom.Bck(), ctx.lom.ObjName)
		iReqBuf := newIntraReq(reqGet, ctx.meta, ctx.lom.Bck()).NewPack(g.smm)

		w := g.smm.NewSGL(cos.KiB)
		if _, err := c.parent.readRemote(ctx.lom, node, uname, iReqBuf, w); err != nil {
			nlog.Errorf("%s failed to read from %s", core.T, node)
			w.Free()
			g.smm.Free(iReqBuf)
			w = nil
			continue
		}
		g.smm.Free(iReqBuf)
		if w.Size() != 0 {
			// A valid replica is found - break and do not free the SGL
			writer = w
			break
		}
		w.Free()
	}
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Found meta -> obj get %s, writer found: %v", ctx.lom, writer != nil)
	}

	if writer == nil {
		return errors.New("failed to read a replica from any target")
	}

	ctx.lom.SetSize(writer.Size())
	args := &WriteArgs{
		Reader:     memsys.NewReader(writer),
		MD:         ctx.meta.NewPack(),
		Cksum:      cos.NewCksum(ctx.meta.CksumType, ctx.meta.CksumValue),
		Generation: ctx.meta.Generation,
		Xact:       c.parent,
	}
	if err := WriteReplicaAndMeta(ctx.lom, args); err != nil {
		writer.Free()
		return err
	}

	err := c.copyMissingReplicas(ctx, writer)
	if err != nil {
		writer.Free()
	}
	return err
}

func (c *getJogger) restoreReplicatedFromDisk(ctx *restoreCtx) error {
	var (
		writer *os.File
		n      int64
	)
	// Try to read a replica from targets one by one until it is downloaded
	tmpFQN := fs.CSM.Gen(ctx.lom, fs.WorkfileType, "ec-restore-repl")

	for node := range ctx.nodes {
		uname := unique(node, ctx.lom.Bck(), ctx.lom.ObjName)

		w, err := ctx.lom.CreateFile(tmpFQN)
		if err != nil {
			nlog.Errorf("Failed to create file: %v", err)
			break
		}
		iReqBuf := newIntraReq(reqGet, ctx.meta, ctx.lom.Bck()).NewPack(g.smm)
		n, err = c.parent.readRemote(ctx.lom, node, uname, iReqBuf, w)
		g.smm.Free(iReqBuf)

		if err == nil && n != 0 {
			// A valid replica is found - flush and close the file handle, then break
			err = cos.FlushClose(w)
			if err != nil {
				nlog.Errorf("Failed to flush and close: %v", err)
				break
			}
			ctx.lom.SetSize(n)
			writer = w
			break
		}

		cos.Close(w)
		errRm := cos.RemoveFile(tmpFQN)
		debug.AssertNoErr(errRm)
	}
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Found meta -> obj get %s, writer found: %v", ctx.lom, writer != nil)
	}

	if writer == nil {
		return errors.New("failed to read a replica from any target")
	}
	if err := ctx.lom.RenameFrom(tmpFQN); err != nil {
		return err
	}

	if err := ctx.lom.Persist(); err != nil {
		return err
	}

	b := cos.MustMarshal(ctx.meta)
	ctMeta := core.NewCTFromLOM(ctx.lom, fs.ECMetaType)
	if err := ctMeta.Write(bytes.NewReader(b), -1); err != nil {
		return err
	}
	if _, exists := core.T.Bowner().Get().Get(ctMeta.Bck()); !exists {
		if errRm := cos.RemoveFile(ctMeta.FQN()); errRm != nil {
			nlog.Errorf("nested error: save restored replica -> remove metafile: %v", errRm)
		}
		return fmt.Errorf("%s metafile saved while bucket %s was being destroyed", ctMeta.ObjectName(), ctMeta.Bucket())
	}

	reader, err := cos.NewFileHandle(ctx.lom.FQN)
	if err != nil {
		return err
	}
	err = c.copyMissingReplicas(ctx, reader)
	if err != nil {
		freeObject(reader)
	}
	return err
}

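// Note (illustrative): erasure-coded objects are restored by the pipeline driven by
// restoreEncoded further below: requestSlices pulls the surviving slices, restoreMainObj
// runs Reed-Solomon reconstruction and writes the main replica, and uploadRestoredSlices
// redistributes the rebuilt slices. Slice IDs 1..Data are data slices and
// Data+1..Data+Parity are parity slices (SliceID 0 denotes a full replica); any Data of
// the Data+Parity slices suffice, which is what restore() checks before starting.
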
// The main object was not found, and it is clear that it was erasure-coded. Request
// all data and parity slices from the targets in the cluster.
func (c *getJogger) requestSlices(ctx *restoreCtx) error {
	var (
		wgSlices = cos.NewTimeoutGroup()
		sliceCnt = ctx.meta.Data + ctx.meta.Parity
		daemons  = make([]string, 0, len(ctx.nodes)) // Targets to be requested for slices
	)
	ctx.slices = make([]*slice, sliceCnt)
	ctx.idToNode = make(map[int]string)

	for k, v := range ctx.nodes {
		if v.SliceID < 1 || v.SliceID > sliceCnt {
			nlog.Warningf("Node %s has invalid slice ID %d", k, v.SliceID)
			continue
		}

		if cmn.Rom.FastV(4, cos.SmoduleEC) {
			nlog.Infof("Slice %s[%d] requesting from %s", ctx.lom, v.SliceID, k)
		}
		var writer *slice
		if ctx.toDisk {
			prefix := fmt.Sprintf("ec-restore-%d", v.SliceID)
			fqn := fs.CSM.Gen(ctx.lom, fs.WorkfileType, prefix)
			fh, err := ctx.lom.CreateFile(fqn)
			if err != nil {
				return err
			}
			writer = &slice{
				writer:  fh,
				twg:     wgSlices,
				workFQN: fqn,
			}
		} else {
			writer = &slice{
				writer: g.pmm.NewSGL(cos.KiB * 512),
				twg:    wgSlices,
			}
		}
		ctx.slices[v.SliceID-1] = writer
		ctx.idToNode[v.SliceID] = k
		wgSlices.Add(1)
		uname := unique(k, ctx.lom.Bck(), ctx.lom.ObjName)
		if c.parent.regWriter(uname, writer) {
			daemons = append(daemons, k)
		}
	}

	iReq := newIntraReq(reqGet, ctx.meta, ctx.lom.Bck())
	iReq.isSlice = true
	request := iReq.NewPack(g.smm)
	hdr := transport.ObjHdr{
		ObjName: ctx.lom.ObjName,
		Opaque:  request,
		Opcode:  reqGet,
	}
	hdr.Bck.Copy(ctx.lom.Bucket())

	o := transport.AllocSend()
	o.Hdr = hdr

	// Broadcast the slice request and wait for the targets to respond
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Requesting daemons %v for slices of %s", daemons, ctx.lom)
	}
	if err := c.parent.sendByDaemonID(daemons, o, nil, true); err != nil {
		freeSlices(ctx.slices)
		g.smm.Free(request)
		return err
	}
	if wgSlices.WaitTimeout(c.parent.config.Timeout.SendFile.D()) {
		nlog.Errorf("%s timed out waiting for %s slices", core.T, ctx.lom)
	}
	g.smm.Free(request)
	return nil
}

func newSliceWriter(ctx *restoreCtx, writers []io.Writer, restored []*slice,
	cksums []*cos.CksumHash, cksumType string, idx int, sliceSize int64) error {
	if ctx.toDisk {
		prefix := fmt.Sprintf("ec-rebuild-%d", idx)
		fqn := fs.CSM.Gen(ctx.lom, fs.WorkfileType, prefix)
		file, err := ctx.lom.CreateFile(fqn)
		if err != nil {
			return err
		}
		if cksumType != cos.ChecksumNone {
			cksums[idx] = cos.NewCksumHash(cksumType)
			writers[idx] = cos.NewWriterMulti(cksums[idx].H, file)
		} else {
			writers[idx] = file
		}
		restored[idx] = &slice{workFQN: fqn, n: sliceSize}
	} else {
		sgl := g.pmm.NewSGL(sliceSize)
		restored[idx] = &slice{obj: sgl, n: sliceSize}
		if cksumType != cos.ChecksumNone {
			cksums[idx] = cos.NewCksumHash(cksumType)
			writers[idx] = cos.NewWriterMulti(cksums[idx].H, sgl)
		} else {
			writers[idx] = sgl
		}
	}

	// Slice IDs start from 1, hence the `+1`
	delete(ctx.idToNode, idx+1)

	return nil
}

func cksumSlice(reader io.Reader, recvCksum *cos.Cksum, objName string) error {
	cksumType := recvCksum.Type()
	if cksumType == cos.ChecksumNone {
		return nil
	}
	_, actualCksum, err := cos.CopyAndChecksum(io.Discard, reader, nil, cksumType)
	if err != nil {
		return fmt.Errorf("failed to checksum: %v", err)
	}
	if !actualCksum.Equal(recvCksum) {
		err = cos.NewErrDataCksum(recvCksum, &actualCksum.Cksum, objName)
	}
	return err
}

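// Sizing example (assuming SliceSize, defined elsewhere in this package, is in effect a
// ceiling division of the object size by meta.Data - the reedsolomon streaming API used
// below requires equal-length shards): a 10MiB object with Data = 4 and Parity = 2 is
// split into four 2.5MiB data slices plus two 2.5MiB parity slices.
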
// Reconstruct the main object from slices. Returns the list of reconstructed slices.
func (c *getJogger) restoreMainObj(ctx *restoreCtx) ([]*slice, error) {
	var (
		err       error
		sliceCnt  = ctx.meta.Data + ctx.meta.Parity
		sliceSize = SliceSize(ctx.meta.Size, ctx.meta.Data)
		readers   = make([]io.Reader, sliceCnt)
		writers   = make([]io.Writer, sliceCnt)
		restored  = make([]*slice, sliceCnt)
		cksums    = make([]*cos.CksumHash, sliceCnt)
		cksumType = ctx.lom.CksumType()
	)

	// Allocate resources for the reconstructed (missing) slices.
	for i, sl := range ctx.slices {
		if sl != nil && sl.writer != nil {
			if cmn.Rom.FastV(4, cos.SmoduleEC) {
				nlog.Infof("Got slice %d size %d (want %d) of %s", i+1, sl.n, sliceSize, ctx.lom)
			}
			if sl.n == 0 {
				freeObject(sl.obj)
				sl.obj = nil
				freeObject(sl.writer)
				sl.writer = nil
			}
		}
		if sl == nil || sl.writer == nil {
			err = newSliceWriter(ctx, writers, restored, cksums, cksumType, i, sliceSize)
			if err != nil {
				break
			}
			continue
		}

		var cksmReader io.Reader
		if sgl, ok := sl.writer.(*memsys.SGL); ok {
			readers[i] = memsys.NewReader(sgl)
			cksmReader = memsys.NewReader(sgl)
		} else if sl.workFQN != "" {
			readers[i], err = cos.NewFileHandle(sl.workFQN)
			cksmReader, _ = cos.NewFileHandle(sl.workFQN)
			if err != nil {
				break
			}
		} else {
			debug.FailTypeCast(sl.writer)
			err = fmt.Errorf("unsupported slice source: %T", sl.writer)
			break
		}

		errCksum := cksumSlice(cksmReader, sl.cksum, ctx.lom.ObjName)
		if errCksum != nil {
			nlog.Errorf("error slice %d: %v", i, errCksum)
			err = newSliceWriter(ctx, writers, restored, cksums, cksumType, i, sliceSize)
			if err != nil {
				break
			}
			readers[i] = nil
		}
	}

	if err != nil {
		return restored, err
	}

	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Reconstructing %s", ctx.lom)
	}
	stream, err := reedsolomon.NewStreamC(ctx.meta.Data, ctx.meta.Parity, true, true)
	if err != nil {
		return restored, err
	}

	if err := stream.Reconstruct(readers, writers); err != nil {
		return restored, err
	}

	for idx, rst := range restored {
		if rst == nil {
			continue
		}
		if cksums[idx] != nil {
			cksums[idx].Finalize()
			rst.cksum = cksums[idx].Clone()
		}
	}

	version := ""
	srcReaders := make([]io.Reader, ctx.meta.Data)
	for i := range ctx.meta.Data {
		if ctx.slices[i] != nil && ctx.slices[i].writer != nil {
			if version == "" {
				version = ctx.slices[i].version
			}
			if sgl, ok := ctx.slices[i].writer.(*memsys.SGL); ok {
				srcReaders[i] = memsys.NewReader(sgl)
			} else {
				if ctx.slices[i].workFQN == "" {
					return restored, fmt.Errorf("invalid writer: %T", ctx.slices[i].writer)
				}
				srcReaders[i], err = cos.NewFileHandle(ctx.slices[i].workFQN)
				if err != nil {
					return restored, err
				}
			}
			continue
		}

		debug.Assert(restored[i] != nil)
		if version == "" {
			version = restored[i].version
		}
		if restored[i].workFQN != "" {
			srcReaders[i], err = cos.NewFileHandle(restored[i].workFQN)
			if err != nil {
				return restored, err
			}
		} else {
			sgl, ok := restored[i].obj.(*memsys.SGL)
			if !ok {
				return restored, fmt.Errorf("empty slice %s[%d]", ctx.lom, i)
			}
			srcReaders[i] = memsys.NewReader(sgl)
		}
	}

	src := io.MultiReader(srcReaders...)
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Saving main object %s to %q", ctx.lom, ctx.lom.FQN)
	}

	if version != "" {
		ctx.lom.SetVersion(version)
	}
	ctx.lom.SetSize(ctx.meta.Size)
	mainMeta := *ctx.meta
	mainMeta.SliceID = 0
	args := &WriteArgs{
		Reader:     src,
		MD:         mainMeta.NewPack(),
		Cksum:      cos.NewCksum(cksumType, ""),
		Generation: mainMeta.Generation,
		Xact:       c.parent,
	}
	err = WriteReplicaAndMeta(ctx.lom, args)
	return restored, err
}

// Look for the first non-nil slice in the list, starting from the index `start`.
// Returns the slice and the index to pass as `start` on the next call.
func getNextNonEmptySlice(slices []*slice, start int) (*slice, int) {
	i := max(0, start)
	for i < len(slices) && slices[i] == nil {
		i++
	}
	if i == len(slices) {
		return nil, i
	}
	return slices[i], i + 1
}

// Return a list of target IDs that do not have slices yet.
func (*getJogger) emptyTargets(ctx *restoreCtx) ([]string, error) {
	sliceCnt := ctx.meta.Data + ctx.meta.Parity
	nodeToID := make(map[string]int, len(ctx.idToNode))
	// Transpose the SliceID <-> DaemonID map for faster lookup
	for k, v := range ctx.idToNode {
		nodeToID[v] = k
	}
	// Generate the list of targets that should have a slice.
	smap := core.T.Sowner().Get()
	targets, err := smap.HrwTargetList(ctx.lom.Uname(), sliceCnt+1)
	if err != nil {
		nlog.Warningln(err)
		return nil, err
	}
	empty := make([]string, 0, len(targets))
	for _, t := range targets {
		if t.ID() == core.T.SID() {
			continue
		}
		if _, ok := nodeToID[t.ID()]; ok {
			continue
		}
		empty = append(empty, t.ID())
	}
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Empty nodes for %s are %#v", ctx.lom, empty)
	}
	return empty, nil
}

func (*getJogger) freeSliceFrom(slices []*slice, start int) {
	for sl, sliceID := getNextNonEmptySlice(slices, start); sl != nil; sl, sliceID = getNextNonEmptySlice(slices, sliceID) {
		sl.free()
	}
}

// Upload missing slices to the targets that must have them:
// * slices - object slices reconstructed by `restoreMainObj`
// * idToNode - a map of targets that already contain a slice (SliceID <-> target)
func (c *getJogger) uploadRestoredSlices(ctx *restoreCtx, slices []*slice) error {
	emptyNodes, err := c.emptyTargets(ctx)
	if err != nil || len(emptyNodes) == 0 {
		c.freeSliceFrom(slices, 0)
		return err
	}

	var (
		sliceID   int
		sl        *slice
		remoteErr error
		counter   = atomic.NewInt32(0)
	)
	// First, count the slices and initialize the counter to avoid races when the
	// network is faster than the FS and the transport callback fires before the
	// next slice is sent
	for sl, id := getNextNonEmptySlice(slices, 0); sl != nil; sl, id = getNextNonEmptySlice(slices, id) {
		counter.Inc()
	}
	if counter.Load() == 0 {
		return nil
	}
	// Send the reconstructed slices one by one to the "empty" targets.
	for sl, sliceID = getNextNonEmptySlice(slices, 0); sl != nil && len(emptyNodes) != 0; sl, sliceID = getNextNonEmptySlice(slices, sliceID) {
		tid := emptyNodes[0]
		emptyNodes = emptyNodes[1:]

		// Clone the object's metadata and set the correct SliceID before sending
		sliceMeta := ctx.meta.Clone()
		sliceMeta.SliceID = sliceID
		if sl.cksum != nil {
			sliceMeta.CksumType, sliceMeta.CksumValue = sl.cksum.Get()
		}

		var reader cos.ReadOpenCloser
		if sl.workFQN != "" {
			reader, _ = cos.NewFileHandle(sl.workFQN)
		} else {
			s, ok := sl.obj.(*memsys.SGL)
			debug.Assert(ok)
			reader = memsys.NewReader(s)
		}
		dataSrc := &dataSource{
			reader:   reader,
			size:     sl.n,
			metadata: sliceMeta,
			isSlice:  true,
			reqType:  reqPut,
		}

		if cmn.Rom.FastV(4, cos.SmoduleEC) {
			nlog.Infof("Sending slice %s[%d] to %s", ctx.lom, sliceMeta.SliceID, tid)
		}

		// Every slice's SGL is freed upon transfer completion
		cb := func(daemonID string, s *slice) transport.ObjSentCB {
			return func(_ *transport.ObjHdr, _ io.ReadCloser, _ any, err error) {
				if err != nil {
					nlog.Errorf("%s failed to send %s to %v: %v", core.T, ctx.lom, daemonID, err)
				}
				s.free()
			}
		}(tid, sl)
		if err := c.parent.writeRemote([]string{tid}, ctx.lom, dataSrc, cb); err != nil {
			remoteErr = err
			nlog.Errorf("%s failed to send slice %s[%d] to %s", core.T, ctx.lom, sliceID, tid)
		}
	}

	c.freeSliceFrom(slices, sliceID)
	return remoteErr
}

// Free the resources allocated for downloading slices from remote targets
func (c *getJogger) freeDownloaded(ctx *restoreCtx) {
	for _, slice := range ctx.slices {
		if slice != nil && slice.lom != nil {
			core.FreeLOM(slice.lom)
		}
	}
	for k := range ctx.nodes {
		uname := unique(k, ctx.lom.Bck(), ctx.lom.ObjName)
		c.parent.unregWriter(uname)
	}
	freeSlices(ctx.slices)
}

// Main function that restores an object that was erasure-coded
func (c *getJogger) restoreEncoded(ctx *restoreCtx) error {
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Starting EC restore %s", ctx.lom)
	}

	// Download all slices from the targets that have sent metadata
	err := c.requestSlices(ctx)
	if err != nil {
		c.freeDownloaded(ctx)
		return err
	}

	// Restore the main replica and save it locally
	restored, err := c.restoreMainObj(ctx)
	if err != nil {
		nlog.Errorf("%s failed to restore main object %s: %v", core.T, ctx.lom, err)
		c.freeDownloaded(ctx)
		freeSlices(restored)
		return err
	}

	c.parent.ObjsAdd(1, ctx.meta.Size)

	// The main replica is now ready to be downloaded by a client.
	if err := c.uploadRestoredSlices(ctx, restored); err != nil {
		nlog.Errorf("Failed to upload restored slices of %s: %v", ctx.lom, err)
	} else if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Slices %s restored successfully", ctx.lom)
	}

	c.freeDownloaded(ctx)
	return nil
}

// Entry point: restores the main object and its slices if possible
func (c *getJogger) restore(ctx *restoreCtx) error {
	if ctx.lom.Bprops() == nil || !ctx.lom.ECEnabled() {
		return ErrorECDisabled
	}

	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Restoring %s", ctx.lom)
	}
	err := c.requestMeta(ctx)
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Found meta for %s: %d, err: %v", ctx.lom, len(ctx.nodes), err)
	}
	if err != nil {
		return err
	}

	ctx.lom.SetAtimeUnix(time.Now().UnixNano())
	if ctx.meta.IsCopy {
		if ctx.toDisk {
			return c.restoreReplicatedFromDisk(ctx)
		}
		return c.restoreReplicatedFromMemory(ctx)
	}

	if len(ctx.nodes) < ctx.meta.Data {
		return fmt.Errorf("cannot restore: too many slices missing (found %d slices, need %d or more)",
			len(ctx.nodes), ctx.meta.Data)
	}

	return c.restoreEncoded(ctx)
}

// Broadcast a request for the object's metadata. The function fills the list of
// nodes (with their EC metadata) that have the latest object version.
func (c *getJogger) requestMeta(ctx *restoreCtx) error {
	var (
		wg     = cos.NewLimitedWaitGroup(cmn.MaxParallelism(), 8)
		mtx    = &sync.Mutex{}
		tmap   = core.T.Sowner().Get().Tmap
		ctMeta = core.NewCTFromLOM(ctx.lom, fs.ECMetaType)

		md, err  = LoadMetadata(ctMeta.FQN())
		mdExists = err == nil && len(md.Daemons) != 0
	)
	if mdExists {
		// The metafile exists and contains a list of targets
		nodes := md.RemoteTargets()
		ctx.nodes = make(map[string]*Metadata, len(nodes))
		for _, node := range nodes {
			wg.Add(1)
			go func(si *meta.Snode, c *getJogger, mtx *sync.Mutex, mdExists bool) {
				ctx.requestMeta(si, c, mtx, mdExists)
				wg.Done()
			}(node, c, mtx, mdExists)
		}
	} else {
		// Otherwise, broadcast to all targets
		ctx.nodes = make(map[string]*Metadata, len(tmap))
		for _, node := range tmap {
			if node.ID() == core.T.SID() {
				continue
			}
			wg.Add(1)
			go func(si *meta.Snode, c *getJogger, mtx *sync.Mutex, mdExists bool) {
				ctx.requestMeta(si, c, mtx, mdExists)
				wg.Done()
			}(node, c, mtx, mdExists)
		}
	}
	wg.Wait()

	// No EC metadata found
	if len(ctx.nodes) == 0 {
		return ErrorNoMetafile
	}

	// Cleanup: delete all metadata with an obsolete (older) generation
	for k, v := range ctx.nodes {
		if v.Generation != ctx.meta.Generation {
			nlog.Warningf("Target %s[slice id %d] old generation: %v == %v",
				k, v.SliceID, v.Generation, ctx.meta.Generation)
			delete(ctx.nodes, k)
		}
	}

	return nil
}

////////////////
// restoreCtx //
////////////////

func (ctx *restoreCtx) requestMeta(si *meta.Snode, c *getJogger, mtx *sync.Mutex, mdExists bool) {
	md, err := RequestECMeta(ctx.lom.Bucket(), ctx.lom.ObjName, si, c.client)
	if err != nil {
		if mdExists {
			nlog.Errorf("No EC meta %s from %s: %v", ctx.lom.Cname(), si, err)
		} else if cmn.Rom.FastV(4, cos.SmoduleEC) {
			nlog.Infof("No EC meta %s from %s: %v", ctx.lom.Cname(), si, err)
		}
		return
	}

	mtx.Lock()
	ctx.nodes[si.ID()] = md
	// Detect the metadata with the latest generation on the fly.
	if ctx.meta == nil || md.Generation > ctx.meta.Generation {
		ctx.meta = md
	}
	mtx.Unlock()
}