github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ec/getjogger.go

// Package ec provides erasure coding (EC) based data protection for AIStore.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package ec

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
	"sync"
	"time"

	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/memsys"
	"github.com/NVIDIA/aistore/transport"
	"github.com/klauspost/reedsolomon"
)

type (
	// Mountpath getJogger: processes GET requests to one mountpath
	getJogger struct {
		parent *XactGet
		client *http.Client
		mpath  string // Mountpath that the jogger manages

		workCh chan *request // Channel to request TOP priority operation (restore)
		stopCh cos.StopCh    // Jogger management channel: to stop it
	}
	restoreCtx struct {
		lom      *core.LOM            // replica
		meta     *Metadata            // restored object's EC metafile
		nodes    map[string]*Metadata // EC metafiles downloaded from other targets
		slices   []*slice             // slices downloaded from other targets
		idToNode map[int]string       // existing sliceID <-> target
		toDisk   bool                 // use memory or disk for temporary files
	}
)

var (
	restoreCtxPool  sync.Pool
	emptyRestoreCtx restoreCtx
)

func allocRestoreCtx() (ctx *restoreCtx) {
	if v := restoreCtxPool.Get(); v != nil {
		ctx = v.(*restoreCtx)
	} else {
		ctx = &restoreCtx{}
	}
	return
}

func freeRestoreCtx(ctx *restoreCtx) {
	*ctx = emptyRestoreCtx
	restoreCtxPool.Put(ctx)
}
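
// Illustrative sketch, not used by the jogger: alloc/free above follow the
// common sync.Pool reset idiom -- overwrite the struct with a zero value
// before Put so that stale pointers (lom, slices, nodes) do not leak into the
// next Get. The payload type and names below are made up for the example.
type examplePayload struct{ buf []byte }

var (
	examplePayloadPool sync.Pool
	emptyPayload       examplePayload
)

func allocExamplePayload() *examplePayload {
	if v := examplePayloadPool.Get(); v != nil {
		return v.(*examplePayload)
	}
	return &examplePayload{}
}

func freeExamplePayload(p *examplePayload) {
	*p = emptyPayload // drop references so the GC can reclaim them
	examplePayloadPool.Put(p)
}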

func (c *getJogger) newCtx(req *request) (*restoreCtx, error) {
	lom, err := req.LIF.LOM()
	if err != nil {
		return nil, err
	}
	ctx := allocRestoreCtx()
	ctx.toDisk = useDisk(0 /*size of the original object is unknown*/, c.parent.config)
	ctx.lom = lom
	err = lom.Load(true /*cache it*/, false /*locked*/)
	if os.IsNotExist(err) {
		err = nil
	}
	return ctx, err
}

func (*getJogger) freeCtx(ctx *restoreCtx) {
	core.FreeLOM(ctx.lom)
	freeRestoreCtx(ctx)
}

func (c *getJogger) run() {
	nlog.Infof("started EC for mountpath: %s, bucket: %s", c.mpath, c.parent.bck)

	for {
		select {
		case req := <-c.workCh:
			c.parent.stats.updateWaitTime(time.Since(req.tm))
			req.tm = time.Now()
			c.parent.IncPending()
			c.ec(req)
			c.parent.DecPending()
			freeReq(req)
		case <-c.stopCh.Listen():
			return
		}
	}
}

func (c *getJogger) stop() {
	nlog.Infof("stopping EC for mountpath: %s, bucket: %s", c.mpath, c.parent.bck)
	c.stopCh.Close()
}
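
// Illustrative sketch, standalone: run/stop above are the standard
// stop-channel worker pattern. A minimal equivalent with a plain channel in
// place of cos.StopCh -- the worker drains requests until the stop channel is
// closed; the names here are made up for the example.
func exampleWorker(work <-chan int, stop <-chan struct{}) {
	for {
		select {
		case w := <-work:
			fmt.Println("processing", w)
		case <-stop:
			return
		}
	}
}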

// Finalize the EC restore: report an error to the caller, do housekeeping.
func (*getJogger) finalizeReq(req *request, err error) {
	if err != nil {
		nlog.Errorf("Error restoring %s: %v", req.LIF.Uname, err)
	}
	if req.ErrCh != nil {
		if err != nil {
			req.ErrCh <- err
		}
		close(req.ErrCh)
	}
}

func (c *getJogger) ec(req *request) {
	debug.Assert(req.Action == ActRestore)
	ctx, err := c.newCtx(req)
	if ctx == nil {
		debug.Assert(err != nil)
		c.finalizeReq(req, err) // report the failure (and close ErrCh) instead of dropping it silently
		return
	}
	if err == nil {
		err = c.restore(ctx)
		c.parent.stats.updateDecodeTime(time.Since(req.tm), err != nil)
	}
	if err == nil {
		c.parent.stats.updateObjTime(time.Since(req.putTime))
		err = ctx.lom.Persist()
	}
	c.freeCtx(ctx)
	c.finalizeReq(req, err)
}

// The final step of the replica restoration process: the main target detects
// which nodes lack the replica and replicates it to them.
// * reader - replica content to send to remote targets
func (c *getJogger) copyMissingReplicas(ctx *restoreCtx, reader cos.ReadOpenCloser) error {
	if err := ctx.lom.Load(false /*cache it*/, false /*locked*/); err != nil {
		return err
	}
	smap := core.T.Sowner().Get()
	targets, err := smap.HrwTargetList(ctx.lom.Uname(), ctx.meta.Parity+1)
	if err != nil {
		return err
	}

	// Fill the list of daemonIDs that do not have a replica
	daemons := make([]string, 0, len(targets))
	for _, target := range targets {
		if target.ID() == core.T.SID() {
			continue
		}

		if _, ok := ctx.nodes[target.ID()]; !ok {
			daemons = append(daemons, target.ID())
		}
	}

	// If any target lost its replica, send the replica to it and free the
	// allocated memory on completion. Otherwise, free the memory and return
	// immediately.
	if len(daemons) == 0 {
		freeObject(reader)
		return nil
	}

	var srcReader cos.ReadOpenCloser
	switch r := reader.(type) {
	case *memsys.SGL:
		srcReader = memsys.NewReader(r)
	case *cos.FileHandle:
		srcReader, err = cos.NewFileHandle(ctx.lom.FQN)
	default:
		debug.FailTypeCast(reader)
		err = fmt.Errorf("unsupported reader type: %T", reader)
	}

	if err != nil {
		return err
	}

	// _ io.ReadCloser: pass copyMissingReplicas' reader argument (memsys.SGL type)
	// to freeObject instead of the callback's reader argument (memsys.Reader type).
	// Reason: memsys.Reader does not provide access to the internal memsys.SGL
	// that must be freed.
	cb := func(_ *transport.ObjHdr, _ io.ReadCloser, _ any, err error) {
		if err != nil {
			nlog.Errorf("%s failed to send %s to %v: %v", core.T, ctx.lom, daemons, err)
		}
		freeObject(reader)
	}
	src := &dataSource{
		reader:   srcReader,
		size:     ctx.lom.SizeBytes(),
		metadata: ctx.meta,
		reqType:  reqPut,
	}
	return c.parent.writeRemote(daemons, ctx.lom, src, cb)
}

func (c *getJogger) restoreReplicatedFromMemory(ctx *restoreCtx) error {
	var writer *memsys.SGL
	// Try to read a replica from targets one by one until one is downloaded
	for node := range ctx.nodes {
		uname := unique(node, ctx.lom.Bck(), ctx.lom.ObjName)
		iReqBuf := newIntraReq(reqGet, ctx.meta, ctx.lom.Bck()).NewPack(g.smm)

		w := g.smm.NewSGL(cos.KiB)
		if _, err := c.parent.readRemote(ctx.lom, node, uname, iReqBuf, w); err != nil {
			nlog.Errorf("%s failed to read from %s", core.T, node)
			w.Free()
			g.smm.Free(iReqBuf)
			w = nil
			continue
		}
		g.smm.Free(iReqBuf)
		if w.Size() != 0 {
			// A valid replica is found - break and do not free the SGL
			writer = w
			break
		}
		w.Free()
	}
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Found meta -> obj get %s, writer found: %v", ctx.lom, writer != nil)
	}

	if writer == nil {
		return errors.New("failed to read a replica from any target")
	}

	ctx.lom.SetSize(writer.Size())
	args := &WriteArgs{
		Reader:     memsys.NewReader(writer),
		MD:         ctx.meta.NewPack(),
		Cksum:      cos.NewCksum(ctx.meta.CksumType, ctx.meta.CksumValue),
		Generation: ctx.meta.Generation,
		Xact:       c.parent,
	}
	if err := WriteReplicaAndMeta(ctx.lom, args); err != nil {
		writer.Free()
		return err
	}

	err := c.copyMissingReplicas(ctx, writer)
	if err != nil {
		writer.Free()
	}
	return err
}

func (c *getJogger) restoreReplicatedFromDisk(ctx *restoreCtx) error {
	var (
		writer *os.File
		n      int64
	)
	// Try to read a replica from targets one by one until one is downloaded
	tmpFQN := fs.CSM.Gen(ctx.lom, fs.WorkfileType, "ec-restore-repl")

	for node := range ctx.nodes {
		uname := unique(node, ctx.lom.Bck(), ctx.lom.ObjName)

		w, err := ctx.lom.CreateFile(tmpFQN)
		if err != nil {
			nlog.Errorf("Failed to create file: %v", err)
			break
		}
		iReqBuf := newIntraReq(reqGet, ctx.meta, ctx.lom.Bck()).NewPack(g.smm)
		n, err = c.parent.readRemote(ctx.lom, node, uname, iReqBuf, w)
		g.smm.Free(iReqBuf)

		if err == nil && n != 0 {
			// A valid replica is found - flush it, close the file handle,
			// and stop the search
			err = cos.FlushClose(w)
			if err != nil {
				nlog.Errorf("Failed to flush and close: %v", err)
				break
			}
			ctx.lom.SetSize(n)
			writer = w
			break
		}

		cos.Close(w)
		errRm := cos.RemoveFile(tmpFQN)
		debug.AssertNoErr(errRm)
	}
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Found meta -> obj get %s, writer found: %v", ctx.lom, writer != nil)
	}

	if writer == nil {
		return errors.New("failed to read a replica from any target")
	}
	if err := ctx.lom.RenameFrom(tmpFQN); err != nil {
		return err
	}

	if err := ctx.lom.Persist(); err != nil {
		return err
	}

	b := cos.MustMarshal(ctx.meta)
	ctMeta := core.NewCTFromLOM(ctx.lom, fs.ECMetaType)
	if err := ctMeta.Write(bytes.NewReader(b), -1); err != nil {
		return err
	}
	if _, exists := core.T.Bowner().Get().Get(ctMeta.Bck()); !exists {
		if errRm := cos.RemoveFile(ctMeta.FQN()); errRm != nil {
			nlog.Errorf("nested error: save restored replica -> remove metafile: %v", errRm)
		}
		return fmt.Errorf("%s metafile saved while bucket %s was being destroyed", ctMeta.ObjectName(), ctMeta.Bucket())
	}

	reader, err := cos.NewFileHandle(ctx.lom.FQN)
	if err != nil {
		return err
	}
	err = c.copyMissingReplicas(ctx, reader)
	if err != nil {
		freeObject(reader)
	}
	return err
}
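
// Illustrative sketch, standalone: restoreReplicatedFromDisk above uses the
// workfile idiom -- write to a temporary FQN, flush and close, then rename
// into place -- so readers never observe a partially written replica. A
// plain-os equivalent (the rename is atomic only within a single
// volume/mountpath; the names are made up for the example):
func exampleAtomicWrite(fqn string, r io.Reader) error {
	workFQN := fqn + ".work"
	f, err := os.Create(workFQN)
	if err != nil {
		return err
	}
	if _, err := io.Copy(f, r); err != nil {
		f.Close()
		os.Remove(workFQN)
		return err
	}
	if err := f.Close(); err != nil {
		os.Remove(workFQN)
		return err
	}
	return os.Rename(workFQN, fqn)
}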

// The main object was not found, but it is known to have been erasure-coded.
// Request all data and parity slices from the targets in the cluster.
func (c *getJogger) requestSlices(ctx *restoreCtx) error {
	var (
		wgSlices = cos.NewTimeoutGroup()
		sliceCnt = ctx.meta.Data + ctx.meta.Parity
		daemons  = make([]string, 0, len(ctx.nodes)) // Targets to be requested for slices
	)
	ctx.slices = make([]*slice, sliceCnt)
	ctx.idToNode = make(map[int]string)

	for k, v := range ctx.nodes {
		if v.SliceID < 1 || v.SliceID > sliceCnt {
			nlog.Warningf("Node %s has invalid slice ID %d", k, v.SliceID)
			continue
		}

		if cmn.Rom.FastV(4, cos.SmoduleEC) {
			nlog.Infof("Requesting slice %s[%d] from %s", ctx.lom, v.SliceID, k)
		}
		var writer *slice
		if ctx.toDisk {
			prefix := fmt.Sprintf("ec-restore-%d", v.SliceID)
			fqn := fs.CSM.Gen(ctx.lom, fs.WorkfileType, prefix)
			fh, err := ctx.lom.CreateFile(fqn)
			if err != nil {
				return err
			}
			writer = &slice{
				writer:  fh,
				twg:     wgSlices,
				workFQN: fqn,
			}
		} else {
			writer = &slice{
				writer: g.pmm.NewSGL(cos.KiB * 512),
				twg:    wgSlices,
			}
		}
		ctx.slices[v.SliceID-1] = writer
		ctx.idToNode[v.SliceID] = k
		wgSlices.Add(1)
		uname := unique(k, ctx.lom.Bck(), ctx.lom.ObjName)
		if c.parent.regWriter(uname, writer) {
			daemons = append(daemons, k)
		}
	}

	iReq := newIntraReq(reqGet, ctx.meta, ctx.lom.Bck())
	iReq.isSlice = true
	request := iReq.NewPack(g.smm)
	hdr := transport.ObjHdr{
		ObjName: ctx.lom.ObjName,
		Opaque:  request,
		Opcode:  reqGet,
	}
	hdr.Bck.Copy(ctx.lom.Bucket())

	o := transport.AllocSend()
	o.Hdr = hdr

	// Broadcast the slice request and wait for the targets to respond
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Requesting daemons %v for slices of %s", daemons, ctx.lom)
	}
	if err := c.parent.sendByDaemonID(daemons, o, nil, true); err != nil {
		freeSlices(ctx.slices)
		g.smm.Free(request)
		return err
	}
	if wgSlices.WaitTimeout(c.parent.config.Timeout.SendFile.D()) {
		nlog.Errorf("%s timed out waiting for %s slices", core.T, ctx.lom)
	}
	g.smm.Free(request)
	return nil
}

func newSliceWriter(ctx *restoreCtx, writers []io.Writer, restored []*slice,
	cksums []*cos.CksumHash, cksumType string, idx int, sliceSize int64) error {
	if ctx.toDisk {
		prefix := fmt.Sprintf("ec-rebuild-%d", idx)
		fqn := fs.CSM.Gen(ctx.lom, fs.WorkfileType, prefix)
		file, err := ctx.lom.CreateFile(fqn)
		if err != nil {
			return err
		}
		if cksumType != cos.ChecksumNone {
			cksums[idx] = cos.NewCksumHash(cksumType)
			writers[idx] = cos.NewWriterMulti(cksums[idx].H, file)
		} else {
			writers[idx] = file
		}
		restored[idx] = &slice{workFQN: fqn, n: sliceSize}
	} else {
		sgl := g.pmm.NewSGL(sliceSize)
		restored[idx] = &slice{obj: sgl, n: sliceSize}
		if cksumType != cos.ChecksumNone {
			cksums[idx] = cos.NewCksumHash(cksumType)
			writers[idx] = cos.NewWriterMulti(cksums[idx].H, sgl)
		} else {
			writers[idx] = sgl
		}
	}

	// Slice IDs start from 1, hence the `+1`
	delete(ctx.idToNode, idx+1)

	return nil
}
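
// Illustrative sketch, not called by the jogger: the tee pattern used by
// newSliceWriter above -- write the payload once while checksumming it on the
// fly. cos.NewCksumHash and cos.NewWriterMulti are used exactly as above; the
// bytes.Buffer destination and the function name are made up for the example.
func exampleTeeChecksum(src io.Reader, cksumType string) (*bytes.Buffer, *cos.CksumHash, error) {
	dst := &bytes.Buffer{}
	ck := cos.NewCksumHash(cksumType)
	w := cos.NewWriterMulti(ck.H, dst) // every byte goes to both the hash and dst
	if _, err := io.Copy(w, src); err != nil {
		return nil, nil, err
	}
	ck.Finalize()
	return dst, ck, nil
}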

func cksumSlice(reader io.Reader, recvCksum *cos.Cksum, objName string) error {
	cksumType := recvCksum.Type()
	if cksumType == cos.ChecksumNone {
		return nil
	}
	_, actualCksum, err := cos.CopyAndChecksum(io.Discard, reader, nil, cksumType)
	if err != nil {
		return fmt.Errorf("failed to checksum: %v", err)
	}
	if !actualCksum.Equal(recvCksum) {
		err = cos.NewErrDataCksum(recvCksum, &actualCksum.Cksum, objName)
	}
	return err
}

// Reconstruct the main object from slices. Returns the list of reconstructed slices.
func (c *getJogger) restoreMainObj(ctx *restoreCtx) ([]*slice, error) {
	var (
		err       error
		sliceCnt  = ctx.meta.Data + ctx.meta.Parity
		sliceSize = SliceSize(ctx.meta.Size, ctx.meta.Data)
		readers   = make([]io.Reader, sliceCnt)
		writers   = make([]io.Writer, sliceCnt)
		restored  = make([]*slice, sliceCnt)
		cksums    = make([]*cos.CksumHash, sliceCnt)
		cksumType = ctx.lom.CksumType()
	)

	// Allocate resources for the reconstructed (missing) slices.
	for i, sl := range ctx.slices {
		if sl != nil && sl.writer != nil {
			if cmn.Rom.FastV(4, cos.SmoduleEC) {
				nlog.Infof("Got slice %d size %d (want %d) of %s", i+1, sl.n, sliceSize, ctx.lom)
			}
			if sl.n == 0 {
				freeObject(sl.obj)
				sl.obj = nil
				freeObject(sl.writer)
				sl.writer = nil
			}
		}
		if sl == nil || sl.writer == nil {
			err = newSliceWriter(ctx, writers, restored, cksums, cksumType, i, sliceSize)
			if err != nil {
				break
			}
			continue
		}

		var cksumReader io.Reader
		if sgl, ok := sl.writer.(*memsys.SGL); ok {
			readers[i] = memsys.NewReader(sgl)
			cksumReader = memsys.NewReader(sgl)
		} else if sl.workFQN != "" {
			readers[i], err = cos.NewFileHandle(sl.workFQN)
			if err != nil {
				break
			}
			cksumReader, err = cos.NewFileHandle(sl.workFQN)
			if err != nil {
				break
			}
		} else {
			debug.FailTypeCast(sl.writer)
			err = fmt.Errorf("unsupported slice source: %T", sl.writer)
			break
		}

		errCksum := cksumSlice(cksumReader, sl.cksum, ctx.lom.ObjName)
		if errCksum != nil {
			nlog.Errorf("error slice %d: %v", i, errCksum)
			err = newSliceWriter(ctx, writers, restored, cksums, cksumType, i, sliceSize)
			if err != nil {
				break
			}
			readers[i] = nil
		}
	}

	if err != nil {
		return restored, err
	}

	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Reconstructing %s", ctx.lom)
	}
	stream, err := reedsolomon.NewStreamC(ctx.meta.Data, ctx.meta.Parity, true, true)
	if err != nil {
		return restored, err
	}

	if err := stream.Reconstruct(readers, writers); err != nil {
		return restored, err
	}

	for idx, rst := range restored {
		if rst == nil {
			continue
		}
		if cksums[idx] != nil {
			cksums[idx].Finalize()
			rst.cksum = cksums[idx].Clone()
		}
	}

	version := ""
	srcReaders := make([]io.Reader, ctx.meta.Data)
	for i := range ctx.meta.Data {
		if ctx.slices[i] != nil && ctx.slices[i].writer != nil {
			if version == "" {
				version = ctx.slices[i].version
			}
			if sgl, ok := ctx.slices[i].writer.(*memsys.SGL); ok {
				srcReaders[i] = memsys.NewReader(sgl)
			} else {
				if ctx.slices[i].workFQN == "" {
					return restored, fmt.Errorf("invalid writer: %T", ctx.slices[i].writer)
				}
				srcReaders[i], err = cos.NewFileHandle(ctx.slices[i].workFQN)
				if err != nil {
					return restored, err
				}
			}
			continue
		}

		debug.Assert(restored[i] != nil)
		if version == "" {
			version = restored[i].version
		}
		if restored[i].workFQN != "" {
			srcReaders[i], err = cos.NewFileHandle(restored[i].workFQN)
			if err != nil {
				return restored, err
			}
		} else {
			sgl, ok := restored[i].obj.(*memsys.SGL)
			if !ok {
				return restored, fmt.Errorf("empty slice %s[%d]", ctx.lom, i)
			}
			srcReaders[i] = memsys.NewReader(sgl)
		}
	}

	src := io.MultiReader(srcReaders...)
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Saving main object %s to %q", ctx.lom, ctx.lom.FQN)
	}

	if version != "" {
		ctx.lom.SetVersion(version)
	}
	ctx.lom.SetSize(ctx.meta.Size)
	mainMeta := *ctx.meta
	mainMeta.SliceID = 0
	args := &WriteArgs{
		Reader:     src,
		MD:         mainMeta.NewPack(),
		Cksum:      cos.NewCksum(cksumType, ""),
		Generation: mainMeta.Generation,
		Xact:       c.parent,
	}
	err = WriteReplicaAndMeta(ctx.lom, args)
	return restored, err
}
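
// Illustrative sketch, standalone and runnable with this file's imports: the
// stream-based Reed-Solomon cycle that restoreMainObj relies on, shown on a
// 2+1 in-memory example. Per the reedsolomon stream API, a missing shard is a
// nil entry in the readers slice paired with a non-nil writer at the same
// index; all other writer entries stay nil. Everything below is made up for
// the example and is not part of the jogger.
func exampleStreamReconstruct(payload []byte) ([]byte, error) {
	const data, parity = 2, 1
	enc, err := reedsolomon.NewStreamC(data, parity, true, true)
	if err != nil {
		return nil, err
	}
	// Split the payload into equally sized data shards (the last one is
	// zero-padded if needed).
	shards := make([]*bytes.Buffer, data+parity)
	split := make([]io.Writer, data)
	for i := range shards {
		shards[i] = &bytes.Buffer{}
		if i < data {
			split[i] = shards[i]
		}
	}
	if err := enc.Split(bytes.NewReader(payload), split, int64(len(payload))); err != nil {
		return nil, err
	}
	// Compute the parity shard from the data shards.
	dataReaders := make([]io.Reader, data)
	for i := range dataReaders {
		dataReaders[i] = bytes.NewReader(shards[i].Bytes())
	}
	if err := enc.Encode(dataReaders, []io.Writer{shards[data]}); err != nil {
		return nil, err
	}
	// "Lose" data shard 0, then reconstruct it from the survivors.
	var (
		rebuilt = &bytes.Buffer{}
		readers = make([]io.Reader, data+parity)
		writers = make([]io.Writer, data+parity)
	)
	writers[0] = rebuilt
	for i := 1; i < data+parity; i++ {
		readers[i] = bytes.NewReader(shards[i].Bytes())
	}
	if err := enc.Reconstruct(readers, writers); err != nil {
		return nil, err
	}
	// Reassemble the original object from the (restored) data shards; outSize
	// trims the padding.
	out := &bytes.Buffer{}
	err = enc.Join(out, []io.Reader{bytes.NewReader(rebuilt.Bytes()),
		bytes.NewReader(shards[1].Bytes())}, int64(len(payload)))
	return out.Bytes(), err
}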

// Look for the first non-nil slice in the list, starting from index `start`.
// Returns the slice and the index to resume the search from (one past the
// found slice).
func getNextNonEmptySlice(slices []*slice, start int) (*slice, int) {
	i := max(0, start)
	for i < len(slices) && slices[i] == nil {
		i++
	}
	if i == len(slices) {
		return nil, i
	}
	return slices[i], i + 1
}

// Return a list of target IDs that do not have slices yet.
func (*getJogger) emptyTargets(ctx *restoreCtx) ([]string, error) {
	sliceCnt := ctx.meta.Data + ctx.meta.Parity
	nodeToID := make(map[string]int, len(ctx.idToNode))
	// Transpose the SliceID <-> DaemonID map for faster lookup
	for k, v := range ctx.idToNode {
		nodeToID[v] = k
	}
	// Generate the list of targets that should have a slice.
	smap := core.T.Sowner().Get()
	targets, err := smap.HrwTargetList(ctx.lom.Uname(), sliceCnt+1)
	if err != nil {
		nlog.Warningln(err)
		return nil, err
	}
	empty := make([]string, 0, len(targets))
	for _, t := range targets {
		if t.ID() == core.T.SID() {
			continue
		}
		if _, ok := nodeToID[t.ID()]; ok {
			continue
		}
		empty = append(empty, t.ID())
	}
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Empty nodes for %s are %#v", ctx.lom, empty)
	}
	return empty, nil
}

func (*getJogger) freeSliceFrom(slices []*slice, start int) {
	for sl, sliceID := getNextNonEmptySlice(slices, start); sl != nil; sl, sliceID = getNextNonEmptySlice(slices, sliceID) {
		sl.free()
	}
}

// Upload missing slices to the targets that must have them:
// * slices - object slices reconstructed by `restoreMainObj`
// * idToNode - a map of targets that already contain a slice (SliceID <-> target)
func (c *getJogger) uploadRestoredSlices(ctx *restoreCtx, slices []*slice) error {
	emptyNodes, err := c.emptyTargets(ctx)
	if err != nil || len(emptyNodes) == 0 {
		c.freeSliceFrom(slices, 0)
		return err
	}

	var (
		sliceID   int
		sl        *slice
		remoteErr error
		counter   = atomic.NewInt32(0)
	)
	// First, count the slices and initialize the counter upfront to avoid a
	// race when the network is faster than the FS and a transport callback
	// fires before the next slice is sent
	for sl, id := getNextNonEmptySlice(slices, 0); sl != nil; sl, id = getNextNonEmptySlice(slices, id) {
		counter.Inc()
	}
	if counter.Load() == 0 {
		return nil
	}
	// Send the reconstructed slices one by one to the "empty" targets.
	for sl, sliceID = getNextNonEmptySlice(slices, 0); sl != nil && len(emptyNodes) != 0; sl, sliceID = getNextNonEmptySlice(slices, sliceID) {
		tid := emptyNodes[0]
		emptyNodes = emptyNodes[1:]

		// Clone the object's metadata and set the correct SliceID before sending
		sliceMeta := ctx.meta.Clone()
		sliceMeta.SliceID = sliceID
		if sl.cksum != nil {
			sliceMeta.CksumType, sliceMeta.CksumValue = sl.cksum.Get()
		}

		var reader cos.ReadOpenCloser
		if sl.workFQN != "" {
			reader, _ = cos.NewFileHandle(sl.workFQN)
		} else {
			s, ok := sl.obj.(*memsys.SGL)
			debug.Assert(ok)
			reader = memsys.NewReader(s)
		}
		dataSrc := &dataSource{
			reader:   reader,
			size:     sl.n,
			metadata: sliceMeta,
			isSlice:  true,
			reqType:  reqPut,
		}

		if cmn.Rom.FastV(4, cos.SmoduleEC) {
			nlog.Infof("Sending slice %s[%d] to %s", ctx.lom, sliceMeta.SliceID, tid)
		}

		// Every slice's SGL is freed upon transfer completion
		cb := func(daemonID string, s *slice) transport.ObjSentCB {
			return func(_ *transport.ObjHdr, _ io.ReadCloser, _ any, err error) {
				if err != nil {
					nlog.Errorf("%s failed to send %s to %v: %v", core.T, ctx.lom, daemonID, err)
				}
				s.free()
			}
		}(tid, sl)
		if err := c.parent.writeRemote([]string{tid}, ctx.lom, dataSrc, cb); err != nil {
			remoteErr = err
			nlog.Errorf("%s failed to send slice %s[%d] to %s", core.T, ctx.lom, sliceID, tid)
		}
	}

	c.freeSliceFrom(slices, sliceID)
	return remoteErr
}
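
// Illustrative sketch, standalone: the callback above is built by immediately
// applying a function literal so that each transport callback captures its own
// (daemonID, slice) pair by value instead of the shared loop variables --
// required under pre-1.22 Go loop-variable semantics. A minimal equivalent
// (names made up for the example):
func exampleCapture(ids []string) []func() string {
	cbs := make([]func() string, 0, len(ids))
	for _, id := range ids {
		cb := func(id string) func() string {
			return func() string { return id } // each closure sees its own id
		}(id)
		cbs = append(cbs, cb)
	}
	return cbs
}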

// Free the resources allocated for downloading slices from remote targets
func (c *getJogger) freeDownloaded(ctx *restoreCtx) {
	for _, slice := range ctx.slices {
		if slice != nil && slice.lom != nil {
			core.FreeLOM(slice.lom)
		}
	}
	for k := range ctx.nodes {
		uname := unique(k, ctx.lom.Bck(), ctx.lom.ObjName)
		c.parent.unregWriter(uname)
	}
	freeSlices(ctx.slices)
}

// Main function that restores an object that was erasure-coded
func (c *getJogger) restoreEncoded(ctx *restoreCtx) error {
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Starting EC restore %s", ctx.lom)
	}

	// Download all slices from the targets that have sent metadata
	err := c.requestSlices(ctx)
	if err != nil {
		c.freeDownloaded(ctx)
		return err
	}

	// Restore the main replica and save it locally
	restored, err := c.restoreMainObj(ctx)
	if err != nil {
		nlog.Errorf("%s failed to restore main object %s: %v", core.T, ctx.lom, err)
		c.freeDownloaded(ctx)
		freeSlices(restored)
		return err
	}

	c.parent.ObjsAdd(1, ctx.meta.Size)

	// The main replica is now ready to be downloaded by a client.
	if err := c.uploadRestoredSlices(ctx, restored); err != nil {
		nlog.Errorf("Failed to upload restored slices of %s: %v", ctx.lom, err)
	} else if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Slices %s restored successfully", ctx.lom)
	}

	c.freeDownloaded(ctx)
	return nil
}

// Entry point: restores the main object and its missing slices/replicas if possible
func (c *getJogger) restore(ctx *restoreCtx) error {
	if ctx.lom.Bprops() == nil || !ctx.lom.ECEnabled() {
		return ErrorECDisabled
	}

	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Restoring %s", ctx.lom)
	}
	err := c.requestMeta(ctx)
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Found meta for %s: %d, err: %v", ctx.lom, len(ctx.nodes), err)
	}
	if err != nil {
		return err
	}

	ctx.lom.SetAtimeUnix(time.Now().UnixNano())
	if ctx.meta.IsCopy {
		if ctx.toDisk {
			return c.restoreReplicatedFromDisk(ctx)
		}
		return c.restoreReplicatedFromMemory(ctx)
	}

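	// Reed-Solomon can reconstruct the object from any ctx.meta.Data of its
	// Data+Parity slices; e.g., with Data=4 and Parity=2, metadata from fewer
	// than 4 targets means the object cannot be recovered.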
	if len(ctx.nodes) < ctx.meta.Data {
		return fmt.Errorf("cannot restore: too many slices missing (found %d slices, need %d or more)",
			len(ctx.nodes), ctx.meta.Data)
	}

	return c.restoreEncoded(ctx)
}

// Broadcast a request for the object's metadata. The function fills ctx.nodes
// with the targets (and their EC metadata) that have the latest object version
func (c *getJogger) requestMeta(ctx *restoreCtx) error {
	var (
		wg     = cos.NewLimitedWaitGroup(cmn.MaxParallelism(), 8)
		mtx    = &sync.Mutex{}
		tmap   = core.T.Sowner().Get().Tmap
		ctMeta = core.NewCTFromLOM(ctx.lom, fs.ECMetaType)

		md, err  = LoadMetadata(ctMeta.FQN())
		mdExists = err == nil && len(md.Daemons) != 0
	)
	if mdExists {
		// The local metafile exists and contains a list of targets
		nodes := md.RemoteTargets()
		ctx.nodes = make(map[string]*Metadata, len(nodes))
		for _, node := range nodes {
			wg.Add(1)
			go func(si *meta.Snode, c *getJogger, mtx *sync.Mutex, mdExists bool) {
				ctx.requestMeta(si, c, mtx, mdExists)
				wg.Done()
			}(node, c, mtx, mdExists)
		}
	} else {
		// Otherwise, broadcast to all targets in the cluster
		ctx.nodes = make(map[string]*Metadata, len(tmap))
		for _, node := range tmap {
			if node.ID() == core.T.SID() {
				continue
			}
			wg.Add(1)
			go func(si *meta.Snode, c *getJogger, mtx *sync.Mutex, mdExists bool) {
				ctx.requestMeta(si, c, mtx, mdExists)
				wg.Done()
			}(node, c, mtx, mdExists)
		}
	}
	wg.Wait()

	// No EC metadata found
	if len(ctx.nodes) == 0 {
		return ErrorNoMetafile
	}

	// Cleanup: drop the metadata of obsolete generations
	for k, v := range ctx.nodes {
		if v.Generation != ctx.meta.Generation {
			nlog.Warningf("Target %s[slice id %d] has obsolete generation %v (latest: %v)",
				k, v.SliceID, v.Generation, ctx.meta.Generation)
			delete(ctx.nodes, k)
		}
	}

	return nil
}
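
// Illustrative sketch, standalone: the fan-out/collect pattern used by
// requestMeta above -- one goroutine per target, a mutex-guarded map for the
// results, Wait before reading the map. A minimal equivalent with the standard
// sync.WaitGroup (cos.NewLimitedWaitGroup additionally caps the number of
// concurrent goroutines); the names and the fetch signature are made up:
func exampleFanOut(targets []string, fetch func(string) (int, error)) map[string]int {
	var (
		wg  sync.WaitGroup
		mtx sync.Mutex
		res = make(map[string]int, len(targets))
	)
	for _, tid := range targets {
		wg.Add(1)
		go func(tid string) {
			defer wg.Done()
			if v, err := fetch(tid); err == nil {
				mtx.Lock()
				res[tid] = v
				mtx.Unlock()
			}
		}(tid)
	}
	wg.Wait()
	return res
}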

////////////////
// restoreCtx //
////////////////

func (ctx *restoreCtx) requestMeta(si *meta.Snode, c *getJogger, mtx *sync.Mutex, mdExists bool) {
	md, err := RequestECMeta(ctx.lom.Bucket(), ctx.lom.ObjName, si, c.client)
	if err != nil {
		if mdExists {
			nlog.Errorf("No EC meta %s from %s: %v", ctx.lom.Cname(), si, err)
		} else if cmn.Rom.FastV(4, cos.SmoduleEC) {
			nlog.Infof("No EC meta %s from %s: %v", ctx.lom.Cname(), si, err)
		}
		return
	}

	mtx.Lock()
	ctx.nodes[si.ID()] = md
	// Detect the metadata with the latest generation on the fly.
	if ctx.meta == nil || md.Generation > ctx.meta.Generation {
		ctx.meta = md
	}
	mtx.Unlock()
}