github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ec/putjogger.go

// Package ec provides erasure coding (EC) based data protection for AIStore.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package ec

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"os"
	"sync"
	"time"

	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/mono"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/memsys"
	"github.com/NVIDIA/aistore/transport"
	"github.com/klauspost/reedsolomon"
)

type (
	encodeCtx struct {
		lom          *core.LOM        // replica
		meta         *Metadata        //
		fh           *cos.FileHandle  // file handle for the replica
		sliceSize    int64            // calculated slice size
		padSize      int64            // zero tail of the last object's data slice
		dataSlices   int              // the number of data slices
		paritySlices int              // the number of parity slices
		cksums       []*cos.CksumHash // checksums of parity slices (filled by reed-solomon)
		slices       []*slice         // all EC slices (in the order of slice IDs)
		targets      []*meta.Snode    // target list (in the order of slice IDs: targets[i] receives slices[i])
	}

	// a mountpath putJogger: processes PUT/DEL requests to one mountpath
	putJogger struct {
		parent *XactPut
		slab   *memsys.Slab
		buffer []byte
		mpath  string

		putCh  chan *request // top priority operation (object PUT)
		xactCh chan *request // low priority operation (ec-encode)
		stopCh cos.StopCh    // jogger management channel: to stop it

		toDisk bool // use files or SGL
	}
)

var (
	encCtxPool         sync.Pool
	emptyCtx           encodeCtx
	errSliceSendFailed = errors.New("failed to send slice")
)

func allocCtx() (ctx *encodeCtx) {
	if v := encCtxPool.Get(); v != nil {
		ctx = v.(*encodeCtx)
	} else {
		ctx = &encodeCtx{}
	}
	return
}

func (ctx *encodeCtx) freeReplica() {
	freeObject(ctx.fh)
}

///////////////
// putJogger //
///////////////

func (*putJogger) newCtx(lom *core.LOM, meta *Metadata) (ctx *encodeCtx, err error) {
	ctx = allocCtx()
	ctx.lom = lom
	ctx.dataSlices = lom.Bprops().EC.DataSlices
	ctx.paritySlices = lom.Bprops().EC.ParitySlices
	ctx.meta = meta

	totalCnt := ctx.paritySlices + ctx.dataSlices
	ctx.sliceSize = SliceSize(ctx.lom.SizeBytes(), ctx.dataSlices)
	ctx.slices = make([]*slice, totalCnt)
	ctx.padSize = ctx.sliceSize*int64(ctx.dataSlices) - ctx.lom.SizeBytes()

	ctx.fh, err = cos.NewFileHandle(lom.FQN)
	return ctx, err
}
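// Worked example (a sketch, assuming SliceSize is ceiling division, i.e.
// ceil(objSize/dataSlices)): a 10MiB (10,485,760B) object with EC(d=3, p=2)
// yields sliceSize = 3,495,254B and padSize = 3*3,495,254 - 10,485,760 = 2B -
// the zero tail that pads the last data slice to the full slice size.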

func (*putJogger) freeCtx(ctx *encodeCtx) {
	*ctx = emptyCtx
	encCtxPool.Put(ctx)
}

func (c *putJogger) freeResources() {
	c.slab.Free(c.buffer)
	c.buffer = nil
	c.slab = nil
}

func (c *putJogger) processRequest(req *request) {
	lom, err := req.LIF.LOM()
	if err != nil {
		return
	}

	c.parent.IncPending()
	defer func() {
		if req.Callback != nil {
			req.Callback(lom, err)
		}
		core.FreeLOM(lom)
		c.parent.DecPending()
	}()

	if req.Action == ActSplit {
		if err = lom.Load(false /*cache it*/, false /*locked*/); err != nil {
			return
		}
		ecConf := lom.Bprops().EC
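		// A rough estimate of the memory needed to EC-encode the object; it decides
		// between in-memory SGLs and workfiles on disk. E.g., a hypothetical 1GiB
		// object with EC(d=2, p=2) yields 1GiB * (2+2)/2 = 2GiB.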
		memRequired := lom.SizeBytes() * int64(ecConf.DataSlices+ecConf.ParitySlices) / int64(ecConf.ParitySlices)
		c.toDisk = useDisk(memRequired, c.parent.config)
	}

	c.parent.stats.updateWaitTime(time.Since(req.tm))
	req.tm = time.Now()
	if err = c.ec(req, lom); err != nil {
		err = cmn.NewErrFailedTo(core.T, req.Action, lom.Cname(), err)
		c.parent.AddErr(err, 0)
	}
}

func (c *putJogger) run(wg *sync.WaitGroup) {
	nlog.Infof("Started EC for mountpath: %s, bucket %s", c.mpath, c.parent.bck)
	defer wg.Done()
	c.buffer, c.slab = g.pmm.Alloc()
	for {
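		// Note: when multiple channels are ready, Go's select chooses a case
		// pseudo-randomly, so the put-over-xact priority is best-effort only.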
		select {
		case req := <-c.putCh:
			c.processRequest(req)
			freeReq(req)
		case req := <-c.xactCh:
			c.processRequest(req)
			freeReq(req)
		case <-c.stopCh.Listen():
			c.freeResources()
			return
		}
	}
}

func (c *putJogger) stop() {
	nlog.Infof("Stopping EC for mountpath: %s, bucket %s", c.mpath, c.parent.bck)
	c.stopCh.Close()
}

func (c *putJogger) ec(req *request, lom *core.LOM) (err error) {
	switch req.Action {
	case ActSplit:
		if err = c.encode(req, lom); err != nil {
			ctMeta := core.NewCTFromLOM(lom, fs.ECMetaType)
			errRm := cos.RemoveFile(ctMeta.FQN())
			debug.AssertNoErr(errRm)
		}
		c.parent.stats.updateEncodeTime(time.Since(req.tm), err != nil)
	case ActDelete:
		err = c.cleanup(lom)
		c.parent.stats.updateDeleteTime(time.Since(req.tm), err != nil)
	default:
		err = fmt.Errorf("invalid EC action for putJogger: %v", req.Action)
	}

	if err == nil {
		c.parent.stats.updateObjTime(time.Since(req.putTime))
	}
	return err
}

func (c *putJogger) replicate(ctx *encodeCtx) error {
	err := c.createCopies(ctx)
	if err != nil {
		ctx.freeReplica()
		c.cleanup(ctx.lom)
	}
	return err
}

func (c *putJogger) splitAndDistribute(ctx *encodeCtx) error {
	err := initializeSlices(ctx)
	if err == nil {
		err = c.sendSlices(ctx)
	}
	if err != nil {
		ctx.freeReplica()
		if err != errSliceSendFailed {
			freeSlices(ctx.slices)
		}
		c.cleanup(ctx.lom)
	}
	return err
}

// calculates and stores data and parity slices
func (c *putJogger) encode(req *request, lom *core.LOM) error {
	if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("Encoding %q...", lom)
	}
	var (
		ecConf     = lom.Bprops().EC
		reqTargets = ecConf.ParitySlices + 1
		smap       = core.T.Sowner().Get()
	)
	if !req.IsCopy {
		reqTargets += ecConf.DataSlices
	}
	targetCnt := smap.CountActiveTs()
	if targetCnt < reqTargets {
		return fmt.Errorf("%v: given EC config (d=%d, p=%d), %d targets required to encode %s (have %d, %s)",
			cmn.ErrNotEnoughTargets, ecConf.DataSlices, ecConf.ParitySlices, reqTargets, lom, targetCnt, smap.StringEx())
	}
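	// E.g., with EC(d=2, p=2) a full encode requires 1 (main) + 2 (data) + 2 (parity) = 5
	// targets, while pure replication (IsCopy) requires only 1 + 2 (copies) = 3.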

	var (
		ctMeta                = core.NewCTFromLOM(lom, fs.ECMetaType)
		generation            = mono.NanoTime()
		cksumType, cksumValue = lom.Checksum().Get()
	)
	meta := &Metadata{
		MDVersion:   MDVersionLast,
		Generation:  generation,
		Size:        lom.SizeBytes(),
		Data:        ecConf.DataSlices,
		Parity:      ecConf.ParitySlices,
		IsCopy:      req.IsCopy,
		ObjCksum:    cksumValue,
		CksumType:   cksumType,
		FullReplica: core.T.SID(),
		Daemons:     make(cos.MapStrUint16, reqTargets),
	}

	c.parent.LomAdd(lom)

	ctx, err := c.newCtx(lom, meta)
	defer c.freeCtx(ctx)
	if err != nil {
		return err
	}
	targets, err := smap.HrwTargetList(ctx.lom.Uname(), reqTargets)
	if err != nil {
		return err
	}
	ctx.targets = targets[1:]
	meta.Daemons[targets[0].ID()] = 0 // main or full replica always on the first target
	for i, tgt := range ctx.targets {
		sliceID := uint16(i + 1)
		if meta.IsCopy {
			sliceID = 0
		}
		meta.Daemons[tgt.ID()] = sliceID
	}
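	// At this point meta.Daemons maps target ID -> slice ID. E.g., with EC(d=2, p=2):
	// targets[0] -> 0 (full replica), then 1, 2 (data slices) and 3, 4 (parity slices);
	// for replication (IsCopy) every entry is 0.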

	if meta.IsCopy {
		err = c.replicate(ctx)
	} else {
		err = c.splitAndDistribute(ctx)
	}
	if err != nil {
		return err
	}
	metaBuf := bytes.NewReader(meta.NewPack())
	if err := ctMeta.Write(metaBuf, -1); err != nil {
		return err
	}
	if _, exists := core.T.Bowner().Get().Get(ctMeta.Bck()); !exists {
		if errRm := cos.RemoveFile(ctMeta.FQN()); errRm != nil {
			nlog.Errorf("nested error: encode -> remove metafile: %v", errRm)
		}
		return fmt.Errorf("%s metafile saved while bucket %s was being destroyed", ctMeta.ObjectName(), ctMeta.Bucket())
	}
	return nil
}

func (c *putJogger) ctSendCallback(hdr *transport.ObjHdr, _ io.ReadCloser, _ any, err error) {
	g.smm.Free(hdr.Opaque)
	if err != nil {
		nlog.Errorf("failed to send o[%s]: %v", hdr.Cname(), err)
	}
	c.parent.DecPending()
}

// Removes slices and replicas across the cluster: removes the local metafile
// if it exists and broadcasts the delete request to the other targets
func (c *putJogger) cleanup(lom *core.LOM) error {
	ctMeta := core.NewCTFromLOM(lom, fs.ECMetaType)
	md, err := LoadMetadata(ctMeta.FQN())
	if err != nil {
		if os.IsNotExist(err) {
			// Metafile does not exist = nothing to clean up
			err = nil
		}
		return err
	}
	nodes := md.RemoteTargets()
	if err := cos.RemoveFile(ctMeta.FQN()); err != nil {
		return err
	}

	request := newIntraReq(reqDel, nil, lom.Bck()).NewPack(g.smm)
	o := transport.AllocSend()
	o.Hdr = transport.ObjHdr{ObjName: lom.ObjName, Opaque: request, Opcode: reqDel}
	o.Hdr.Bck.Copy(lom.Bucket())
	o.Callback = c.ctSendCallback
	c.parent.IncPending()
	return c.parent.mgr.req().Send(o, nil, nodes...)
}

// Sends object replicas to targets that must have replicas after the client
// uploads the main replica
func (c *putJogger) createCopies(ctx *encodeCtx) error {
	// generate a list of targets to send the replica to (all except this one)
	nodes := make([]string, 0, len(ctx.targets))
	for _, tgt := range ctx.targets {
		nodes = append(nodes, tgt.ID())
	}

	// broadcast the replica to the targets
	src := &dataSource{
		reader:   ctx.fh,
		size:     ctx.lom.SizeBytes(),
		metadata: ctx.meta,
		reqType:  reqPut,
	}
	return c.parent.writeRemote(nodes, ctx.lom, src, nil)
}

func checksumDataSlices(ctx *encodeCtx, cksmReaders []io.Reader, cksumType string) error {
	debug.Assert(cksumType != "") // caller checks for 'none'
	for i, reader := range cksmReaders {
		_, cksum, err := cos.CopyAndChecksum(io.Discard, reader, nil, cksumType)
		if err != nil {
			return err
		}
		ctx.slices[i].cksum = cksum.Clone()
	}
	return nil
}

// generateSlicesToMemory reads the original object and encodes it into EC slices;
// the parity writers are SGLs, i.e., memory is allocated for the parity slices
func generateSlicesToMemory(ctx *encodeCtx) error {
	var (
		cksumType    = ctx.lom.CksumType()
		initSize     = min(ctx.sliceSize, cos.MiB)
		sliceWriters = make([]io.Writer, ctx.paritySlices)
	)
	for i := range ctx.paritySlices {
		writer := g.pmm.NewSGL(initSize)
		ctx.slices[i+ctx.dataSlices] = &slice{obj: writer}
		if cksumType == cos.ChecksumNone {
			sliceWriters[i] = writer
		} else {
			ctx.cksums[i] = cos.NewCksumHash(cksumType)
			sliceWriters[i] = cos.NewWriterMulti(writer, ctx.cksums[i].H)
		}
	}

	return finalizeSlices(ctx, sliceWriters)
}

func initializeSlices(ctx *encodeCtx) (err error) {
	// readers are sections of the original object (no memory is allocated)
	cksmReaders := make([]io.Reader, ctx.dataSlices)
	sizeLeft := ctx.lom.SizeBytes()
	for i := range ctx.dataSlices {
		var (
			reader     cos.ReadOpenCloser
			cksmReader cos.ReadOpenCloser
			offset     = int64(i) * ctx.sliceSize
		)
		if sizeLeft < ctx.sliceSize {
			reader = cos.NewSectionHandle(ctx.fh, offset, sizeLeft, ctx.padSize)
			cksmReader = cos.NewSectionHandle(ctx.fh, offset, sizeLeft, ctx.padSize)
		} else {
			reader = cos.NewSectionHandle(ctx.fh, offset, ctx.sliceSize, 0)
			cksmReader = cos.NewSectionHandle(ctx.fh, offset, ctx.sliceSize, 0)
		}
		ctx.slices[i] = &slice{obj: ctx.fh, reader: reader}
		cksmReaders[i] = cksmReader
		sizeLeft -= ctx.sliceSize
	}

	// With the data-slice readers established, we can compute their checksums
	// right away - before computing the parity slices and their checksums
	if cksumType := ctx.lom.CksumType(); cksumType != cos.ChecksumNone {
		ctx.cksums = make([]*cos.CksumHash, ctx.paritySlices)
		err = checksumDataSlices(ctx, cksmReaders, cksumType)
	}
	return
}
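// Example (a sketch with toy sizes): an object of size 10 with sliceSize = 4 and
// d=3 yields section readers over [0,4), [4,8), and [8,10), the last one
// zero-padded by padSize = 2 up to the full slice size.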

func finalizeSlices(ctx *encodeCtx, writers []io.Writer) error {
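	// NewStreamC(d, p, true, true) creates a streaming Reed-Solomon encoder with
	// concurrent reads and concurrent writes enabled (klauspost/reedsolomon).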
	stream, err := reedsolomon.NewStreamC(ctx.dataSlices, ctx.paritySlices, true, true)
	if err != nil {
		return err
	}

	// Calculate parity slices and their checksums
	readers := make([]io.Reader, ctx.dataSlices)
	for i := range ctx.dataSlices {
		readers[i] = ctx.slices[i].reader
	}
	if err := stream.Encode(readers, writers); err != nil {
		return err
	}

	if cksumType := ctx.lom.CksumType(); cksumType != cos.ChecksumNone {
		for i := range ctx.cksums {
			ctx.cksums[i].Finalize()
			ctx.slices[i+ctx.dataSlices].cksum = ctx.cksums[i].Clone()
		}
	}
	return nil
}

// generateSlicesToDisk reads the original object and encodes it into EC slices
// written to workfiles on disk
func generateSlicesToDisk(ctx *encodeCtx) error {
	writers := make([]io.Writer, ctx.paritySlices)
	sliceWriters := make([]io.Writer, ctx.paritySlices)

	defer func() {
		for _, wr := range writers {
			if wr == nil {
				continue
			}
			// writer can be only *os.File within this function
			f := wr.(*os.File)
			cos.Close(f)
		}
	}()

	cksumType := ctx.lom.CksumType()
	for i := range ctx.paritySlices {
		workFQN := fs.CSM.Gen(ctx.lom, fs.WorkfileType, fmt.Sprintf("ec-write-%d", i))
		writer, err := ctx.lom.CreateFile(workFQN)
		if err != nil {
			return err
		}
		ctx.slices[i+ctx.dataSlices] = &slice{writer: writer, workFQN: workFQN}
		writers[i] = writer
		if cksumType == cos.ChecksumNone {
			sliceWriters[i] = writer
		} else {
			ctx.cksums[i] = cos.NewCksumHash(cksumType)
			sliceWriters[i] = cos.NewWriterMulti(writer, ctx.cksums[i].H)
		}
	}

	return finalizeSlices(ctx, sliceWriters)
}

func (c *putJogger) sendSlice(ctx *encodeCtx, data *slice, node *meta.Snode, idx int) error {
	// Reopen the slice's reader, because it was read to the end by erasure
	// encoding while calculating parity slices.
	reader, err := ctx.slices[idx].reopenReader()
	if err != nil {
		data.release()
		return err
	}

	mcopy := &Metadata{}
	cos.CopyStruct(mcopy, ctx.meta)
	mcopy.SliceID = idx + 1
	mcopy.ObjVersion = ctx.lom.Version()
	if ctx.slices[idx].cksum != nil {
		mcopy.CksumType, mcopy.CksumValue = ctx.slices[idx].cksum.Get()
	}

	src := &dataSource{
		reader:   reader,
		size:     ctx.sliceSize,
		obj:      data,
		metadata: mcopy,
		isSlice:  true,
		reqType:  reqPut,
	}
	sentCB := func(hdr *transport.ObjHdr, _ io.ReadCloser, _ any, err error) {
		if data != nil {
			data.release()
		}
		if err != nil {
			nlog.Errorln("Failed to send", hdr.Cname()+": ", err)
		}
	}

	return c.parent.writeRemote([]string{node.ID()}, ctx.lom, src, sentCB)
}

// Copies the constructed EC slices to remote targets.
func (c *putJogger) sendSlices(ctx *encodeCtx) (err error) {
	// load the data slices from the original object and construct the parity ones
	if c.toDisk {
		err = generateSlicesToDisk(ctx)
	} else {
		err = generateSlicesToMemory(ctx)
	}

	if err != nil {
		return err
	}

	dataSlice := &slice{refCnt: *atomic.NewInt32(int32(ctx.dataSlices)), obj: ctx.fh}
	// Data slices require no immediate cleanup: each one is just a section
	// reader of the entire file.
	var copyErr error
	for i, tgt := range ctx.targets {
		var sl *slice
		// Each data slice is a section reader of the replica, so the memory is
		// freed only after the last data slice has been sent. Parity slices allocate
		// memory, so their counter is set to 1 and freed immediately after the send.
		if i < ctx.dataSlices {
			sl = dataSlice
		} else {
			sl = &slice{refCnt: *atomic.NewInt32(1), obj: ctx.slices[i].obj, workFQN: ctx.slices[i].workFQN}
		}
		if err := c.sendSlice(ctx, sl, tgt, i); err != nil {
			copyErr = err
		}
	}

	if copyErr != nil {
		nlog.Errorf("Error while copying (data=%d, parity=%d) for %q: %v",
			ctx.dataSlices, ctx.paritySlices, ctx.lom.ObjName, copyErr)
		err = errSliceSendFailed
	} else if cmn.Rom.FastV(4, cos.SmoduleEC) {
		nlog.Infof("EC created (data=%d, parity=%d) for %q",
			ctx.dataSlices, ctx.paritySlices, ctx.lom.ObjName)
	}

	return err
}