github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/reb/recv.go (about)

     1  // Package reb provides global cluster-wide rebalance upon adding/removing storage nodes.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package reb
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"io"
    11  	"os"
    12  
    13  	"github.com/NVIDIA/aistore/cmn"
    14  	"github.com/NVIDIA/aistore/cmn/cos"
    15  	"github.com/NVIDIA/aistore/cmn/debug"
    16  	"github.com/NVIDIA/aistore/cmn/nlog"
    17  	"github.com/NVIDIA/aistore/core"
    18  	"github.com/NVIDIA/aistore/core/meta"
    19  	"github.com/NVIDIA/aistore/ec"
    20  	"github.com/NVIDIA/aistore/fs"
    21  	"github.com/NVIDIA/aistore/transport"
    22  )
    23  
    24  // TODO: currently, cannot return errors from the receive handlers, here and elsewhere
    25  //       (see `_regRecv` for "static lifecycle")
    26  
    27  func (reb *Reb) _recvErr(err error) error {
    28  	if err == nil {
    29  		return err
    30  	}
    31  	if xreb := reb.xctn(); xreb != nil {
    32  		xreb.Abort(err)
    33  	}
    34  	return nil
    35  }
    36  
    37  func (reb *Reb) recvObj(hdr *transport.ObjHdr, objReader io.Reader, err error) error {
    38  	defer transport.DrainAndFreeReader(objReader)
    39  	if err != nil {
    40  		nlog.Errorln(err)
    41  		return err
    42  	}
    43  
    44  	smap, err := reb._waitForSmap()
    45  	if err != nil {
    46  		return reb._recvErr(err)
    47  	}
    48  	unpacker := cos.NewUnpacker(hdr.Opaque)
    49  	act, err := unpacker.ReadByte()
    50  	if err != nil {
    51  		nlog.Errorf("Failed to read message type: %v", err)
    52  		return reb._recvErr(err)
    53  	}
    54  	if act == rebMsgRegular {
    55  		err := reb.recvObjRegular(hdr, smap, unpacker, objReader)
    56  		return reb._recvErr(err)
    57  	}
    58  	debug.Assertf(act == rebMsgEC, "act=%d", act)
    59  	err = reb.recvECData(hdr, unpacker, objReader)
    60  	return reb._recvErr(err)
    61  }
    62  
    63  func (reb *Reb) recvAck(hdr *transport.ObjHdr, _ io.Reader, err error) error {
    64  	if err != nil {
    65  		nlog.Errorln(err)
    66  		return err
    67  	}
    68  
    69  	unpacker := cos.NewUnpacker(hdr.Opaque)
    70  	act, err := unpacker.ReadByte()
    71  	if err != nil {
    72  		err = fmt.Errorf("failed to read message type: %v", err)
    73  		return reb._recvErr(err)
    74  	}
    75  	if act == rebMsgEC {
    76  		err := reb.recvECAck(hdr, unpacker)
    77  		return reb._recvErr(err)
    78  	}
    79  	debug.Assertf(act == rebMsgRegular, "act=%d", act)
    80  	err = reb.recvRegularAck(hdr, unpacker)
    81  	return reb._recvErr(err)
    82  }
    83  
    84  func (reb *Reb) recvStageNtfn(hdr *transport.ObjHdr, _ io.Reader, errRx error) error {
    85  	if errRx != nil {
    86  		nlog.Errorf("%s: %v", core.T, errRx)
    87  		return errRx
    88  	}
    89  	ntfn, err := reb.decodeStageNtfn(hdr.Opaque)
    90  	if err != nil {
    91  		return reb._recvErr(err)
    92  	}
    93  
    94  	var (
    95  		rebID      = reb.RebID()
    96  		rsmap      = reb.smap.Load()
    97  		otherStage = stages[ntfn.stage]
    98  		xreb       = reb.xctn()
    99  	)
   100  	if xreb == nil {
   101  		if reb.stages.stage.Load() != rebStageInactive {
   102  			nlog.Errorf("%s: nil rebalancing xaction", reb.logHdr(rebID, rsmap))
   103  		}
   104  		return nil
   105  	}
   106  	if xreb.IsAborted() {
   107  		return nil
   108  	}
   109  
   110  	// TODO: see "static lifecycle" comment above
   111  
   112  	// eq
   113  	if rebID == ntfn.rebID {
   114  		reb.stages.setStage(ntfn.daemonID, ntfn.stage)
   115  		if ntfn.stage == rebStageAbort {
   116  			err := fmt.Errorf("abort stage notification from %s(%s)", meta.Tname(ntfn.daemonID), otherStage)
   117  			xreb.Abort(cmn.NewErrAborted(xreb.Name(), reb.logHdr(rebID, rsmap), err))
   118  		}
   119  		return nil
   120  	}
   121  	// other's old
   122  	if rebID > ntfn.rebID {
   123  		nlog.Warningf("%s: stage notification from %s(%s): %s", reb.logHdr(rebID, rsmap),
   124  			meta.Tname(ntfn.daemonID), otherStage, reb.warnID(ntfn.rebID, ntfn.daemonID))
   125  		return nil
   126  	}
   127  
   128  	xreb.Abort(cmn.NewErrAborted(xreb.Name(), reb.logHdr(rebID, rsmap), err))
   129  	return nil
   130  }
   131  
   132  //
   133  // regular (non-EC) receive
   134  //
   135  
   136  func (reb *Reb) recvObjRegular(hdr *transport.ObjHdr, smap *meta.Smap, unpacker *cos.ByteUnpack, objReader io.Reader) error {
   137  	ack := &regularAck{}
   138  	if err := unpacker.ReadAny(ack); err != nil {
   139  		nlog.Errorf("Failed to parse ACK: %v", err)
   140  		return err
   141  	}
   142  	if ack.rebID != reb.RebID() {
   143  		nlog.Warningf("received %s: %s", hdr.Cname(), reb.warnID(ack.rebID, ack.daemonID))
   144  		return nil
   145  	}
   146  	tsid := ack.daemonID // the sender
   147  	// Rx
   148  	lom := core.AllocLOM(hdr.ObjName)
   149  	defer core.FreeLOM(lom)
   150  	if err := lom.InitBck(&hdr.Bck); err != nil {
   151  		nlog.Errorln(err)
   152  		return nil
   153  	}
   154  	if stage := reb.stages.stage.Load(); stage >= rebStageFin {
   155  		reb.laterx.Store(true)
   156  		if stage > rebStageFin && cmn.Rom.FastV(4, cos.SmoduleReb) {
   157  			nlog.Infof("Warning: %s: post stage-fin receive from %s %s (stage %s)",
   158  				core.T.Snode(), meta.Tname(tsid), lom, stages[stage])
   159  		}
   160  	} else if stage < rebStageTraverse {
   161  		nlog.Errorf("%s: early receive from %s %s (stage %s)", core.T, meta.Tname(tsid), lom, stages[stage])
   162  	}
   163  	lom.CopyAttrs(&hdr.ObjAttrs, true /*skip-checksum*/) // see "PUT is a no-op"
   164  	xreb := reb.xctn()
   165  	if xreb.IsAborted() {
   166  		return nil
   167  	}
   168  	params := core.AllocPutParams()
   169  	{
   170  		params.WorkTag = fs.WorkfilePut
   171  		params.Reader = io.NopCloser(objReader)
   172  		params.OWT = cmn.OwtRebalance
   173  		params.Cksum = hdr.ObjAttrs.Cksum
   174  		params.Atime = lom.Atime()
   175  		params.Xact = xreb
   176  	}
   177  	erp := core.T.PutObject(lom, params)
   178  	core.FreePutParams(params)
   179  	if erp != nil {
   180  		nlog.Errorln(erp)
   181  		return erp
   182  	}
   183  	// stats
   184  	xreb.InObjsAdd(1, hdr.ObjAttrs.Size)
   185  
   186  	// ACK
   187  	tsi := smap.GetTarget(tsid)
   188  	if tsi == nil {
   189  		err := fmt.Errorf("%s is not in the %s", meta.Tname(tsid), smap)
   190  		nlog.Errorln(err)
   191  		return err
   192  	}
   193  	if stage := reb.stages.stage.Load(); stage < rebStageFinStreams && stage != rebStageInactive {
   194  		ack := &regularAck{rebID: reb.RebID(), daemonID: core.T.SID()}
   195  		hdr.Opaque = ack.NewPack()
   196  		hdr.ObjAttrs.Size = 0
   197  		if err := reb.dm.ACK(hdr, nil, tsi); err != nil {
   198  			nlog.Errorln(err)
   199  			return err
   200  		}
   201  	}
   202  	return nil
   203  }
   204  
   205  func (reb *Reb) recvRegularAck(hdr *transport.ObjHdr, unpacker *cos.ByteUnpack) error {
   206  	ack := &regularAck{}
   207  	if err := unpacker.ReadAny(ack); err != nil {
   208  		nlog.Errorf("Failed to parse ACK: %v", err)
   209  		return err
   210  	}
   211  	if ack.rebID != reb.rebID.Load() {
   212  		nlog.Warningf("ACK from %s: %s", ack.daemonID, reb.warnID(ack.rebID, ack.daemonID))
   213  		return nil
   214  	}
   215  
   216  	lom := core.AllocLOM(hdr.ObjName)
   217  	if err := lom.InitBck(&hdr.Bck); err != nil {
   218  		core.FreeLOM(lom)
   219  		nlog.Errorln(err)
   220  		return nil
   221  	}
   222  
   223  	// No immediate file deletion: let LRU cleanup the "misplaced" object
   224  	// TODO: mark the object "Deleted"
   225  
   226  	reb.delLomAck(lom, ack.rebID, true /*free pending (orig) transmitted LOM*/)
   227  	core.FreeLOM(lom)
   228  	return nil
   229  }
   230  
   231  //
   232  // EC receive
   233  //
   234  
   235  func (*Reb) recvECAck(hdr *transport.ObjHdr, unpacker *cos.ByteUnpack) (err error) {
   236  	ack := &ecAck{}
   237  	err = unpacker.ReadAny(ack)
   238  	if err != nil {
   239  		nlog.Errorf("Failed to unmarshal EC ACK for %s: %v", hdr.Cname(), err)
   240  	}
   241  	return
   242  }
   243  
   244  // Receive MD update. Handling includes partially updating local information:
   245  // only the list of daemons and the _main_ target.
   246  func receiveMD(req *stageNtfn, hdr *transport.ObjHdr) error {
   247  	ctMeta, err := core.NewCTFromBO(&hdr.Bck, hdr.ObjName, core.T.Bowner(), fs.ECMetaType)
   248  	if err != nil {
   249  		return err
   250  	}
   251  	md, err := ec.LoadMetadata(ctMeta.FQN())
   252  	if err != nil {
   253  		if os.IsNotExist(err) {
   254  			err = nil
   255  		}
   256  		return err
   257  	}
   258  	if md.Generation != req.md.Generation {
   259  		return nil
   260  	}
   261  	md.FullReplica = req.md.FullReplica
   262  	md.Daemons = req.md.Daemons
   263  	mdBytes := md.NewPack()
   264  
   265  	return ctMeta.Write(bytes.NewReader(mdBytes), -1)
   266  }
   267  
   268  func (reb *Reb) receiveCT(req *stageNtfn, hdr *transport.ObjHdr, reader io.Reader) error {
   269  	ct, err := core.NewCTFromBO(&hdr.Bck, hdr.ObjName, core.T.Bowner(), fs.ECSliceType)
   270  	if err != nil {
   271  		return err
   272  	}
   273  	md, err := detectLocalCT(req, ct)
   274  	if err != nil {
   275  		nlog.Errorf("%s: %v", ct.FQN(), err)
   276  		return err
   277  	}
   278  	// Fix the metadata: update CT locations
   279  	delete(req.md.Daemons, req.daemonID)
   280  	if md != nil && req.md.Generation < md.Generation {
   281  		// Local CT is newer - do not save anything
   282  		return nil
   283  	}
   284  	// Check for slice conflict
   285  	workFQN, moveTo, err := reb.renameLocalCT(req, ct, md)
   286  	if err != nil {
   287  		return err
   288  	}
   289  	req.md.FullReplica = core.T.SID()
   290  	req.md.Daemons[core.T.SID()] = uint16(req.md.SliceID)
   291  	if moveTo != nil {
   292  		req.md.Daemons[moveTo.ID()] = uint16(md.SliceID)
   293  	}
   294  	// Save received CT to local drives
   295  	err = reb.saveCTToDisk(req, hdr, reader)
   296  	if err != nil {
   297  		if errRm := os.Remove(ct.FQN()); errRm != nil {
   298  			nlog.Errorf("Failed to remove %s: %v", ct.FQN(), errRm)
   299  		}
   300  		if moveTo != nil {
   301  			if errMv := os.Rename(workFQN, ct.FQN()); errMv != nil {
   302  				nlog.Errorf("Error restoring slice: %v", errMv)
   303  			}
   304  		}
   305  		return err
   306  	}
   307  	// Send local slice
   308  	if moveTo != nil {
   309  		req.md.SliceID = md.SliceID
   310  		if err = reb.sendFromDisk(ct, req.md, moveTo, workFQN); err != nil {
   311  			nlog.Errorf("Failed to move slice to %s: %v", moveTo, err)
   312  		}
   313  	}
   314  	// Broadcast updated MD
   315  	ntfnMD := stageNtfn{daemonID: core.T.SID(), stage: rebStageTraverse, rebID: reb.rebID.Load(), md: req.md, action: rebActUpdateMD}
   316  	nodes := req.md.RemoteTargets()
   317  	for _, tsi := range nodes {
   318  		if moveTo != nil && moveTo.ID() == tsi.ID() {
   319  			continue
   320  		}
   321  		reb.onAir.Inc()
   322  		xreb := reb.xctn()
   323  		if xreb.IsAborted() {
   324  			break
   325  		}
   326  		o := transport.AllocSend()
   327  		o.Hdr = transport.ObjHdr{ObjName: ct.ObjectName(), ObjAttrs: cmn.ObjAttrs{Size: 0}}
   328  		o.Hdr.Bck.Copy(ct.Bck().Bucket())
   329  		o.Hdr.Opaque = ntfnMD.NewPack(rebMsgEC)
   330  		o.Callback = reb.transportECCB
   331  		if errSend := reb.dm.Send(o, nil, tsi); errSend != nil && err == nil {
   332  			err = fmt.Errorf("failed to send updated metafile: %v", err)
   333  		}
   334  	}
   335  	return err
   336  }
   337  
   338  // receiving EC CT
   339  func (reb *Reb) recvECData(hdr *transport.ObjHdr, unpacker *cos.ByteUnpack, reader io.Reader) error {
   340  	req := &stageNtfn{}
   341  	err := unpacker.ReadAny(req)
   342  	if err != nil {
   343  		nlog.Errorf("invalid stage notification %s: %v", hdr.ObjName, err)
   344  		return err
   345  	}
   346  	if req.rebID != reb.rebID.Load() {
   347  		nlog.Warningf("%s: not yet started or already finished rebalancing (%d, %d)",
   348  			core.T.Snode(), req.rebID, reb.rebID.Load())
   349  		return nil
   350  	}
   351  	if req.action == rebActUpdateMD {
   352  		err := receiveMD(req, hdr)
   353  		if err != nil {
   354  			nlog.Errorf("failed to receive MD for %s: %v", hdr.Cname(), err)
   355  			nlog.Errorf("Warning: (g%d, %s) ignoring, proceeding anyway...", req.rebID, core.T) // TODO: revisit
   356  		}
   357  		return nil
   358  	}
   359  	if err := reb.receiveCT(req, hdr, reader); err != nil {
   360  		nlog.Errorf("failed to receive CT for %s: %v", hdr.Cname(), err)
   361  		return err
   362  	}
   363  	return nil
   364  }