github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/reb/ec.go

// Package reb provides global cluster-wide rebalance upon adding/removing storage nodes.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package reb

import (
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sync"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/ec"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/transport"
)

// High-level overview of how EC rebalance works.
// 1. EC traverses only metafile (%mt) directories, with one jogger per mountpath.
// 2. A jogger skips a metafile if:
//    - its `FullReplica` is not the local target ID, or
//    - its `FullReplica` equals the local target ID and HRW chooses the local target
// 3. Otherwise, the jogger calculates the correct target via HRW and moves the CT there.
// 4. A target, on receiving a CT:
// 4.1. Preparation:
//      - update metadata: fix the `Daemons` and `FullReplica` fields
// 4.2. If the target has another CT of the same object and generation (with a
//      different slice ID):
//      - move the local CT to a working directory
// 4.3. Save the received CT and metafile.
// 4.4. If anything was moved to the working directory at step 4.2:
//      - select a target that has no valid CT of the object
//      - move the local CT to the selected target
// 4.5. Finalization:
//      - broadcast the new metadata to all targets in the `Daemons` field so they
//        update their metafiles. Targets do not overwrite their metafiles with the
//        new one; they update only the `Daemons` and `FullReplica` fields.

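// runECjoggers starts one EC jogger per mountpath - first for AIS buckets,
// then for every configured backend provider - and waits for all joggers
// to finish.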
func (reb *Reb) runECjoggers() {
	var (
		wg             = &sync.WaitGroup{}
		availablePaths = fs.GetAvail()
		cfg            = cmn.GCO.Get()
		b              = reb.xctn().Bck()
	)
	for _, mi := range availablePaths {
		bck := cmn.Bck{Provider: apc.AIS}
		if b != nil {
			bck = cmn.Bck{Name: b.Name, Provider: apc.AIS, Ns: b.Ns}
		}
		wg.Add(1)
		go reb.jogEC(mi, &bck, wg)
	}
	for _, provider := range cfg.Backend.Providers {
		for _, mi := range availablePaths {
			bck := cmn.Bck{Provider: provider.Name}
			if b != nil {
				bck = cmn.Bck{Name: b.Name, Provider: provider.Name, Ns: b.Ns}
			}
			wg.Add(1)
			go reb.jogEC(mi, &bck, wg)
		}
	}
	wg.Wait()
}

// mountpath walker - traverses EC metafile (%mt) directories on a given mountpath
func (reb *Reb) jogEC(mi *fs.Mountpath, bck *cmn.Bck, wg *sync.WaitGroup) {
	defer wg.Done()
	opts := &fs.WalkOpts{
		Mi:       mi,
		CTs:      []string{fs.ECMetaType},
		Callback: reb.walkEC,
		Sorted:   false,
	}
	opts.Bck.Copy(bck)
	if err := fs.Walk(opts); err != nil {
		xreb := reb.xctn()
		if xreb.IsAborted() || xreb.Finished() {
			nlog.Infof("aborting traversal")
		} else {
			nlog.Warningf("failed to traverse, err: %v", err)
		}
	}
}

// Sends a local CT along with its EC metadata to the default target.
// The CT is on a local drive and not loaded into SGL. Just read and send.
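// An optional workFQN indicates that the CT was moved to a workfile after a
// slice conflict and is sent with the `rebActMoveCT` action.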
func (reb *Reb) sendFromDisk(ct *core.CT, meta *ec.Metadata, target *meta.Snode, workFQN ...string) (err error) {
	var (
		lom    *core.LOM
		roc    cos.ReadOpenCloser
		fqn    = ct.FQN()
		action = uint32(rebActRebCT)
	)
	debug.Assert(meta != nil)
	if len(workFQN) != 0 {
		fqn = workFQN[0]
		action = rebActMoveCT
	}
	// TODO: unify acquiring a reader for LOM and CT
	if ct.ContentType() == fs.ObjectType {
		lom = core.AllocLOM(ct.ObjectName())
		if err = lom.InitBck(ct.Bck().Bucket()); err != nil {
			core.FreeLOM(lom)
			return
		}
		lom.Lock(false)
		if err = lom.Load(false /*cache it*/, true /*locked*/); err != nil {
			lom.Unlock(false)
			core.FreeLOM(lom)
			return
		}
	} else {
		lom = nil // sending slice; TODO: rlock
	}

	// open
	if lom != nil {
		defer core.FreeLOM(lom)
		roc, err = lom.NewDeferROC()
	} else {
		roc, err = cos.NewFileHandle(fqn)
	}
	if err != nil {
		return
	}

	// transmit
	ntfn := stageNtfn{daemonID: core.T.SID(), stage: rebStageTraverse, rebID: reb.rebID.Load(), md: meta, action: action}
	o := transport.AllocSend()
	o.Hdr = transport.ObjHdr{ObjName: ct.ObjectName(), ObjAttrs: cmn.ObjAttrs{Size: meta.Size}}
	o.Hdr.Bck.Copy(ct.Bck().Bucket())
	if lom != nil {
		o.Hdr.ObjAttrs.CopyFrom(lom.ObjAttrs(), false /*skip cksum*/)
	}
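	// for a slice, the wire size is the slice's size - not the full object size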
	if meta.SliceID != 0 {
		o.Hdr.ObjAttrs.Size = ec.SliceSize(meta.Size, meta.Data)
	}
	reb.onAir.Inc()
	o.Hdr.Opaque = ntfn.NewPack(rebMsgEC)
	o.Callback = reb.transportECCB
	if err = reb.dm.Send(o, roc, target); err != nil {
		err = fmt.Errorf("failed to send slices to nodes [%s..]: %v", target.ID(), err)
		return
	}
	xreb := reb.xctn()
	xreb.OutObjsAdd(1, o.Hdr.ObjAttrs.Size)
	return
}

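// transport completion callback: decrements the number of in-flight sends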
func (reb *Reb) transportECCB(_ *transport.ObjHdr, _ io.ReadCloser, _ any, _ error) {
	reb.onAir.Dec()
}

// Saves a received CT to a local drive if needed, i.e., when:
//  1. A full object/replica is received, or
//  2. A CT is received and this target is not the default one (meaning the CT
//     came from the default target after EC had been rebuilt)
func (reb *Reb) saveCTToDisk(ntfn *stageNtfn, hdr *transport.ObjHdr, data io.Reader) error {
	cos.Assert(ntfn.md != nil)
	var (
		err error
		bck = meta.CloneBck(&hdr.Bck)
	)
	if err := bck.Init(core.T.Bowner()); err != nil {
		return err
	}
	md := ntfn.md.NewPack()
	if ntfn.md.SliceID != 0 {
		args := &ec.WriteArgs{Reader: data, MD: md, Xact: reb.xctn()}
		err = ec.WriteSliceAndMeta(hdr, args)
	} else {
		var lom *core.LOM
		lom, err = core.AllocLomFromHdr(hdr)
		if err == nil {
			args := &ec.WriteArgs{Reader: data, MD: md, Cksum: hdr.ObjAttrs.Cksum, Xact: reb.xctn()}
			err = ec.WriteReplicaAndMeta(lom, args)
		}
		core.FreeLOM(lom)
	}
	return err
}

// Used when a slice conflict is detected: a target receives a new slice and it
// already has a slice of the same generation with a different ID
func (*Reb) renameAsWorkFile(ct *core.CT) (string, error) {
	fqn := ct.Make(fs.WorkfileType)
	// Using os.Rename is safe as both the CT and the workfile are on the same mountpath
	if err := os.Rename(ct.FQN(), fqn); err != nil {
		return "", err
	}
	return fqn, nil
}

// Find a target that has either an obsolete slice or no slice of the object.
// Used to resolve the conflict: this target is the "main" one (has a full
// replica) but it also stores a slice of the object. So, the existing slice
// goes to any other _free_ target.
func (reb *Reb) findEmptyTarget(md *ec.Metadata, ct *core.CT, sender string) (*meta.Snode, error) {
	var (
		sliceCnt     = md.Data + md.Parity + 2
		smap         = reb.smap.Load()
		hrwList, err = smap.HrwTargetList(ct.Bck().MakeUname(ct.ObjectName()), sliceCnt)
	)
	if err != nil {
		return nil, err
	}
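	// Walk targets in HRW order; a target is _free_ if it has no metadata for
	// the object, has an older generation, or has the same generation while
	// not being listed in the object's `Daemons`.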
	for _, tsi := range hrwList {
		if tsi.ID() == sender || tsi.ID() == core.T.SID() {
			continue
		}
		remoteMD, err := ec.RequestECMeta(ct.Bucket(), ct.ObjectName(), tsi, core.T.DataClient())
		if remoteMD != nil && remoteMD.Generation < md.Generation {
			return tsi, nil
		}
		if remoteMD != nil && remoteMD.Generation == md.Generation {
			_, ok := md.Daemons[tsi.ID()]
			if !ok {
				// same generation, but tsi is not listed in `Daemons` - treat it as free
				return tsi, nil
			}
		}
		if err != nil && cos.IsNotExist(err, 0) {
			return tsi, nil
		}
		if err != nil {
			nlog.Errorf("Failed to read metadata from %s: %v", tsi.StringEx(), err)
		}
	}
	return nil, errors.New("no _free_ targets")
}

// Check if this target has metadata for the received CT
func detectLocalCT(req *stageNtfn, ct *core.CT) (*ec.Metadata, error) {
	if req.action == rebActMoveCT {
		// internal CT move after slice conflict - save always
		return nil, nil
	}
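	// this target is not listed in the received metadata - nothing to check locally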
	if _, ok := req.md.Daemons[core.T.SID()]; !ok {
		return nil, nil
	}
	mdCT, err := core.NewCTFromBO(ct.Bck().Bucket(), ct.ObjectName(), core.T.Bowner(), fs.ECMetaType)
	if err != nil {
		return nil, err
	}
	locMD, err := ec.LoadMetadata(mdCT.FQN())
	if err != nil && os.IsNotExist(err) {
		err = nil
	}
	return locMD, err
}

// When a target receives a slice and already has a slice of the same generation
// with a different ID:
// - move the local slice to a workfile directory
// - return the Snode that must receive the local slice, and the workfile path
// - the caller saves the received CT to local drives, and then sends the workfile
func (reb *Reb) renameLocalCT(req *stageNtfn, ct *core.CT, md *ec.Metadata) (
	workFQN string, moveTo *meta.Snode, err error) {
	if md == nil || req.action == rebActMoveCT {
		return
	}
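	// no conflict: the local CT is a full replica, has the same slice ID, or
	// belongs to a different generation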
	if md.SliceID == 0 || md.SliceID == req.md.SliceID || req.md.Generation != md.Generation {
		return
	}
	if workFQN, err = reb.renameAsWorkFile(ct); err != nil {
		return
	}
	if moveTo, err = reb.findEmptyTarget(md, ct, req.daemonID); err != nil {
		if errMv := os.Rename(workFQN, ct.FQN()); errMv != nil {
			nlog.Errorf("Error restoring slice: %v", errMv)
		}
	}
	return
}

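// walkEC is the fs.Walk callback: for every EC metafile it loads the metadata,
// skips CTs this target is not responsible for, and sends the local CT (slice
// or replica) to its HRW target.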
func (reb *Reb) walkEC(fqn string, de fs.DirEntry) error {
	xreb := reb.xctn()
	if err := xreb.AbortErr(); err != nil {
		// notify `dir.Walk` to stop iterations
		nlog.Infoln(xreb.Name(), "walk-ec aborted", err)
		return err
	}

	if de.IsDir() {
		return nil
	}

	ct, err := core.NewCTFromFQN(fqn, core.T.Bowner())
	if err != nil {
		return nil
	}
	// do not touch directories for buckets with EC disabled (for now)
	if !ct.Bck().Props.EC.Enabled {
		return filepath.SkipDir
	}

	md, err := ec.LoadMetadata(fqn)
	if err != nil {
		nlog.Warningf("failed to load %q metadata: %v", fqn, err)
		return nil
	}

	// Skip a CT if this target is not the 'main' one
	if md.FullReplica != core.T.SID() {
		return nil
	}

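	// if HRW selects this same target, the CT is already in place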
	smap := reb.smap.Load()
	hrwTarget, err := smap.HrwHash2T(ct.Digest())
	if err != nil || hrwTarget.ID() == core.T.SID() {
		return err
	}

	// make sure the slice/replica described by the metafile actually exists
	isReplica := md.SliceID == 0
	var fileFQN string
	if isReplica {
		fileFQN = ct.Make(fs.ObjectType)
	} else {
		fileFQN = ct.Make(fs.ECSliceType)
	}
	if err := cos.Stat(fileFQN); err != nil {
		nlog.Warningf("%s no CT for metadata[%d]: %s", core.T, md.SliceID, fileFQN)
		return nil
	}

	ct, err = core.NewCTFromFQN(fileFQN, core.T.Bowner())
	if err != nil {
		return nil
	}
	return reb.sendFromDisk(ct, md, hrwTarget)
}