github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ec/ec.go

// Package ec provides erasure coding (EC) based data protection for AIStore.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package ec

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"sync"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/memsys"
	"github.com/NVIDIA/aistore/transport"
	"github.com/NVIDIA/aistore/xact/xreg"
)

// EC module provides data protection on a per-bucket basis. By default, the
// data protection is off. To enable it, set the bucket EC configuration:
//	ECConf:
//		Enable: true|false    # enables or disables protection
//		DataSlices: [1-32]    # the number of data slices
//		ParitySlices: [1-32]  # the number of parity slices
//		ObjSizeLimit: 0       # replication versus erasure coding
//
// NOTE: replicating a small object is cheaper than erasure encoding it.
// The ObjSizeLimit option sets the corresponding threshold. Set it to a
// size (in bytes), or 0 (zero) to use the AIStore default of 256KiB.
//
// NOTE: ParitySlices defines the maximum number of storage targets a cluster
// can lose while still being able to restore the original object.
//
// NOTE: Since small objects are always replicated, they always have exactly one
// data slice and #ParitySlices replicas.
//
// NOTE: All slices and replicas must reside on different targets. The target
// list is calculated by HrwTargetList. The first target in the list is the
// "main" target that keeps the full object; the others keep only slices/replicas.
//
// NOTE: All slices must be of the same size, so the last slice can be padded
// with zeros. In most cases, padding makes the total size of the data
// slices slightly bigger than the size of the original object.
//
// NOTE: Every slice and replica must have a corresponding metadata file that is
// located in the same mountpath as its slice/replica.
//
// EC local storage directories inside mountpaths:
//		/%ob/ - for the main object and its replicas
//		/%ec/ - for object data and parity slices
//		/%mt/ - for metadata files
//
// How protection works.
//
// Object PUT:
// 1. The main target - the target responsible for keeping the full object
//    data and for restoring the object if damaged - is selected by
//    HrwTarget. A proxy delegates the object PUT request to it.
// 2. The main target calculates all other targets that keep slices/replicas.
//    For small objects it is #ParitySlices targets, for big ones it is
//    #DataSlices+#ParitySlices targets (see the sketch that follows this comment).
// 3. If the object is small, the main target broadcasts the replicas.
//    Otherwise, the target calculates data and parity slices, then sends them.
//
// Object GET:
// 1. The main target - the target responsible for keeping the full object
//    data and for restoring the object if it becomes damaged - is determined by
//    the HrwTarget algorithm. A proxy delegates the object GET request to it.
// 2. If the main target has the original object, it sends the data back.
//    Otherwise, it tries to look the object up on other mountpaths (if resilver
//    is running) or on remote targets (if rebalance is running).
// 3. If everything fails and EC is enabled for the bucket, the main target
//    initiates the object restoration process:
//    - First, the main target requests the object's metafile from all targets
//      in the cluster. If no target responds with a valid metafile, the object
//      is considered missing.
//    - Otherwise, the main target tries to download and restore the original data:
//      Replica case:
//        The main target requests, one by one, the targets that have a valid
//        metafile for a replica. When a target sends a valid object, the main
//        target saves the object to local storage and re-uploads its replicas
//        to the targets.
//      EC case:
//        The main target requests, in parallel, the targets that have a valid
//        metafile for slices. When all the targets respond, the main target starts
//        restoring the object and, in case of success, saves the restored object
//        to local storage and sends recalculated data and parity slices to the
//        targets that must have a slice but are 'empty' at this moment.
// NOTE: the slices are stored on targets in random order, except for the first
//       PUT, when the main target stores the slices in the order that the
//       HrwTargetList algorithm returns.

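// For illustration only - a minimal sketch (not part of the package API) of how the
// replicate-vs-encode decision above translates into the number of involved targets.
// `targetCnt` is a hypothetical helper; the ECConf field names follow the
// configuration note above, and IsECCopy is defined later in this file:
//
//	func targetCnt(size int64, conf *cmn.ECConf) int {
//		if IsECCopy(size, conf) {
//			// small object: the main target plus #ParitySlices full replicas
//			return 1 + conf.ParitySlices
//		}
//		// large object: the main target plus one target per data and parity slice
//		return 1 + conf.DataSlices + conf.ParitySlices
//	}
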
const (
	ActSplit   = "split"
	ActRestore = "restore"
	ActDelete  = "delete"

	RespStreamName = "ec-resp"
	ReqStreamName  = "ec-req"

	ActClearRequests  = "clear-requests"
	ActEnableRequests = "enable-requests"

	URLCT   = "ct"   // for use in URL paths - requests for slices/replicas
	URLMeta = "meta" // ditto - metadata requests

	// EC switches from SGL to disk when memory pressure is high and the amount of
	// memory required to encode an object exceeds the limit
	objSizeHighMem = 50 * cos.MiB
)

type (
	// request - structure to request an object to be EC'ed or restored
	request struct {
		LIF      core.LIF   // object info
		Action   string     // what to do with the object (see Act* consts)
		ErrCh    chan error // for final EC result (used only in restore)
		Callback core.OnFinishObj

		putTime time.Time // time when the object is put into the main queue
		tm      time.Time // to measure different steps
		IsCopy  bool      // replicate or use erasure coding
		rebuild bool      // true - internal request to reencode, e.g., from ec-encode xaction
	}

	RequestsControlMsg struct {
		Action string
	}

	WriteArgs struct {
		MD         []byte     // CT's metafile content
		Reader     io.Reader  // CT content
		BID        uint64     // bucket ID
		Cksum      *cos.Cksum // object checksum
		Generation int64      // EC Generation
		Xact       core.Xact  // xaction that drives it
	}

	// temporarily keeps a slice of object data until it is sent to a remote node
	slice struct {
		obj     cos.ReadOpenCloser // the whole object or its replica
		reader  cos.ReadOpenCloser // used in encoding - a slice of `obj`
		writer  io.Writer          // for parity slices and downloading slices from other targets when restoring
		twg     *cos.TimeoutGroup  // for synchronous download (when restoring from slices)
		lom     *core.LOM          // for xattrs
		n       int64              // number of bytes sent/received
		refCnt  atomic.Int32       // number of references
		workFQN string             // FQN for temporary slice/replica
		cksum   *cos.Cksum         // checksum of the slice
		version string             // version of the remote object
	}

	// a source for a data response: the data to send to the caller.
	// If obj is not nil, then after the reader is sent to the remote target
	// the obj's counter is decreased; if its value drops to zero, the
	// allocated SGL is freed. This logic is required to send a set of
	// sliceReaders that point to the same SGL (when broadcasting data slices)
	dataSource struct {
		reader   cos.ReadOpenCloser // a reader to send to a remote target
		size     int64              // size of the data
		obj      *slice             // internal info about the SGL slice
		metadata *Metadata          // object's metadata
		isSlice  bool               // is it a slice or a replica
		reqType  intraReqType       // request's type: slice/meta request/response
	}
)

type global struct {
	reqPool  sync.Pool
	pmm      *memsys.MMSA // memory manager slab/SGL allocator (pages)
	smm      *memsys.MMSA // ditto, bytes
	emptyReq request
}

var g global

var (
	ErrorECDisabled = errors.New("EC is disabled for bucket")
	ErrorNoMetafile = errors.New("no metafile")
	ErrorNotFound   = errors.New("not found")
)

func Init() {
	g.pmm = core.T.PageMM()
	g.smm = core.T.ByteMM()

	fs.CSM.Reg(fs.ECSliceType, &fs.ECSliceContentResolver{})
	fs.CSM.Reg(fs.ECMetaType, &fs.ECMetaContentResolver{})

	xreg.RegBckXact(&getFactory{})
	xreg.RegBckXact(&putFactory{})
	xreg.RegBckXact(&rspFactory{})
	xreg.RegBckXact(&encFactory{})

	if err := initManager(); err != nil {
		cos.ExitLogf("Failed to init manager: %v", err)
	}
}

///////////
// slice //
///////////

// Frees allocated memory and removes the slice's temporary file
func (s *slice) free() {
	freeObject(s.obj)
	s.obj = nil
	if s.reader != nil {
		cos.Close(s.reader)
	}
	if s.writer != nil {
		switch w := s.writer.(type) {
		case *os.File:
			cos.Close(w)
		case *memsys.SGL:
			w.Free()
		default:
			debug.FailTypeCast(s.writer)
		}
	}
	if s.workFQN != "" {
		if err := os.Remove(s.workFQN); err != nil && !os.IsNotExist(err) {
			nlog.Errorln(err)
		}
	}
}

// Decreases the number of references to the object (the initial number is set
// at slice creation time). If the number drops to zero, the allocated
// memory/temporary file is cleaned up
func (s *slice) release() {
	if s.obj != nil || s.workFQN != "" {
		refCnt := s.refCnt.Dec()
		if refCnt < 1 {
			s.free()
		}
	}
}

func (s *slice) reopenReader() (reader cos.ReadOpenCloser, err error) {
	if s.reader != nil {
		var rc io.ReadCloser
		reader = s.reader
		switch r := reader.(type) {
		case *memsys.Reader:
			_, err = r.Seek(0, io.SeekStart)
		case *cos.SectionHandle:
			rc, err = r.Open()
			if err == nil {
				reader = rc.(cos.ReadOpenCloser)
			}
		default:
			debug.FailTypeCast(s.reader)
			err = fmt.Errorf("unsupported reader type: %T", s.reader)
		}
		return reader, err
	}

	if sgl, ok := s.obj.(*memsys.SGL); ok {
		reader = memsys.NewReader(sgl)
	} else if s.workFQN != "" {
		reader, err = cos.NewFileHandle(s.workFQN)
	} else {
		debug.FailTypeCast(s.obj)
		err = fmt.Errorf("unsupported obj type: %T", s.obj)
	}
	return reader, err
}

//
// misc. utils
//

func allocateReq(action string, lif core.LIF) (req *request) {
	if v := g.reqPool.Get(); v != nil {
		req = v.(*request)
	} else {
		req = &request{}
	}
	req.Action = action
	req.LIF = lif
	return
}

func freeReq(req *request) {
	*req = g.emptyReq
	g.reqPool.Put(req)
}

// SliceSize returns the size of one slice that EC will create for the object
func SliceSize(fileSize int64, slices int) int64 {
	return (fileSize + int64(slices) - 1) / int64(slices)
}

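// For illustration - a worked example of the zero-padding noted in the package
// comment above (the numbers are an example only; nothing in the package computes them here):
//
//	sliceSize := SliceSize(1<<30, 3)   // = 357913942 for a 1 GiB object and 3 data slices
//	padding := sliceSize*3 - (1 << 30) // = 2 bytes of zeros appended to the last slice
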
// Monitoring the background transfer of replicas and slices requires
// a unique ID for each of them. Because all replicas/slices of an object have
// the same names, cluster.Uname is not enough to generate a unique ID. Adding an
// extra prefix - an identifier of the destination - solves the issue
func unique(prefix string, bck *meta.Bck, objName string) string {
	return prefix + cos.PathSeparator + bck.MakeUname(objName)
}

func IsECCopy(size int64, ecConf *cmn.ECConf) bool {
	return size < ecConf.ObjSizeLimit || ecConf.ObjSizeLimit == cmn.ObjSizeToAlwaysReplicate
}

// Returns whether EC must use the disk instead of keeping everything in memory.
// Depends on the available free memory and the size of the object to process
func useDisk(objSize int64, config *cmn.Config) bool {
	if config.EC.DiskOnly {
		return true
	}
	memPressure := g.pmm.Pressure()
	switch memPressure {
	case memsys.OOM, memsys.PressureExtreme:
		return true
	case memsys.PressureHigh:
		return objSize > objSizeHighMem
	default:
		return false
	}
}

// Frees the allocated memory if it is an SGL, or closes the file handle if it
// is a regular file
func freeObject(r any) {
	if r == nil {
		return
	}
	switch handle := r.(type) {
	case *memsys.SGL:
		if handle != nil {
			handle.Free()
		}
	case *cos.FileHandle:
		if handle != nil {
			// a few slices may share the same handle; on error, all of them release everything
			_ = handle.Close()
		}
	case *os.File:
		if handle != nil {
			cos.Close(handle)
		}
	default:
		debug.FailTypeCast(r)
	}
}

// Frees all temporary slices in case of an erasure coding failure
func freeSlices(slices []*slice) {
	for _, s := range slices {
		if s != nil {
			s.free()
		}
	}
}

// RequestECMeta returns an EC metadata found on a remote target.
func RequestECMeta(bck *cmn.Bck, objName string, si *meta.Snode, client *http.Client) (*Metadata, error) {
	path := apc.URLPathEC.Join(URLMeta, bck.Name, objName)
	query := url.Values{}
	query = bck.AddToQuery(query)
	url := si.URL(cmn.NetIntraData) + path
	rq, err := http.NewRequest(http.MethodGet, url, http.NoBody)
	if err != nil {
		return nil, err
	}
	rq.URL.RawQuery = query.Encode()
	resp, err := client.Do(rq) //nolint:bodyclose // closed inside cos.Close
	if err != nil {
		return nil, err
	}

	defer cos.Close(resp.Body)
	if resp.StatusCode == http.StatusNotFound {
		return nil, cos.NewErrNotFound(core.T, bck.Cname(objName))
	}
	if resp.StatusCode != http.StatusOK {
		return nil, cmn.NewErrFailedTo(core.T, "request ec md", bck.Cname(objName), err)
	}
	return MetaFromReader(resp.Body)
}

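// For illustration - a hypothetical caller fetching the EC metadata of an object
// from another target (`si`, `bck`, and the object name are placeholders):
//
//	md, err := RequestECMeta(bck, "obj-name", si, http.DefaultClient)
//	if err != nil {
//		return err // e.g., no valid metafile on that target
//	}
//	// md.Generation can now be compared with the locally stored metafile, as
//	// WriteSliceAndMeta and WriteReplicaAndMeta below do via LoadMetadata
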
// Saves the main replica to local drives
func writeObject(lom *core.LOM, reader io.Reader, size int64, xctn core.Xact) error {
	if size > 0 {
		reader = io.LimitReader(reader, size)
	}
	readCloser := io.NopCloser(reader)
	params := core.AllocPutParams()
	{
		params.WorkTag = "ec"
		params.Reader = readCloser
		params.SkipEC = true
		params.Atime = time.Now()
		params.Size = size
		params.Xact = xctn
		params.OWT = cmn.OwtRebalance
	}
	err := core.T.PutObject(lom, params)
	core.FreePutParams(params)
	return err
}

func validateBckBID(bck *cmn.Bck, bid uint64) error {
	if bid == 0 {
		return nil
	}
	newBck := meta.CloneBck(bck)
	err := newBck.Init(core.T.Bowner())
	if err == nil && newBck.Props.BID != bid {
		err = fmt.Errorf("bucket ID mismatch: local %d, sender %d", newBck.Props.BID, bid)
	}
	return err
}

// WriteSliceAndMeta saves a slice and its metafile
func WriteSliceAndMeta(hdr *transport.ObjHdr, args *WriteArgs) error {
	ct, err := core.NewCTFromBO(&hdr.Bck, hdr.ObjName, core.T.Bowner(), fs.ECSliceType)
	if err != nil {
		return err
	}
	ct.Lock(true)
	ctMeta := ct.Clone(fs.ECMetaType)
	defer func() {
		ct.Unlock(true)
		if err == nil {
			return
		}
		if rmErr := cos.RemoveFile(ct.FQN()); rmErr != nil {
			nlog.Errorf("nested error: save replica -> remove replica: %v", rmErr)
		}
		if rmErr := cos.RemoveFile(ctMeta.FQN()); rmErr != nil {
			nlog.Errorf("nested error: save replica -> remove metafile: %v", rmErr)
		}
	}()
	if args.Generation != 0 {
		if oldMeta, oldErr := LoadMetadata(ctMeta.FQN()); oldErr == nil && oldMeta.Generation > args.Generation {
			return nil
		}
	}
	tmpFQN := ct.Make(fs.WorkfileType)
	if err := ct.Write(args.Reader, hdr.ObjAttrs.Size, tmpFQN); err != nil {
		return err
	}
	if err := ctMeta.Write(bytes.NewReader(args.MD), -1); err != nil {
		return err
	}
	if _, exists := core.T.Bowner().Get().Get(ctMeta.Bck()); !exists {
		err = fmt.Errorf("slice-and-meta: %s metafile saved while bucket %s was being destroyed",
			ctMeta.ObjectName(), ctMeta.Bucket())
		return err
	}
	err = validateBckBID(&hdr.Bck, args.BID)
	return err
}

// WriteReplicaAndMeta saves a replica and its metafile
func WriteReplicaAndMeta(lom *core.LOM, args *WriteArgs) (err error) {
	lom.Lock(false)
	if args.Generation != 0 {
		ctMeta := core.NewCTFromLOM(lom, fs.ECMetaType)
		if oldMeta, oldErr := LoadMetadata(ctMeta.FQN()); oldErr == nil && oldMeta.Generation > args.Generation {
			lom.Unlock(false)
			return nil
		}
	}
	lom.Unlock(false)

	if err = writeObject(lom, args.Reader, lom.SizeBytes(true), args.Xact); err != nil {
		return
	}
	if !args.Cksum.IsEmpty() && args.Cksum.Value() != "" { // NOTE: empty value
		if !lom.EqCksum(args.Cksum) {
			err = cos.NewErrDataCksum(args.Cksum, lom.Checksum(), lom.Cname())
			return
		}
	}
	ctMeta := core.NewCTFromLOM(lom, fs.ECMetaType)
	ctMeta.Lock(true)

	defer func() {
		ctMeta.Unlock(true)
		if err == nil {
			return
		}
		if rmErr := cos.RemoveFile(lom.FQN); rmErr != nil {
			nlog.Errorf("nested error: save replica -> remove replica: %v", rmErr)
		}
		if rmErr := cos.RemoveFile(ctMeta.FQN()); rmErr != nil {
			nlog.Errorf("nested error: save replica -> remove metafile: %v", rmErr)
		}
	}()
	if err = ctMeta.Write(bytes.NewReader(args.MD), -1); err != nil {
		return
	}
	if _, exists := core.T.Bowner().Get().Get(ctMeta.Bck()); !exists {
		err = fmt.Errorf("replica-and-meta: %s metafile saved while bucket %s was being destroyed",
			ctMeta.ObjectName(), ctMeta.Bucket())
		return
	}
	err = validateBckBID(lom.Bucket(), args.BID)
	return
}