github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ec/xaction.go (about)

     1  // Package ec provides erasure coding (EC) based data protection for AIStore.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package ec
     6  
     7  import (
     8  	"fmt"
     9  	"io"
    10  	"os"
    11  	"sync"
    12  
    13  	"github.com/NVIDIA/aistore/cmn"
    14  	"github.com/NVIDIA/aistore/cmn/atomic"
    15  	"github.com/NVIDIA/aistore/cmn/cos"
    16  	"github.com/NVIDIA/aistore/cmn/debug"
    17  	"github.com/NVIDIA/aistore/cmn/nlog"
    18  	"github.com/NVIDIA/aistore/core"
    19  	"github.com/NVIDIA/aistore/core/meta"
    20  	"github.com/NVIDIA/aistore/transport"
    21  	"github.com/NVIDIA/aistore/xact"
    22  )
    23  
// Buffer sizes for EC request queues.
// NOTE(review): neither constant is referenced in this part of the file;
// presumably they size request channels of the EC sub-xactions - confirm at the use sites.
const (
	requestBufSizeFS     = 70 // presumably: capacity for filesystem-bound (per-mountpath) requests - TODO confirm
	requestBufSizeEncode = 16 // presumably: capacity for encode requests - TODO confirm
)
    28  
type (
	// xactECBase is the state shared by EC xactions: it embeds xact.DemandBase
	// and carries the configuration, statistics, owning bucket, the registry of
	// writers waiting for remote data, and a reference to the EC manager.
	xactECBase struct {
		xact.DemandBase
		config *cmn.Config // configuration passed to init (e.g. used for the read-remote timeout)
		stats  stats       // EC statistics
		bck    cmn.Bck     // which bucket xctn belongs to

		dOwner *dataOwner // registry of slices (writers) waiting for data from remote targets
		mgr    *Manager   // EC manager; provides the request/response streams used for sending
	}

	// xactReqBase holds the control channels and the accept/reject flag
	// for incoming EC requests.
	xactReqBase struct {
		mpathReqCh chan mpathReq // notify about mountpath changes
		controlCh  chan RequestsControlMsg // control messages (see ClearRequests and EnableRequests)

		rejectReq atomic.Bool // marker if EC requests should be rejected
	}

	// mpathReq describes a mountpath change delivered via xactReqBase.mpathReqCh.
	mpathReq struct {
		action string // what happened to the mountpath
		mpath  string // the mountpath in question
	}

	// dataOwner manages the slices (SGLs or other writers) that are waiting
	// for data from a remote target.
	dataOwner struct {
		mtx    sync.Mutex        // protects slices
		slices map[string]*slice // registered writers keyed by unique operation name (uname)
	}
)
    58  
    59  func (r *xactECBase) init(config *cmn.Config, bck *cmn.Bck, mgr *Manager) {
    60  	r.stats = stats{bck: *bck}
    61  	r.config = config
    62  	r.bck = *bck
    63  	r.dOwner = &dataOwner{slices: make(map[string]*slice, 10)}
    64  	r.mgr = mgr
    65  }
    66  
    67  /////////////////
    68  // xactReqBase //
    69  /////////////////
    70  
    71  func (r *xactReqBase) init() {
    72  	r.mpathReqCh = make(chan mpathReq, 1)
    73  	r.controlCh = make(chan RequestsControlMsg, 8)
    74  }
    75  
    76  // ClearRequests disables receiving new EC requests, they will be terminated with error
    77  // Then it starts draining a channel from pending EC requests
    78  // It does not enable receiving new EC requests, it has to be done explicitly, when EC is enabled again
    79  func (r *xactReqBase) ClearRequests() {
    80  	msg := RequestsControlMsg{
    81  		Action: ActClearRequests,
    82  	}
    83  
    84  	r.controlCh <- msg
    85  }
    86  
    87  func (r *xactReqBase) EnableRequests() {
    88  	msg := RequestsControlMsg{
    89  		Action: ActEnableRequests,
    90  	}
    91  
    92  	r.controlCh <- msg
    93  }
    94  
    95  func (r *xactReqBase) setEcRequestsDisabled() {
    96  	r.rejectReq.Store(true)
    97  }
    98  
    99  func (r *xactReqBase) setEcRequestsEnabled() {
   100  	r.rejectReq.Store(false)
   101  }
   102  
   103  func (r *xactReqBase) ecRequestsEnabled() bool {
   104  	return !r.rejectReq.Load()
   105  }
   106  
   107  ////////////////
   108  // xactECBase //
   109  ////////////////
   110  
   111  func newSliceResponse(md *Metadata, attrs *cmn.ObjAttrs, fqn string) (reader cos.ReadOpenCloser, err error) {
   112  	attrs.Ver = md.ObjVersion
   113  	attrs.Cksum = cos.NewCksum(md.CksumType, md.CksumValue)
   114  
   115  	stat, err := os.Stat(fqn)
   116  	if err != nil {
   117  		return nil, err
   118  	}
   119  	attrs.Size = stat.Size()
   120  	reader, err = cos.NewFileHandle(fqn)
   121  	if err != nil {
   122  		nlog.Warningf("Failed to read file stats: %s", err)
   123  		return nil, err
   124  	}
   125  	return reader, nil
   126  }
   127  
   128  // replica/full object request
   129  func newReplicaResponse(attrs *cmn.ObjAttrs, bck *meta.Bck, objName string) (reader cos.ReadOpenCloser, err error) {
   130  	lom := core.AllocLOM(objName)
   131  	defer core.FreeLOM(lom)
   132  	if err = lom.InitBck(bck.Bucket()); err != nil {
   133  		return nil, err
   134  	}
   135  	if err = lom.Load(true /*cache it*/, false /*locked*/); err != nil {
   136  		nlog.Warningln(err)
   137  		return nil, err
   138  	}
   139  	reader, err = cos.NewFileHandle(lom.FQN)
   140  	if err != nil {
   141  		return nil, err
   142  	}
   143  	if lom.SizeBytes() == 0 {
   144  		return nil, nil
   145  	}
   146  	attrs.Size = lom.SizeBytes()
   147  	attrs.Ver = lom.Version()
   148  	attrs.Atime = lom.AtimeUnix()
   149  	attrs.Cksum = lom.Checksum()
   150  	return reader, nil
   151  }
   152  
   153  // Sends the replica/meta/slice data: either to copy replicas/slices after
   154  // encoding or to send requested "object" to a client. In the latter case
   155  // if the local object does not exist, it sends an empty body and sets
   156  // exists=false in response header
   157  func (r *xactECBase) dataResponse(act intraReqType, hdr *transport.ObjHdr, fqn string, bck *meta.Bck, objName string,
   158  	md *Metadata) (err error) {
   159  	var (
   160  		reader   cos.ReadOpenCloser
   161  		objAttrs cmn.ObjAttrs
   162  	)
   163  	ireq := newIntraReq(act, nil, bck)
   164  	if md != nil && md.SliceID != 0 {
   165  		// slice request
   166  		reader, err = newSliceResponse(md, &objAttrs, fqn)
   167  		ireq.exists = err == nil
   168  	} else {
   169  		// replica/full object request
   170  		reader, err = newReplicaResponse(&objAttrs, bck, objName)
   171  		ireq.exists = err == nil
   172  	}
   173  	debug.Assert((objAttrs.Size == 0 && reader == nil) || (objAttrs.Size != 0 && reader != nil))
   174  
   175  	rHdr := transport.ObjHdr{ObjName: objName, ObjAttrs: objAttrs, Opcode: act}
   176  	rHdr.Bck.Copy(bck.Bucket())
   177  	rHdr.Opaque = ireq.NewPack(g.smm)
   178  
   179  	o := transport.AllocSend()
   180  	o.Hdr, o.Callback = rHdr, r.sendCb
   181  
   182  	r.ObjsAdd(1, objAttrs.Size)
   183  	r.IncPending()
   184  	return r.sendByDaemonID([]string{hdr.SID}, o, reader, false)
   185  }
   186  
   187  func (r *xactECBase) sendCb(hdr *transport.ObjHdr, _ io.ReadCloser, _ any, err error) {
   188  	g.smm.Free(hdr.Opaque)
   189  	if err != nil {
   190  		err = cmn.NewErrFailedTo(core.T, "ec-send", hdr.Cname(), err)
   191  		r.AddErr(err, 0)
   192  	}
   193  	r.DecPending()
   194  }
   195  
   196  // Send a data or request to one or few targets by their DaemonIDs. Most of the time
   197  // only DaemonID is known - that is why the function gets DaemonID and internally
   198  // transforms it into meta.Snode.
   199  // * daemonIDs - a list of targets
   200  // * hdr - transport header
   201  // * reader - a data to send
   202  // * cb - optional callback to be called when the transfer completes
   203  // * isRequest - defines the type of request:
   204  //   - true - send lightweight request to all targets (usually reader is nil
   205  //     in this case)
   206  //   - false - send a slice/replica/metadata to targets
   207  func (r *xactECBase) sendByDaemonID(daemonIDs []string, o *transport.Obj, reader cos.ReadOpenCloser, isRequest bool) error {
   208  	var (
   209  		err   error
   210  		nodes = meta.AllocNodes(len(daemonIDs))
   211  		smap  = core.T.Sowner().Get()
   212  	)
   213  	for _, id := range daemonIDs {
   214  		si, ok := smap.Tmap[id]
   215  		if !ok {
   216  			nlog.Errorf("t[%s] not found", id)
   217  			continue
   218  		}
   219  		nodes = append(nodes, si)
   220  	}
   221  	if isRequest {
   222  		err = r.mgr.req().Send(o, reader, nodes...)
   223  	} else {
   224  		err = r.mgr.resp().Send(o, reader, nodes...)
   225  	}
   226  	meta.FreeNodes(nodes)
   227  	return err
   228  }
   229  
   230  // send request to a target, wait for its response, read the data into writer.
   231  //   - daemonID - target to send a request
   232  //   - bucket/objName - what to request
   233  //   - uname - unique name for the operation: the name is built from daemonID,
   234  //     bucket and object names. HTTP data receiving handler generates a name
   235  //     when receiving data and if it finds a writer registered with the same
   236  //     name, it puts the data to its writer and notifies when download is done
   237  //   - request - request to send
   238  //   - writer - an opened writer that will receive the replica/slice/meta
   239  func (r *xactECBase) readRemote(lom *core.LOM, daemonID, uname string, request []byte, writer io.Writer) (int64, error) {
   240  	hdr := transport.ObjHdr{ObjName: lom.ObjName, Opaque: request, Opcode: reqGet}
   241  	hdr.Bck.Copy(lom.Bucket())
   242  
   243  	o := transport.AllocSend()
   244  	o.Hdr = hdr
   245  
   246  	sw := &slice{writer: writer, twg: cos.NewTimeoutGroup(), lom: lom}
   247  	sw.twg.Add(1)
   248  	r.regWriter(uname, sw)
   249  
   250  	if cmn.Rom.FastV(4, cos.SmoduleEC) {
   251  		nlog.Infof("Requesting object %s from %s", lom, daemonID)
   252  	}
   253  	if err := r.sendByDaemonID([]string{daemonID}, o, nil, true); err != nil {
   254  		r.unregWriter(uname)
   255  		r.AddErr(err)
   256  		return 0, err
   257  	}
   258  	if sw.twg.WaitTimeout(r.config.Timeout.SendFile.D()) {
   259  		r.unregWriter(uname)
   260  		err := fmt.Errorf("read-remote(%s): timeout %v", uname, r.config.Timeout.SendFile.D())
   261  		r.AddErr(err)
   262  		return 0, err
   263  	}
   264  	r.unregWriter(uname)
   265  
   266  	if cmn.Rom.FastV(4, cos.SmoduleEC) {
   267  		nlog.Infof("Received object %s from %s", lom, daemonID)
   268  	}
   269  	if sw.version != "" {
   270  		lom.SetVersion(sw.version)
   271  	}
   272  	lom.SetCksum(sw.cksum)
   273  	lom.Uncache()
   274  	return sw.n, nil
   275  }
   276  
   277  // Registers a new slice that will wait for the data to come from
   278  // a remote target
   279  func (r *xactECBase) regWriter(uname string, writer *slice) bool {
   280  	r.dOwner.mtx.Lock()
   281  	_, ok := r.dOwner.slices[uname]
   282  	if ok {
   283  		nlog.Errorf("Writer for %s is already registered", uname)
   284  	} else {
   285  		r.dOwner.slices[uname] = writer
   286  	}
   287  	r.dOwner.mtx.Unlock()
   288  
   289  	return !ok
   290  }
   291  
   292  // Unregisters a slice that has been waiting for the data to come from
   293  // a remote target
   294  func (r *xactECBase) unregWriter(uname string) {
   295  	r.dOwner.mtx.Lock()
   296  	delete(r.dOwner.slices, uname)
   297  	r.dOwner.mtx.Unlock()
   298  }
   299  
   300  // Used to copy replicas/slices after the object is encoded after PUT/restored
   301  // after GET, or to respond to meta/slice/replica request.
   302  //   - daemonIDs - receivers of the data
   303  //   - bucket/objName - object path
   304  //   - reader - object/slice/meta data
   305  //   - src - extra information about the data to send
   306  //   - cb - a caller may set its own callback to execute when the transfer is done.
   307  //     A special case:
   308  //     if a caller does not define its own callback, and it sets the `obj` in
   309  //     `src` it means that the caller wants to automatically free the memory
   310  //     allocated for the `obj` SGL after the object is transferred. The caller
   311  //     may set optional counter in `obj` - the default callback decreases the
   312  //     counter each time the callback is called and when the value drops below 1,
   313  //     `writeRemote` callback frees the SGL
   314  //     The counter is used for sending slices of one big SGL to a few nodes. In
   315  //     this case every slice must be sent to only one target, and transport bundle
   316  //     cannot help to track automatically when SGL should be freed.
   317  func (r *xactECBase) writeRemote(daemonIDs []string, lom *core.LOM, src *dataSource, cb transport.ObjSentCB) error {
   318  	if src.metadata != nil && src.metadata.ObjVersion == "" {
   319  		src.metadata.ObjVersion = lom.Version()
   320  	}
   321  	req := newIntraReq(src.reqType, src.metadata, lom.Bck())
   322  	req.isSlice = src.isSlice
   323  
   324  	putData := req.NewPack(g.smm)
   325  	objAttrs := cmn.ObjAttrs{
   326  		Size:  src.size,
   327  		Ver:   lom.Version(),
   328  		Atime: lom.AtimeUnix(),
   329  	}
   330  	if src.metadata != nil && src.metadata.SliceID != 0 {
   331  		// for a slice read everything from slice's metadata
   332  		if src.metadata.ObjVersion != "" {
   333  			objAttrs.Ver = src.metadata.ObjVersion
   334  		}
   335  		objAttrs.Cksum = cos.NewCksum(src.metadata.CksumType, src.metadata.CksumValue)
   336  	} else {
   337  		objAttrs.Cksum = lom.Checksum()
   338  	}
   339  	hdr := transport.ObjHdr{
   340  		ObjName:  lom.ObjName,
   341  		ObjAttrs: objAttrs,
   342  		Opaque:   putData,
   343  		Opcode:   src.reqType,
   344  	}
   345  	hdr.Bck.Copy(lom.Bucket())
   346  	oldCallback := cb
   347  	cb = func(hdr *transport.ObjHdr, reader io.ReadCloser, arg any, err error) {
   348  		g.smm.Free(hdr.Opaque)
   349  		if oldCallback != nil {
   350  			oldCallback(hdr, reader, arg, err)
   351  		}
   352  		r.DecPending()
   353  	}
   354  
   355  	o := transport.AllocSend()
   356  	o.Hdr, o.Callback = hdr, cb
   357  
   358  	r.IncPending()
   359  	return r.sendByDaemonID(daemonIDs, o, src.reader, false)
   360  }
   361  
   362  // Save data from a target response to SGL or file. When exists is false it
   363  // just drains the response body and returns - because it does not contain
   364  // any data. On completion the function must call writer.wg.Done to notify
   365  // the caller that the data read is completed.
   366  // * writer - where to save the slice/meta/replica data
   367  // * exists - if the remote target had the requested object
   368  // * reader - response body
   369  func _writerReceive(writer *slice, exists bool, objAttrs cmn.ObjAttrs, reader io.Reader) (err error) {
   370  	if !exists {
   371  		writer.twg.Done()
   372  		return ErrorNotFound
   373  	}
   374  
   375  	buf, slab := g.pmm.Alloc()
   376  	writer.n, err = io.CopyBuffer(writer.writer, reader, buf)
   377  	writer.cksum = objAttrs.Cksum
   378  	if writer.version == "" && objAttrs.Ver != "" {
   379  		writer.version = objAttrs.Ver
   380  	}
   381  
   382  	writer.twg.Done()
   383  	slab.Free(buf)
   384  	return err
   385  }
   386  
   387  func (r *xactECBase) ECStats() *Stats { return r.stats.stats() }
   388  
   389  func (r *xactECBase) baseSnap() (snap *core.Snap) {
   390  	snap = &core.Snap{}
   391  	r.ToSnap(snap)
   392  
   393  	snap.IdleX = r.IsIdle()
   394  	return
   395  }