github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/xact/xs/archive.go

// Package xs is a collection of eXtended actions (xactions), including multi-object
// operations, list-objects, (cluster) rebalance and (target) resilver, ETL, and more.
/*
 * Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
 */
package xs

import (
	"archive/tar"
	"context"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/archive"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/transport"
	"github.com/NVIDIA/aistore/xact"
	"github.com/NVIDIA/aistore/xact/xreg"
)

// TODO:
// - enable multi-threaded list-range iter (see lrit.init)
// - one source multiple destination buckets (feature)

type (
	archFactory struct {
		streamingF
	}
	archwi struct { // archival work item; implements lrwi
		writer  archive.Writer
		r       *XactArch
		msg     *cmn.ArchiveBckMsg
		tsi     *meta.Snode
		archlom *core.LOM
		fqn     string   // workFQN --/--
		wfh     *os.File // --/--
		cksum   cos.CksumHashSize
		cnt     atomic.Int32 // num archived
		// tar only
		appendPos int64 // append to existing
		tarFormat tar.Format
		// finishing
		refc atomic.Int32
	}
	XactArch struct {
		streamingX
		workCh  chan *cmn.ArchiveBckMsg
		bckTo   *meta.Bck
		pending struct {
			m map[string]*archwi
			sync.RWMutex
		}
	}
)
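
// Overall flow, as implemented below: Begin() registers a work item (archwi) for the
// transaction; the target that the resulting shard hashes to (wi.tsi) also creates the
// workfile and a format-specific archive.Writer. Run() then iterates the requested
// list/range of objects - objects local to the owning target are appended directly,
// while other targets stream theirs over the shared data mover (doSend => recv).
// Once all senders are done, the owning target quiesces, finalizes the workfile, and
// promotes it to a regular object in the destination bucket (finalize => fini).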

// interface guard
var (
	_ core.Xact      = (*XactArch)(nil)
	_ xreg.Renewable = (*archFactory)(nil)
	_ lrwi           = (*archwi)(nil)
)

/////////////////
// archFactory //
/////////////////

func (*archFactory) New(args xreg.Args, bck *meta.Bck) xreg.Renewable {
	p := &archFactory{streamingF: streamingF{RenewBase: xreg.RenewBase{Args: args, Bck: bck}, kind: apc.ActArchive}}
	return p
}

func (p *archFactory) Start() (err error) {
	//
	// target-local generation of a global UUID
	//
	bckTo, ok := p.Args.Custom.(*meta.Bck)
	debug.Assertf(ok, "%+v", bckTo)
	if !ok || bckTo.IsEmpty() {
		bckTo = &meta.Bck{Name: "any"} // local usage to gen uuid, see r.bckTo below
	}
	p.Args.UUID, err = p.genBEID(p.Bck, bckTo)
	if err != nil {
		return err
	}
	//
	// new x-archive
	//
	workCh := make(chan *cmn.ArchiveBckMsg, maxNumInParallel)
	r := &XactArch{streamingX: streamingX{p: &p.streamingF, config: cmn.GCO.Get()}, workCh: workCh}
	r.pending.m = make(map[string]*archwi, maxNumInParallel)
	p.xctn = r
	r.DemandBase.Init(p.UUID() /*== p.Args.UUID above*/, p.kind, p.Bck /*from*/, xact.IdleDefault)

	if err := p.newDM(p.Args.UUID /*trname*/, r.recv, r.config, cmn.OwtPut, 0 /*pdu*/); err != nil {
		return err
	}
	if r.p.dm != nil {
		r.p.dm.SetXact(r)
		r.p.dm.Open()
	}
	xact.GoRunW(r)
	return
}
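
// Note: the data mover created above (p.newDM) registers a transport endpoint named
// after the xaction UUID, with r.recv as the receive callback; non-owning targets use
// it to stream their objects to the target that writes the shard (see doSend below).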

//////////////
// XactArch //
//////////////

func (r *XactArch) Begin(msg *cmn.ArchiveBckMsg, archlom *core.LOM) (err error) {
	if err = archlom.InitBck(&msg.ToBck); err != nil {
		r.AddErr(err, 4, cos.SmoduleXs)
		return err
	}
	debug.Assert(archlom.Cname() == msg.Cname()) // relying on it

	wi := &archwi{r: r, msg: msg, archlom: archlom, tarFormat: tar.FormatUnknown}
	wi.fqn = fs.CSM.Gen(wi.archlom, fs.WorkfileType, fs.WorkfileCreateArch)
	wi.cksum.Init(archlom.CksumType())

	// here and elsewhere: an extra check to make sure this target is active (ref: ignoreMaintenance)
	smap := core.T.Sowner().Get()
	if err = core.InMaintOrDecomm(smap, core.T.Snode(), r); err != nil {
		return
	}
	nat := smap.CountActiveTs()
	wi.refc.Store(int32(nat - 1))

	wi.tsi, err = smap.HrwName2T(msg.ToBck.MakeUname(msg.ArchName))
	if err != nil {
		r.AddErr(err, 4, cos.SmoduleXs)
		return
	}

	// create the workfile at BEGIN time
	if core.T.SID() == wi.tsi.ID() {
		var (
			s           string
			lmfh        *os.File
			finfo, errX = os.Stat(wi.archlom.FQN)
			exists      = errX == nil
		)
		if exists && wi.msg.AppendIfExists {
			s = " append"
			lmfh, err = wi.beginAppend()
		} else {
			wi.wfh, err = wi.archlom.CreateFile(wi.fqn)
		}
		if err != nil {
			return
		}
		if cmn.Rom.FastV(5, cos.SmoduleXs) {
			nlog.Infof("%s: begin%s %s", r.Base.Name(), s, msg.Cname())
		}

		// construct format-specific writer; serialize when multiple targets may be writing concurrently
		opts := archive.Opts{Serialize: nat > 1, TarFormat: wi.tarFormat}
		wi.writer = archive.NewWriter(msg.Mime, wi.wfh, &wi.cksum, &opts)

		// append case (above)
		if lmfh != nil {
			err = wi.writer.Copy(lmfh, finfo.Size())
			if err != nil {
				wi.writer.Fini()
				wi.cleanup()
				return
			}
		}
	}

	// most of the time there will be a single destination bucket for the lifetime of this xaction
	if r.bckTo == nil {
		if from := r.Bck().Bucket(); !from.Equal(&wi.msg.ToBck) {
			r.bckTo = meta.CloneBck(&wi.msg.ToBck)
		}
	}

	r.pending.Lock()
	r.pending.m[msg.TxnUUID] = wi
	r.wiCnt.Inc()
	r.pending.Unlock()
	return
}
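
// Note: Begin above runs on every target as the begin phase of the corresponding
// transaction (msg.TxnUUID); only the HRW-designated owner of the shard (wi.tsi)
// creates the workfile and the writer, while the remaining targets merely register
// the work item and will later transmit their objects to the owner.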

func (r *XactArch) Do(msg *cmn.ArchiveBckMsg) {
	r.IncPending()
	r.workCh <- msg
}

func (r *XactArch) Run(wg *sync.WaitGroup) {
	var err error
	nlog.Infoln(r.Name())
	wg.Done()
	for {
		select {
		case msg := <-r.workCh:
			r.pending.RLock()
			wi, ok := r.pending.m[msg.TxnUUID]
			r.pending.RUnlock()
			if !ok {
				debug.Assert(r.ErrCnt() > 0) // see cleanup
				goto fin
			}
			var (
				smap = core.T.Sowner().Get()
				lrit = &lriterator{}
			)
			err = lrit.init(r, &msg.ListRange, r.Bck(), true /*TODO: remove blocking*/)
			if err != nil {
				r.Abort(err)
				goto fin
			}
			err = lrit.run(wi, smap)
			if err != nil {
				r.AddErr(err)
			}
			lrit.wait()
			if r.Err() != nil {
				wi.cleanup()
				goto fin
			}
			if core.T.SID() == wi.tsi.ID() {
				go r.finalize(wi) // async finalize this shard
			} else {
				r.sendTerm(wi.msg.TxnUUID, wi.tsi, nil)
				r.pending.Lock()
				delete(r.pending.m, msg.TxnUUID)
				r.wiCnt.Dec()
				r.pending.Unlock()
				r.DecPending()

				core.FreeLOM(wi.archlom)
			}
		case <-r.IdleTimer():
			goto fin
		case <-r.ChanAbort():
			goto fin
		}
	}
fin:
	r.streamingX.fin(true /*unreg Rx*/)
	if r.Err() == nil {
		return
	}

	// [cleanup] close and rm unfinished archives (compare w/ finalize)
	r.pending.Lock()
	for _, wi := range r.pending.m {
		wi.cleanup()
	}
	clear(r.pending.m)
	r.pending.Unlock()
}
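
// Note: a single Run loop above services all pending work items; the xaction exits
// when it idles out (IdleTimer), gets aborted, or hits an error - in the error case
// any unfinished workfiles are closed and removed (see wi.cleanup).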

func (r *XactArch) doSend(lom *core.LOM, wi *archwi, fh cos.ReadOpenCloser) {
	debug.Assert(r.p.dm != nil)
	o := transport.AllocSend()
	hdr := &o.Hdr
	{
		hdr.Bck = wi.msg.ToBck
		hdr.ObjName = lom.ObjName
		hdr.ObjAttrs.CopyFrom(lom.ObjAttrs(), false /*skip cksum*/)
		hdr.Opaque = []byte(wi.msg.TxnUUID)
	}
	// o.Callback nil on purpose (lom is freed by the iterator)
	r.p.dm.Send(o, fh, wi.tsi)
}
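
// Note: hdr.Opaque carries the transaction UUID so that the receiving target can
// associate the incoming object with its own pending work item (see _recv below).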

func (r *XactArch) recv(hdr *transport.ObjHdr, objReader io.Reader, err error) error {
	if err != nil && !cos.IsEOF(err) {
		r.AddErr(err, 5, cos.SmoduleXs)
		return err
	}

	r.IncPending()
	err = r._recv(hdr, objReader)
	r.DecPending()
	transport.DrainAndFreeReader(objReader)
	return err
}

func (r *XactArch) _recv(hdr *transport.ObjHdr, objReader io.Reader) error {
	r.pending.RLock()
	wi, ok := r.pending.m[cos.UnsafeS(hdr.Opaque)] // txnUUID
	r.pending.RUnlock()
	if !ok {
		if r.Finished() || r.IsAborted() {
			return nil
		}
		cnt, err := r.JoinErr()
		debug.Assert(cnt > 0) // see cleanup
		return err
	}
	debug.Assert(wi.tsi.ID() == core.T.SID() && wi.msg.TxnUUID == cos.UnsafeS(hdr.Opaque))

	// NOTE: best-effort via ref-counting
	if hdr.Opcode == opcodeDone {
		refc := wi.refc.Dec()
		debug.Assert(refc >= 0)
		return nil
	}

	debug.Assert(hdr.Opcode == 0)
	err := wi.writer.Write(wi.nameInArch(hdr.ObjName), &hdr.ObjAttrs, objReader)
	if err == nil {
		wi.cnt.Inc()
	} else {
		r.AddErr(err, 5, cos.SmoduleXs)
	}
	return nil
}
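
// Note: an opcodeDone message indicates that one of the senders has finished
// transmitting its objects for this work item; the receiver decrements the reference
// count initialized in Begin (number of active targets minus one), which quiesce()
// polls before the shard gets finalized.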

// NOTE: runs in a separate goroutine (see Run above)
func (r *XactArch) finalize(wi *archwi) {
	q := wi.quiesce()
	if q == core.QuiTimeout {
		err := fmt.Errorf("%s: %v", r, cmn.ErrQuiesceTimeout)
		r.AddErr(err, 4, cos.SmoduleXs)
	}

	r.pending.Lock()
	delete(r.pending.m, wi.msg.TxnUUID)
	r.wiCnt.Dec()
	r.pending.Unlock()

	ecode, err := r.fini(wi)
	r.DecPending()
	if cmn.Rom.FastV(5, cos.SmoduleXs) {
		var s string
		if err != nil {
			s = fmt.Sprintf(": %v(%d)", err, ecode)
		}
		nlog.Infof("%s: finalize %s%s", r.Base.Name(), wi.msg.Cname(), s)
	}
	if err == nil || r.IsAborted() { // done ok (unless aborted)
		return
	}
	debug.Assert(q != core.QuiAborted)

	wi.cleanup()
	r.AddErr(err, 5, cos.SmoduleXs)
}

func (r *XactArch) fini(wi *archwi) (ecode int, err error) {
	wi.writer.Fini()

	if r.IsAborted() {
		wi.cleanup()
		core.FreeLOM(wi.archlom)
		return
	}

	var size int64
	if wi.cnt.Load() == 0 {
		s := "empty"
		if wi.appendPos > 0 {
			s = "no new appends to"
		}
		if cnt, errs := r.JoinErr(); cnt > 0 {
			err = fmt.Errorf("%s: %s %s, err: %v (cnt=%d)", r, s, wi.archlom, errs, cnt)
		} else {
			err = fmt.Errorf("%s: %s %s", r, s, wi.archlom)
		}
	} else {
		size, err = wi.finalize()
	}
	if err != nil {
		wi.cleanup()
		core.FreeLOM(wi.archlom)
		ecode = http.StatusInternalServerError
		return
	}

	wi.archlom.SetSize(size)
	cos.Close(wi.wfh)
	wi.wfh = nil

	ecode, err = core.T.FinalizeObj(wi.archlom, wi.fqn, r, cmn.OwtArchive)
	core.FreeLOM(wi.archlom)
	r.ObjsAdd(1, size-wi.appendPos)
	return
}
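
// Note: fini above closes out the shard: the workfile gets its final size and checksum
// (checksumming is skipped when appending - see archwi.finalize) and is then promoted
// to a regular object in the destination bucket via FinalizeObj (OwtArchive); a shard
// with zero successfully archived objects is treated as an error and removed.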

func (r *XactArch) Name() (s string) {
	s = r.streamingX.Name()
	if src, dst := r.FromTo(); src != nil {
		s += " => " + dst.String()
	}
	return
}

func (r *XactArch) String() (s string) {
	s = r.streamingX.String() + " => "
	if r.wiCnt.Load() > 0 && r.bckTo != nil {
		s += r.bckTo.String()
	}
	return
}

func (r *XactArch) FromTo() (src, dst *meta.Bck) {
	if r.bckTo != nil {
		src, dst = r.Bck(), r.bckTo
	}
	return
}

func (r *XactArch) Snap() (snap *core.Snap) {
	snap = &core.Snap{}
	r.ToSnap(snap)

	snap.IdleX = r.IsIdle()
	if f, t := r.FromTo(); f != nil {
		snap.SrcBck, snap.DstBck = f.Clone(), t.Clone()
	}
	return
}

////////////
// archwi //
////////////

func (wi *archwi) beginAppend() (lmfh *os.File, err error) {
	msg := wi.msg
	if msg.Mime == archive.ExtTar {
		if err = wi.openTarForAppend(); err == nil || err != archive.ErrTarIsEmpty {
			return
		}
	}
	// msg.Mime has already been validated (see ais/* for apc.ActArchive)
	// prep to copy `lmfh` --> `wi.wfh` with subsequent APPEND-ing
	lmfh, err = wi.archlom.OpenFile()
	if err != nil {
		return
	}
	if wi.wfh, err = wi.archlom.CreateFile(wi.fqn); err != nil {
		cos.Close(lmfh)
		lmfh = nil
	}
	return
}

func (wi *archwi) openTarForAppend() (err error) {
	if err = os.Rename(wi.archlom.FQN, wi.fqn); err != nil {
		return
	}
	// open (rw) lom itself
	wi.wfh, wi.tarFormat, err = archive.OpenTarSeekEnd(wi.archlom.ObjName, wi.fqn)
	if err != nil {
		goto roll
	}
	wi.appendPos, err = wi.wfh.Seek(0, io.SeekCurrent)
	if err == nil {
		return // can append
	}
	wi.appendPos, wi.tarFormat = 0, tar.FormatUnknown // reset
	cos.Close(wi.wfh)
	wi.wfh = nil
roll:
	if errV := wi.archlom.RenameFrom(wi.fqn); errV != nil {
		nlog.Errorf("%s: nested error: failed to append %s (%v) and rename back from %s (%v)",
			wi.tsi, wi.archlom, err, wi.fqn, errV)
	} else {
		wi.fqn = ""
	}
	return
}
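
// Note: TAR append avoids copying - the existing shard is renamed into the workfile and
// reopened with the write offset positioned right past its last entry (OpenTarSeekEnd);
// appendPos records that offset so that stats account only for the newly added portion
// (see fini). On failure the shard is renamed back ("roll") and the error is returned.
// Other formats, as well as the empty-TAR case, take the beginAppend path that copies
// the existing archive into a new workfile prior to appending.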

// multi-object iterator i/f: "handle work item"
func (wi *archwi) do(lom *core.LOM, lrit *lriterator) {
	var coldGet bool
	if err := lom.Load(false /*cache it*/, false /*locked*/); err != nil {
		if !cos.IsNotExist(err, 0) {
			wi.r.AddErr(err, 5, cos.SmoduleXs)
			return
		}
		if coldGet = lom.Bck().IsRemote(); !coldGet {
			if lrit.lrp == lrpList {
				// listed, not found
				wi.r.AddErr(err, 5, cos.SmoduleXs)
			}
			return
		}
	}

	if coldGet {
		// cold
		if ecode, err := core.T.GetCold(context.Background(), lom, cmn.OwtGetLock); err != nil {
			if lrit.lrp != lrpList && cos.IsNotExist(err, ecode) {
				return // range or prefix, not found
			}
			wi.r.AddErr(err, 5, cos.SmoduleXs)
			return
		}
	}

	fh, err := cos.NewFileHandle(lom.FQN)
	if err != nil {
		wi.r.AddErr(err, 5, cos.SmoduleXs)
		return
	}
	if core.T.SID() != wi.tsi.ID() {
		wi.r.doSend(lom, wi, fh)
		return
	}
	debug.Assert(wi.wfh != nil) // see Begin
	err = wi.writer.Write(wi.nameInArch(lom.ObjName), lom, fh /*reader*/)
	cos.Close(fh)
	if err == nil {
		wi.cnt.Inc()
	} else {
		wi.r.AddErr(err, 5, cos.SmoduleXs)
	}
}
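
// Note: do() above is invoked by the list-range iterator once per source object;
// objects missing locally from remote buckets are cold-GET-ed first, after which the
// object is either written straight into the local workfile (when this target owns
// the shard) or transmitted to the owning target over the data mover.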

func (wi *archwi) quiesce() core.QuiRes {
	timeout := cmn.Rom.CplaneOperation()
	return wi.r.Quiesce(timeout, func(total time.Duration) core.QuiRes {
		if wi.refc.Load() == 0 && wi.r.wiCnt.Load() == 1 /*the last wi (so far) about to `fini`*/ {
			return core.QuiDone
		}
		return xact.RefcntQuiCB(&wi.refc, wi.r.config.Timeout.SendFile.D()/2, total)
	})
}

func (wi *archwi) nameInArch(objName string) string {
	if !wi.msg.InclSrcBname {
		return objName
	}
	buf := make([]byte, 0, len(wi.msg.FromBckName)+1+len(objName))
	buf = append(buf, wi.msg.FromBckName...)
	buf = append(buf, filepath.Separator)
	buf = append(buf, objName...)
	return cos.UnsafeS(buf)
}
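
// For example, with InclSrcBname set, object "dir/obj1" from source bucket "src-bck"
// is stored in the resulting archive as "src-bck/dir/obj1"; otherwise, the object name
// is used as is.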

func (wi *archwi) cleanup() {
	if wi.wfh != nil {
		cos.Close(wi.wfh)
		wi.wfh = nil
	}
	if wi.fqn != "" {
		if wi.archlom == nil || wi.archlom.FQN != wi.fqn {
			cos.RemoveFile(wi.fqn)
		}
		wi.fqn = ""
	}
}

func (wi *archwi) finalize() (int64, error) {
	if wi.appendPos > 0 {
		size, err := wi.wfh.Seek(0, io.SeekCurrent)
		if err != nil {
			return 0, err
		}
		debug.Assertf(size > wi.appendPos, "%d vs %d", size, wi.appendPos)
		// whole-archive checksum is traded off when appending (not recomputed)
		wi.archlom.SetCksum(cos.NewCksum(cos.ChecksumNone, ""))
		return size, nil
	}
	wi.cksum.Finalize()
	wi.archlom.SetCksum(&wi.cksum.Cksum)
	return wi.cksum.Size, nil
}