github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dload/job.go (about)

     1  // Package dload implements functionality to download resources into AIS cluster from external source.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package dload
     6  
     7  import (
     8  	"errors"
     9  	"fmt"
    10  	"path"
    11  	"strings"
    12  	"time"
    13  
    14  	"github.com/NVIDIA/aistore/api/apc"
    15  	"github.com/NVIDIA/aistore/cmn"
    16  	"github.com/NVIDIA/aistore/cmn/atomic"
    17  	"github.com/NVIDIA/aistore/cmn/cos"
    18  	"github.com/NVIDIA/aistore/cmn/debug"
    19  	"github.com/NVIDIA/aistore/cmn/nlog"
    20  	"github.com/NVIDIA/aistore/core"
    21  	"github.com/NVIDIA/aistore/core/meta"
    22  	"github.com/NVIDIA/aistore/nl"
    23  )
    24  
const (
	// downloadBatchSize is the target number of objects in a single batch
	// produced by `genNext`. Slice and range jobs never return more than this
	// many objects per batch; the backend job appends whole listing pages and
	// may therefore exceed it.
	downloadBatchSize = 10_000
)
    29  
// interface guard: compile-time checks that the job types implement `jobif`
var (
	_ jobif = (*sliceDlJob)(nil)
	_ jobif = (*backendDlJob)(nil)
	_ jobif = (*rangeDlJob)(nil)
)
    36  
type (
	// dlObj describes a single object to be downloaded.
	dlObj struct {
		// name of the destination object
		objName string
		// source link; the backend job passes an empty link when building objects
		link string
		// NOTE(review): presumably set for objects that come from a remote
		// backend - confirm against makeDlObj
		fromRemote bool
	}

	// jobif is the interface implemented by every download job type.
	jobif interface {
		ID() string
		XactID() string
		Bck() *cmn.Bck
		Description() string
		Timeout() time.Duration
		ActiveStats() (*StatusResp, error)
		String() string
		Notif() core.Notif // notifications
		AddNotif(n core.Notif, job jobif)

		// If total length (size) of download job is not known, -1 should be returned.
		Len() int

		// Determines if it requires also syncing.
		Sync() bool

		// Checks if object name matches the request.
		checkObj(objName string) bool

		// genNext is supposed to fulfill the following protocol:
		//  `ok` is set to `true` if there is batch to process, `false` otherwise
		genNext() (objs []dlObj, ok bool, err error)

		// via tryAcquire and release
		throttler() *throttler

		// job cleanup
		cleanup()
	}

	// baseDlJob holds the state shared by all job types (populated by `init`)
	// and provides the default implementation of most `jobif` methods.
	baseDlJob struct {
		bck         *meta.Bck
		notif       *NotifDownload
		xdl         *Xact
		id          string
		description string
		timeout     time.Duration
		throt       throttler
	}

	// sliceDlJob is a job over a fixed, fully known list of objects;
	// `current` is the index of the first object not yet returned by `genNext`.
	sliceDlJob struct {
		baseDlJob
		objs    []dlObj
		current int
	}
	// multiDlJob is a sliceDlJob built from a MultiBody payload.
	multiDlJob struct {
		sliceDlJob
	}
	// singleDlJob is a sliceDlJob built from a SingleBody payload.
	singleDlJob struct {
		sliceDlJob
	}

	// rangeDlJob generates its objects, batch by batch, from a range template.
	rangeDlJob struct {
		baseDlJob
		objs  []dlObj            // objects' metas which are ready to be downloaded
		pt    cos.ParsedTemplate // range template
		dir   string             // objects directory(prefix) from request
		count int                // total number object to download by a target
		done  bool               // true when iterator is finished, nothing left to read
	}

	// backendDlJob downloads objects of a remote bucket: it lists the bucket
	// page by page and keeps the names accepted by `checkObj` (prefix and suffix).
	backendDlJob struct {
		baseDlJob
		prefix            string
		suffix            string
		continuationToken string  // token returned by the most recent listing page
		objs              []dlObj // objects' metas which are ready to be downloaded
		sync              bool    // value returned by Sync()
		done              bool    // true once a listing page came back with an empty continuation token
	}

	// dljob is the job's bookkeeping record (counters, timestamps, and flags);
	// `clone` converts it into a `Job`.
	dljob struct {
		id            string
		xid           string
		description   string
		startedTime   time.Time
		finishedTime  atomic.Time
		finishedCnt   atomic.Int32
		scheduledCnt  atomic.Int32
		skippedCnt    atomic.Int32
		errorCnt      atomic.Int32
		total         int
		aborted       atomic.Bool
		allDispatched atomic.Bool
	}
)
   131  
   132  ///////////////
   133  // baseDlJob //
   134  ///////////////
   135  
   136  func (j *baseDlJob) init(id string, bck *meta.Bck, timeout, desc string, limits Limits, xdl *Xact) {
   137  	// TODO: this might be inaccurate if we download 1 or 2 objects because then
   138  	//  other targets will have limits but will not use them.
   139  	if limits.BytesPerHour > 0 {
   140  		limits.BytesPerHour /= core.T.Sowner().Get().CountActiveTs()
   141  	}
   142  	td, _ := time.ParseDuration(timeout)
   143  	{
   144  		j.id = id
   145  		j.bck = bck
   146  		j.timeout = td
   147  		j.description = desc
   148  		j.throt.init(limits)
   149  		j.xdl = xdl
   150  	}
   151  }
   152  
   153  func (j *baseDlJob) ID() string             { return j.id }
   154  func (j *baseDlJob) XactID() string         { return j.xdl.ID() }
   155  func (j *baseDlJob) Bck() *cmn.Bck          { return j.bck.Bucket() }
   156  func (j *baseDlJob) Timeout() time.Duration { return j.timeout }
   157  func (j *baseDlJob) Description() string    { return j.description }
   158  func (*baseDlJob) Sync() bool               { return false }
   159  
   160  func (j *baseDlJob) String() (s string) {
   161  	s = fmt.Sprintf("dl-job[%s]-%s", j.ID(), j.Bck())
   162  	if j.Description() == "" {
   163  		return
   164  	}
   165  	return s + "-" + j.Description()
   166  }
   167  
   168  func (j *baseDlJob) Notif() core.Notif { return j.notif }
   169  
   170  func (j *baseDlJob) AddNotif(n core.Notif, job jobif) {
   171  	var ok bool
   172  	debug.Assert(j.notif == nil) // currently, "add" means "set"
   173  	j.notif, ok = n.(*NotifDownload)
   174  	debug.Assert(ok)
   175  	j.notif.job = job
   176  	debug.Assert(j.notif.F != nil)
   177  	if n.Upon(core.UponProgress) {
   178  		debug.Assert(j.notif.P != nil)
   179  	}
   180  }
   181  
   182  func (j *baseDlJob) ActiveStats() (*StatusResp, error) {
   183  	resp, _, err := j.xdl.JobStatus(j.ID(), true /*onlyActive*/)
   184  	if err != nil {
   185  		return nil, err
   186  	}
   187  	return resp.(*StatusResp), nil
   188  }
   189  
   190  func (*baseDlJob) checkObj(string) bool    { debug.Assert(false); return false }
   191  func (j *baseDlJob) throttler() *throttler { return &j.throt }
   192  
// cleanup finalizes the job, in this order: stops the throttler, marks the job
// finished in the store, logs the resulting error (if any), calls the store's
// flush for this job, and reports completion via nl.OnFinished.
func (j *baseDlJob) cleanup() {
	j.throttler().stop()
	// NOTE: markFinished returns the error first, the aborted flag second
	err, aborted := g.store.markFinished(j.ID())
	// the job is considered aborted if either the store or the xaction says so
	aborted = aborted || j.xdl.IsAborted() // TODO: assert equality
	if err != nil {
		nlog.Errorln(j.String()+":", err, aborted)
	}
	g.store.flush(j.ID())
	nl.OnFinished(j.Notif(), err, aborted)
}
   203  
   204  //
   205  // sliceDlJob -- multiDlJob -- singleDlJob
   206  //
   207  
   208  func (j *sliceDlJob) init(bck *meta.Bck, objects cos.StrKVs) error {
   209  	objs, err := buildDlObjs(bck, objects)
   210  	if err != nil {
   211  		return err
   212  	}
   213  	j.objs = objs
   214  	return nil
   215  }
   216  
   217  func (j *sliceDlJob) Len() int { return len(j.objs) }
   218  
   219  func (j *sliceDlJob) genNext() (objs []dlObj, ok bool, err error) {
   220  	if j.current == len(j.objs) {
   221  		return nil, false, nil
   222  	}
   223  	if j.current+downloadBatchSize >= len(j.objs) {
   224  		objs = j.objs[j.current:]
   225  		j.current = len(j.objs)
   226  		return objs, true, nil
   227  	}
   228  
   229  	objs = j.objs[j.current : j.current+downloadBatchSize]
   230  	j.current += downloadBatchSize
   231  	return objs, true, nil
   232  }
   233  
   234  func newMultiDlJob(id string, bck *meta.Bck, payload *MultiBody, xdl *Xact) (mj *multiDlJob, err error) {
   235  	var objs cos.StrKVs
   236  
   237  	mj = &multiDlJob{}
   238  	mj.baseDlJob.init(id, bck, payload.Timeout, payload.Describe(), payload.Limits, xdl)
   239  
   240  	if objs, err = payload.ExtractPayload(); err != nil {
   241  		return nil, err
   242  	}
   243  	err = mj.sliceDlJob.init(bck, objs)
   244  	return
   245  }
   246  
   247  func (j *multiDlJob) String() (s string) { return "multi-" + j.baseDlJob.String() }
   248  
   249  func newSingleDlJob(id string, bck *meta.Bck, payload *SingleBody, xdl *Xact) (sj *singleDlJob, err error) {
   250  	var objs cos.StrKVs
   251  
   252  	sj = &singleDlJob{}
   253  	sj.baseDlJob.init(id, bck, payload.Timeout, payload.Describe(), payload.Limits, xdl)
   254  
   255  	if objs, err = payload.ExtractPayload(); err != nil {
   256  		return nil, err
   257  	}
   258  	err = sj.sliceDlJob.init(bck, objs)
   259  	return
   260  }
   261  
   262  func (j *singleDlJob) String() (s string) {
   263  	return "single-" + j.baseDlJob.String()
   264  }
   265  
   266  ////////////////
   267  // rangeDlJob //
   268  ////////////////
   269  
   270  // NOTE: the sizes of objects to be downloaded will be unknown.
   271  func newRangeDlJob(id string, bck *meta.Bck, payload *RangeBody, xdl *Xact) (rj *rangeDlJob, err error) {
   272  	rj = &rangeDlJob{}
   273  	if rj.pt, err = cos.ParseBashTemplate(payload.Template); err != nil {
   274  		return nil, err
   275  	}
   276  	rj.baseDlJob.init(id, bck, payload.Timeout, payload.Describe(), payload.Limits, xdl)
   277  
   278  	if rj.count, err = countObjects(rj.pt, payload.Subdir, rj.bck); err != nil {
   279  		return nil, err
   280  	}
   281  	rj.pt.InitIter()
   282  	rj.dir = payload.Subdir
   283  	return
   284  }
   285  
   286  func (j *rangeDlJob) SrcBck() *cmn.Bck { return j.bck.Bucket() }
   287  func (j *rangeDlJob) Len() int         { return j.count }
   288  
   289  func (j *rangeDlJob) genNext() ([]dlObj, bool, error) {
   290  	if j.done {
   291  		return nil, false, nil
   292  	}
   293  	if err := j.getNextObjs(); err != nil {
   294  		return nil, false, err
   295  	}
   296  	return j.objs, true, nil
   297  }
   298  
   299  func (j *rangeDlJob) String() (s string) {
   300  	return fmt.Sprintf("range-%s-%d-%s", &j.baseDlJob, j.count, j.dir)
   301  }
   302  
   303  func (j *rangeDlJob) getNextObjs() error {
   304  	var (
   305  		smap = core.T.Sowner().Get()
   306  		sid  = core.T.SID()
   307  	)
   308  	j.objs = j.objs[:0]
   309  	for len(j.objs) < downloadBatchSize {
   310  		link, ok := j.pt.Next()
   311  		if !ok {
   312  			j.done = true
   313  			break
   314  		}
   315  		name := path.Join(j.dir, path.Base(link))
   316  		obj, err := makeDlObj(smap, sid, j.bck, name, link)
   317  		if err != nil {
   318  			if err == errInvalidTarget {
   319  				continue
   320  			}
   321  			return err
   322  		}
   323  		j.objs = append(j.objs, obj)
   324  	}
   325  	return nil
   326  }
   327  
   328  //////////////////
   329  // backendDlJob //
   330  //////////////////
   331  
   332  func newBackendDlJob(id string, bck *meta.Bck, payload *BackendBody, xdl *Xact) (bj *backendDlJob, err error) {
   333  	if !bck.IsRemote() {
   334  		return nil, errors.New("bucket download requires a remote bucket")
   335  	} else if bck.IsHTTP() {
   336  		return nil, errors.New("bucket download does not support HTTP buckets")
   337  	}
   338  	bj = &backendDlJob{}
   339  	bj.baseDlJob.init(id, bck, payload.Timeout, payload.Describe(), payload.Limits, xdl)
   340  	{
   341  		bj.sync = payload.Sync
   342  		bj.prefix = payload.Prefix
   343  		bj.suffix = payload.Suffix
   344  	}
   345  	return
   346  }
   347  
   348  func (*backendDlJob) Len() int     { return -1 }
   349  func (j *backendDlJob) Sync() bool { return j.sync }
   350  
   351  func (j *backendDlJob) String() (s string) {
   352  	return fmt.Sprintf("backend-%s-%s-%s", &j.baseDlJob, j.prefix, j.suffix)
   353  }
   354  
   355  func (j *backendDlJob) checkObj(objName string) bool {
   356  	return strings.HasPrefix(objName, j.prefix) && strings.HasSuffix(objName, j.suffix)
   357  }
   358  
   359  func (j *backendDlJob) genNext() (objs []dlObj, ok bool, err error) {
   360  	if j.done {
   361  		return nil, false, nil
   362  	}
   363  	if err := j.getNextObjs(); err != nil {
   364  		return nil, false, err
   365  	}
   366  	return j.objs, true, nil
   367  }
   368  
   369  // Reads the content of a remote bucket page by page until any objects to
   370  // download found or the bucket list is over.
   371  func (j *backendDlJob) getNextObjs() error {
   372  	var (
   373  		sid     = core.T.SID()
   374  		smap    = core.T.Sowner().Get()
   375  		backend = core.T.Backend(j.bck)
   376  	)
   377  	j.objs = j.objs[:0]
   378  	for len(j.objs) < downloadBatchSize {
   379  		var (
   380  			lst = &cmn.LsoRes{}
   381  			msg = &apc.LsoMsg{Prefix: j.prefix, ContinuationToken: j.continuationToken, PageSize: j.bck.MaxPageSize()}
   382  		)
   383  		_, err := backend.ListObjects(j.bck, msg, lst)
   384  		if err != nil {
   385  			return err
   386  		}
   387  		j.continuationToken = lst.ContinuationToken
   388  
   389  		for _, entry := range lst.Entries {
   390  			if !j.checkObj(entry.Name) {
   391  				continue
   392  			}
   393  			obj, err := makeDlObj(smap, sid, j.bck, entry.Name, "")
   394  			if err != nil {
   395  				if err == errInvalidTarget {
   396  					continue
   397  				}
   398  				return err
   399  			}
   400  			j.objs = append(j.objs, obj)
   401  		}
   402  		if j.continuationToken == "" {
   403  			j.done = true
   404  			break
   405  		}
   406  	}
   407  	return nil
   408  }
   409  
   410  ///////////
   411  // dljob //
   412  ///////////
   413  
   414  func (j *dljob) clone() Job {
   415  	return Job{
   416  		ID:            j.id,
   417  		XactID:        j.xid,
   418  		Description:   j.description,
   419  		FinishedCnt:   int(j.finishedCnt.Load()),
   420  		ScheduledCnt:  int(j.scheduledCnt.Load()),
   421  		SkippedCnt:    int(j.skippedCnt.Load()),
   422  		ErrorCnt:      int(j.errorCnt.Load()),
   423  		Total:         j.total,
   424  		AllDispatched: j.allDispatched.Load(),
   425  		Aborted:       j.aborted.Load(),
   426  		StartedTime:   j.startedTime,
   427  		FinishedTime:  j.finishedTime.Load(),
   428  	}
   429  }
   430  
   431  // Used for debugging purposes to ensure integrity of the struct.
   432  func (j *dljob) valid() (err error) {
   433  	if j.aborted.Load() {
   434  		return
   435  	}
   436  	if !j.allDispatched.Load() {
   437  		return
   438  	}
   439  	if a, b, c := j.scheduledCnt.Load(), j.finishedCnt.Load(), j.errorCnt.Load(); a != b+c {
   440  		err = fmt.Errorf("invalid: %d != %d + %d", a, b, c)
   441  	}
   442  	return
   443  }