github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dload/api.go (about)

     1  // Package dload implements functionality to download resources into AIS cluster from external source.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package dload
     6  
     7  import (
     8  	"encoding/json"
     9  	"errors"
    10  	"fmt"
    11  	"path"
    12  	"regexp"
    13  	"strings"
    14  	"time"
    15  
    16  	"github.com/NVIDIA/aistore/cmn"
    17  	"github.com/NVIDIA/aistore/cmn/cos"
    18  	"github.com/NVIDIA/aistore/cmn/debug"
    19  	jsoniter "github.com/json-iterator/go"
    20  )
    21  
    22  type Type string
    23  
    24  const (
    25  	TypeSingle  Type = "single"
    26  	TypeRange   Type = "range"
    27  	TypeMulti   Type = "multi"
    28  	TypeBackend Type = "backend"
    29  )
    30  
    31  const PrefixJobID = "dnl-"
    32  
    33  const DownloadProgressInterval = 10 * time.Second
    34  
    35  type (
    36  	// NOTE: Changing this structure requires changes in `MarshalJSON` and `UnmarshalJSON` methods.
    37  	Body struct {
    38  		Type Type `json:"type"`
    39  		json.RawMessage
    40  	}
    41  
    42  	// Download POST result returned to the user
    43  	DlPostResp struct {
    44  		ID string `json:"id"`
    45  	}
    46  
    47  	Job struct {
    48  		ID            string    `json:"id"`
    49  		XactID        string    `json:"xaction_id"`
    50  		Description   string    `json:"description"`
    51  		StartedTime   time.Time `json:"started_time"`
    52  		FinishedTime  time.Time `json:"finished_time"`
    53  		FinishedCnt   int       `json:"finished_cnt"`
    54  		ScheduledCnt  int       `json:"scheduled_cnt"` // tasks being processed or already processed by dispatched
    55  		SkippedCnt    int       `json:"skipped_cnt"`   // number of tasks skipped
    56  		ErrorCnt      int       `json:"error_cnt"`
    57  		Total         int       `json:"total"`          // total number of tasks, negative if unknown
    58  		AllDispatched bool      `json:"all_dispatched"` // if true, dispatcher has already scheduled all tasks for given job
    59  		Aborted       bool      `json:"aborted"`
    60  	}
    61  
    62  	JobInfos []*Job
    63  
    64  	StatusResp struct {
    65  		Job
    66  		CurrentTasks  []TaskDlInfo  `json:"current_tasks,omitempty"`
    67  		FinishedTasks []TaskDlInfo  `json:"finished_tasks,omitempty"`
    68  		Errs          []TaskErrInfo `json:"download_errors,omitempty"`
    69  	}
    70  
    71  	Limits struct {
    72  		Connections  int `json:"connections"`
    73  		BytesPerHour int `json:"bytes_per_hour"`
    74  	}
    75  
    76  	Base struct {
    77  		Description      string  `json:"description"`
    78  		Bck              cmn.Bck `json:"bucket"`
    79  		Timeout          string  `json:"timeout"`
    80  		ProgressInterval string  `json:"progress_interval"`
    81  		Limits           Limits  `json:"limits"`
    82  	}
    83  
    84  	SingleObj struct {
    85  		ObjName    string `json:"object_name"`
    86  		Link       string `json:"link"`
    87  		FromRemote bool   `json:"from_remote"`
    88  	}
    89  
    90  	AdminBody struct {
    91  		ID         string `json:"id"`
    92  		Regex      string `json:"regex"`
    93  		OnlyActive bool   `json:"only_active_tasks"` // Skips detailed info about tasks finished/errored
    94  	}
    95  
    96  	TaskDlInfo struct {
    97  		Name       string    `json:"name"`
    98  		Downloaded int64     `json:"downloaded,string"`
    99  		Total      int64     `json:"total,string,omitempty"`
   100  		StartTime  time.Time `json:"start_time,omitempty"`
   101  		EndTime    time.Time `json:"end_time,omitempty"`
   102  	}
   103  	TaskInfoByName []TaskDlInfo
   104  
   105  	TaskErrInfo struct {
   106  		Name string `json:"name"`
   107  		Err  string `json:"error"`
   108  	}
   109  	TaskErrByName []TaskErrInfo
   110  
   111  	BackendBody struct {
   112  		Base
   113  		Prefix string `json:"prefix"`
   114  		Suffix string `json:"suffix"`
   115  		Sync   bool   `json:"synchronize"`
   116  	}
   117  
   118  	SingleBody struct {
   119  		Base
   120  		SingleObj
   121  	}
   122  
   123  	RangeBody struct {
   124  		Base
   125  		Template string `json:"template"`
   126  		Subdir   string `json:"subdir"`
   127  	}
   128  
   129  	MultiBody struct {
   130  		Base
   131  		ObjectsPayload any `json:"objects"`
   132  	}
   133  )
   134  
   135  func IsType(a string) bool {
   136  	b := Type(a)
   137  	return b == TypeMulti || b == TypeBackend || b == TypeSingle || b == TypeRange
   138  }
   139  
   140  /////////
   141  // Job //
   142  /////////
   143  
   144  func (j *Job) Aggregate(rhs *Job) {
   145  	j.FinishedCnt += rhs.FinishedCnt
   146  	j.ScheduledCnt += rhs.ScheduledCnt
   147  	j.SkippedCnt += rhs.SkippedCnt
   148  	j.ErrorCnt += rhs.ErrorCnt
   149  	j.Total += rhs.Total
   150  	j.AllDispatched = j.AllDispatched && rhs.AllDispatched
   151  	j.Aborted = j.Aborted || rhs.Aborted
   152  	if j.StartedTime.After(rhs.StartedTime) {
   153  		j.StartedTime = rhs.StartedTime
   154  	}
   155  	// Compute max out of `FinishedTime` only when both are non-zero.
   156  	if !cos.IsTimeZero(j.FinishedTime) {
   157  		if cos.IsTimeZero(rhs.FinishedTime) {
   158  			j.FinishedTime = rhs.FinishedTime
   159  		} else if j.FinishedTime.Before(rhs.FinishedTime) {
   160  			j.FinishedTime = rhs.FinishedTime
   161  		}
   162  	}
   163  }
   164  
   165  func _isRunning(fintime time.Time) bool { return cos.IsTimeZero(fintime) }
   166  
   167  func (j *Job) JobFinished() bool {
   168  	if _isRunning(j.FinishedTime) {
   169  		return false
   170  	}
   171  	debug.Assert(j.Aborted || (j.AllDispatched && j.ScheduledCnt == j.DoneCnt()))
   172  	return true
   173  }
   174  
   175  func (j *Job) JobRunning() bool {
   176  	return !j.JobFinished()
   177  }
   178  
   179  func (j *Job) TotalCnt() int {
   180  	if j.Total > 0 {
   181  		return j.Total
   182  	}
   183  	return j.ScheduledCnt
   184  }
   185  
   186  // DoneCnt returns number of tasks that have finished (either successfully or with an error).
   187  func (j *Job) DoneCnt() int { return j.FinishedCnt + j.ErrorCnt }
   188  
   189  // PendingCnt returns number of tasks which are currently being processed.
   190  func (j *Job) PendingCnt() int {
   191  	pending := j.TotalCnt() - j.DoneCnt()
   192  	debug.Assert(pending >= 0)
   193  	return pending
   194  }
   195  
   196  func (j *Job) String() string {
   197  	var (
   198  		sb       strings.Builder
   199  		pending  = j.PendingCnt()
   200  		finished = j.JobFinished()
   201  	)
   202  	sb.WriteString(j.ID)
   203  	if j.Description != "" {
   204  		sb.WriteString(" (")
   205  		sb.WriteString(j.Description)
   206  		sb.WriteString(")")
   207  	}
   208  	sb.WriteString(": ")
   209  
   210  	switch {
   211  	case j.Aborted:
   212  		sb.WriteString("aborted")
   213  	case finished:
   214  		sb.WriteString("finished")
   215  	default:
   216  		sb.WriteString(fmt.Sprintf("%d file%s still being downloaded", pending, cos.Plural(pending)))
   217  	}
   218  	return sb.String()
   219  }
   220  
   221  //////////
   222  // Body //
   223  //////////
   224  
   225  func (db Body) MarshalJSON() ([]byte, error) {
   226  	b, err := db.RawMessage.MarshalJSON()
   227  	if err != nil {
   228  		return nil, err
   229  	}
   230  	debug.Assert(b[0] == '{' && b[len(b)-1] == '}')
   231  	s := fmt.Sprintf(`{"type": %q, %s}`, db.Type, string(b[1:len(b)-1]))
   232  	return []byte(s), nil
   233  }
   234  
   235  func (db *Body) UnmarshalJSON(b []byte) error {
   236  	db.Type = Type(jsoniter.Get(b, "type").ToString())
   237  	if db.Type == "" {
   238  		return errors.New("'type' field is empty")
   239  	}
   240  	return db.RawMessage.UnmarshalJSON(b)
   241  }
   242  
   243  //////////////
   244  // JobInfos //
   245  //////////////
   246  
   247  func (d JobInfos) Len() int {
   248  	return len(d)
   249  }
   250  
   251  func (d JobInfos) Less(i, j int) bool {
   252  	di, dj := d[i], d[j]
   253  	if di.JobRunning() && dj.JobFinished() {
   254  		return true
   255  	} else if di.JobFinished() && dj.JobRunning() {
   256  		return false
   257  	} else if di.JobFinished() && dj.JobFinished() {
   258  		return di.FinishedTime.Before(dj.FinishedTime)
   259  	}
   260  	return di.StartedTime.Before(dj.StartedTime)
   261  }
   262  
   263  func (d JobInfos) Swap(i, j int) {
   264  	d[i], d[j] = d[j], d[i]
   265  }
   266  
   267  ////////////////
   268  // StatusResp //
   269  ////////////////
   270  
   271  func (d *StatusResp) Aggregate(rhs *StatusResp) *StatusResp {
   272  	if d == nil {
   273  		r := StatusResp{}
   274  		err := cos.MorphMarshal(rhs, &r)
   275  		debug.AssertNoErr(err)
   276  		return &r
   277  	}
   278  	d.Job.Aggregate(&rhs.Job)
   279  	d.CurrentTasks = append(d.CurrentTasks, rhs.CurrentTasks...)
   280  	d.FinishedTasks = append(d.FinishedTasks, rhs.FinishedTasks...)
   281  	d.Errs = append(d.Errs, rhs.Errs...)
   282  	return d
   283  }
   284  
   285  //////////
   286  // Base //
   287  //////////
   288  
   289  func (b *Base) Validate() error {
   290  	if b.Bck.Name == "" {
   291  		return errors.New("missing 'bucket.name'")
   292  	}
   293  	if b.Timeout != "" {
   294  		if _, err := time.ParseDuration(b.Timeout); err != nil {
   295  			return fmt.Errorf("failed to parse timeout field: %v", err)
   296  		}
   297  	}
   298  	if b.Limits.Connections < 0 {
   299  		return fmt.Errorf("'limit.connections' must be non-negative (got: %d)", b.Limits.Connections)
   300  	}
   301  	if b.Limits.BytesPerHour < 0 {
   302  		return fmt.Errorf("'limit.bytes_per_hour' must be non-negative (got: %d)", b.Limits.BytesPerHour)
   303  	}
   304  	return nil
   305  }
   306  
   307  ///////////////
   308  // SingleObj //
   309  ///////////////
   310  
   311  func (b *SingleObj) Validate() error {
   312  	if b.ObjName == "" {
   313  		objName := path.Base(b.Link)
   314  		if objName == "." || objName == "/" {
   315  			return errors.New("can not extract a valid 'object_name' from the provided download 'link'")
   316  		}
   317  		b.ObjName = objName
   318  	}
   319  	if b.Link == "" && !b.FromRemote {
   320  		return errors.New("missing 'link' in the request body")
   321  	}
   322  	if b.ObjName == "" {
   323  		return errors.New("missing 'object_name' in the request body")
   324  	}
   325  	return nil
   326  }
   327  
   328  ///////////////
   329  // AdminBody //
   330  ///////////////
   331  
   332  func (b *AdminBody) Validate(requireID bool) error {
   333  	if b.ID != "" && b.Regex != "" {
   334  		return fmt.Errorf("regex %q and job ID %q cannot be defined together (choose one or the other)", b.Regex, b.ID)
   335  	} else if b.Regex != "" {
   336  		if _, err := regexp.CompilePOSIX(b.Regex); err != nil {
   337  			return err
   338  		}
   339  	} else if b.ID == "" && requireID {
   340  		return errors.New("UUID not specified")
   341  	}
   342  	return nil
   343  }
   344  
   345  ////////////////////
   346  // TaskInfoByName //
   347  ////////////////////
   348  
   349  func (t TaskInfoByName) Len() int           { return len(t) }
   350  func (t TaskInfoByName) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
   351  func (t TaskInfoByName) Less(i, j int) bool { return t[i].Name < t[j].Name }
   352  
   353  func (t TaskErrByName) Len() int           { return len(t) }
   354  func (t TaskErrByName) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
   355  func (t TaskErrByName) Less(i, j int) bool { return t[i].Name < t[j].Name }
   356  
   357  ////////////////
   358  // SingleBody //
   359  ////////////////
   360  
   361  func (b *SingleBody) Validate() error {
   362  	if err := b.Base.Validate(); err != nil {
   363  		return err
   364  	}
   365  	return b.SingleObj.Validate()
   366  }
   367  
   368  func (b *SingleBody) ExtractPayload() (cos.StrKVs, error) {
   369  	objects := make(cos.StrKVs, 1)
   370  	objects[b.ObjName] = b.Link
   371  	return objects, nil
   372  }
   373  
   374  func (b *SingleBody) Describe() string {
   375  	if b.Description != "" {
   376  		return b.Description
   377  	}
   378  	return fmt.Sprintf("%s -> %s", b.Link, b.Bck.Cname(b.ObjName))
   379  }
   380  
   381  func (b *SingleBody) String() string {
   382  	return fmt.Sprintf("Link: %q, Bucket: %q, ObjName: %q.", b.Link, b.Bck, b.ObjName)
   383  }
   384  
   385  ///////////////
   386  // RangeBody //
   387  ///////////////
   388  
   389  func (b *RangeBody) Validate() error {
   390  	if err := b.Base.Validate(); err != nil {
   391  		return err
   392  	}
   393  	if b.Template == "" {
   394  		return errors.New("missing 'template' in the request body")
   395  	}
   396  	return nil
   397  }
   398  
   399  func (b *RangeBody) Describe() string {
   400  	if b.Description != "" {
   401  		return b.Description
   402  	}
   403  	return fmt.Sprintf("%s -> %s", b.Template, b.Bck)
   404  }
   405  
   406  func (b *RangeBody) String() string {
   407  	return fmt.Sprintf("bucket: %q, template: %q", b.Bck, b.Template)
   408  }
   409  
   410  ///////////////
   411  // MultiBody //
   412  ///////////////
   413  
   414  func (b *MultiBody) Validate() error {
   415  	if b.ObjectsPayload == nil {
   416  		return errors.New("body should not be empty")
   417  	}
   418  	return b.Base.Validate()
   419  }
   420  
   421  func (b *MultiBody) ExtractPayload() (cos.StrKVs, error) {
   422  	objects := make(cos.StrKVs, 10)
   423  	switch ty := b.ObjectsPayload.(type) {
   424  	case map[string]any:
   425  		for key, val := range ty {
   426  			switch v := val.(type) {
   427  			case string:
   428  				objects[key] = v
   429  			default:
   430  				return nil, fmt.Errorf("values in map should be strings, found: %T", v)
   431  			}
   432  		}
   433  	case []any:
   434  		// process all links
   435  		for _, val := range ty {
   436  			switch link := val.(type) {
   437  			case string:
   438  				objName := path.Base(link)
   439  				if objName == "." || objName == "/" {
   440  					err := fmt.Errorf("failed to extract object name from the download %q", link)
   441  					// TODO: ignore and continue?
   442  					return nil, err
   443  				}
   444  				objects[objName] = link
   445  			default:
   446  				return nil, fmt.Errorf("expected download link to be a string, got: %T", link)
   447  			}
   448  		}
   449  	default:
   450  		return nil, fmt.Errorf("JSON body should be map (string -> string) or array of strings, found: %T", ty)
   451  	}
   452  	return objects, nil
   453  }
   454  
   455  func (b *MultiBody) Describe() string {
   456  	if b.Description != "" {
   457  		return b.Description
   458  	}
   459  	return fmt.Sprintf("multi-download -> %s", b.Bck)
   460  }
   461  
   462  func (b *MultiBody) String() string {
   463  	return fmt.Sprintf("bucket: %q", b.Bck)
   464  }
   465  
   466  /////////////////
   467  // BackendBody //
   468  /////////////////
   469  
   470  func (b *BackendBody) Validate() error { return b.Base.Validate() }
   471  
   472  func (b *BackendBody) Describe() string {
   473  	if b.Description != "" {
   474  		return b.Description
   475  	}
   476  	return fmt.Sprintf("remote bucket prefetch -> %s", b.Bck)
   477  }