github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/api_metrics.go (about)

     1  // Package dsort provides distributed massively parallel resharding for very large datasets.
     2  /*
     3   * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package dsort
     6  
     7  import (
     8  	"math"
     9  	"sync"
    10  	"time"
    11  
    12  	"github.com/NVIDIA/aistore/cmn"
    13  	"github.com/NVIDIA/aistore/cmn/atomic"
    14  )
    15  
// Names of the Dsort phases.
// NOTE(review): presumably these identify the three phases whose metrics are
// kept in LocalExtraction, MetaSorting and ShardCreation respectively - confirm
// against the callers that use them.
const (
	ExtractionPhase = "extraction"
	SortingPhase    = "sorting"
	CreationPhase   = "creation"
)
    21  
    22  // internals
type (
	// TimeStats accumulates timing statistics for one kind of task: the number
	// of samples, their sum, and the min, max and avg times. All values are in
	// milliseconds.
	TimeStats struct {
		// Total is the total number of milliseconds spent on the
		// specific task.
		Total int64 `json:"total_ms,string"`
		// Count is the number of times the specific task was triggered.
		Count int64 `json:"count,string"`
		// MinMs is the shortest sample recorded so far. newTimeStats seeds it
		// with math.MaxInt64, which is therefore the value it holds until the
		// first sample is recorded.
		MinMs int64 `json:"min_ms,string"`
		// MaxMs is the longest sample recorded so far.
		MaxMs int64 `json:"max_ms,string"`
		// AvgMs is Total/Count (integer division), recomputed on every sample.
		AvgMs int64 `json:"avg_ms,string"`
	}

	// phaseBase holds the state shared by all phases;
	// included by 3 actual phases below
	phaseBase struct {
		// Start is set by begin(), End by finish().
		Start time.Time `json:"started_time"`
		End   time.Time `json:"end_time"`
		// Elapsed is the time from Start to a given point in time (refreshed by
		// Metrics.update while End is still zero), or from Start to End once the
		// phase has finished.
		// NOTE(review): this comment used to say "in seconds", but the field is
		// a time.Duration, which counts nanoseconds - confirm which unit the
		// consumers of the JSON expect.
		Elapsed time.Duration `json:"elapsed"`
		// Running specifies if the phase is in progress.
		Running bool `json:"running"`
		// Finished specifies if the phase has finished. If both Running and
		// Finished are false, the phase has not started yet.
		Finished bool `json:"finished"`
		//
		// private
		//
		// mu is taken by begin() and finish() while they update the fields
		// above, and by Metrics.lock()/unlock().
		mu sync.Mutex `json:"-"`
	}
)
    55  
    56  // phases
type (
	// LocalExtraction contains metrics for the first phase of Dsort.
	LocalExtraction struct {
		phaseBase
		// TotalCnt is the number of shards Dsort has to process in total.
		TotalCnt int64 `json:"total_count,string"`
		// ExtractedCnt is the cumulative number of extracted shards. In the
		// end, this should be roughly equal to TotalCnt/#Targets.
		ExtractedCnt int64 `json:"extracted_count,string"`
		// ExtractedSize is the uncompressed size of extracted shards.
		ExtractedSize int64 `json:"extracted_size,string"`
		// ExtractedRecordCnt - number of records extracted from all shards.
		ExtractedRecordCnt int64 `json:"extracted_record_count,string"`
		// ExtractedToDiskCnt is the number of shards extracted to disk. To
		// compute the number of shards extracted to memory, subtract it from
		// ExtractedCnt.
		ExtractedToDiskCnt int64 `json:"extracted_to_disk_count,string"`
		// ExtractedToDiskSize - uncompressed size of shards extracted to disk.
		ExtractedToDiskSize int64 `json:"extracted_to_disk_size,string"`
	}

	// MetaSorting contains metrics for the second phase of Dsort.
	MetaSorting struct {
		phaseBase
		// SentStats - time statistics about records sent to another target
		// (allocated by newMetrics).
		SentStats *TimeStats `json:"sent_stats,omitempty"`
		// RecvStats - time statistics about records received from another target
		// (allocated by newMetrics).
		RecvStats *TimeStats `json:"recv_stats,omitempty"`
	}

	// ShardCreation contains metrics for the third and last phase of Dsort.
	ShardCreation struct {
		phaseBase
		// ToCreate - number of shards that are to be created in this phase.
		ToCreate int64 `json:"to_create,string"`
		// CreatedCnt is the number of shards that have been created so far.
		// Should match ToCreate when the phase finishes.
		CreatedCnt int64 `json:"created_count,string"`
		// MovedShardCnt specifies the number of shards that have migrated from this
		// to another target. Applies only when dealing with compressed
		// data. Sometimes, rather than creating at the destination, it is faster
		// to create a shard on a specific target and send it over (to the destination).
		MovedShardCnt int64 `json:"moved_shard_count,string"`
		// RequestStats - time statistics: requests to other targets.
		// NOTE: newMetrics leaves this nil, so it is omitted from JSON unless it
		// is set elsewhere.
		RequestStats *TimeStats `json:"req_stats,omitempty"`
		// ResponseStats - time statistics: responses to other targets.
		// NOTE: newMetrics leaves this nil, so it is omitted from JSON unless it
		// is set elsewhere.
		ResponseStats *TimeStats `json:"resp_stats,omitempty"`
	}
)
   106  
   107  // main stats-and-status types
type (
	// Metrics is the general struct which contains all stats about a Dsort run:
	// one sub-struct per phase plus run-level description, warnings, errors and
	// state flags. newMetrics allocates all three phase structs.
	Metrics struct {
		Extraction *LocalExtraction `json:"local_extraction,omitempty"`
		Sorting    *MetaSorting     `json:"meta_sorting,omitempty"`
		Creation   *ShardCreation   `json:"shard_creation,omitempty"`

		// job description
		Description string `json:"description,omitempty"`

		// warnings during the run
		Warnings []string `json:"warnings,omitempty"`
		// errors, if any
		Errors []string `json:"errors,omitempty"`

		// has been aborted (see setAbortedTo)
		Aborted atomic.Bool `json:"aborted,omitempty"`
		// has been archived to persistent storage
		Archived atomic.Bool `json:"archived,omitempty"`
	}

	// JobInfo is a struct that contains stats that represent the Dsort run in a
	// list. Metrics.ToJobInfo builds it from one Metrics instance; Aggregate
	// folds another JobInfo into it.
	//
	// As populated by ToJobInfo:
	//   - StartedTime and FinishTime are Extraction.Start and Creation.End.
	//   - ExtractedDuration, SortingDuration and CreationDuration are the
	//     Elapsed values of the respective phases.
	//   - Objs and Bytes are Extraction.ExtractedCnt and Extraction.ExtractedSize.
	//   - Aborted and Archived are snapshots of the corresponding Metrics flags.
	//
	// NOTE(review): the JSON names of the three duration fields
	// ("started_meta_sorting", "started_shard_creation",
	// "finished_shard_creation") read like timestamps rather than durations -
	// presumably kept for wire compatibility; confirm before renaming.
	JobInfo struct {
		ID                string        `json:"id"` // job ID == xact ID (aka managerUUID)
		SrcBck            cmn.Bck       `json:"src-bck"`
		DstBck            cmn.Bck       `json:"dst-bck"`
		StartedTime       time.Time     `json:"started_time,omitempty"`
		FinishTime        time.Time     `json:"finish_time,omitempty"`
		ExtractedDuration time.Duration `json:"started_meta_sorting,omitempty"`
		SortingDuration   time.Duration `json:"started_shard_creation,omitempty"`
		CreationDuration  time.Duration `json:"finished_shard_creation,omitempty"`
		Objs              int64         `json:"loc-objs,string"`  // locally processed
		Bytes             int64         `json:"loc-bytes,string"` //
		Metrics           *Metrics
		Aborted           bool `json:"aborted"`
		Archived          bool `json:"archived"`
	}
)
   146  
   147  ///////////////
   148  // phaseBase //
   149  ///////////////
   150  
   151  // begin marks phase as in progress.
   152  func (pi *phaseBase) begin() {
   153  	pi.mu.Lock()
   154  	pi.Running = true
   155  	pi.Start = time.Now()
   156  	pi.mu.Unlock()
   157  }
   158  
   159  // finish marks phase as finished.
   160  func (pi *phaseBase) finish() {
   161  	pi.mu.Lock()
   162  	pi.Running = false
   163  	pi.Finished = true
   164  	pi.End = time.Now()
   165  	pi.Elapsed = pi.End.Sub(pi.Start)
   166  	pi.mu.Unlock()
   167  }
   168  
   169  /////////////
   170  // Metrics //
   171  /////////////
   172  
   173  func newMetrics(description string) *Metrics {
   174  	return &Metrics{
   175  		Description: description,
   176  		Extraction:  &LocalExtraction{},
   177  		Sorting: &MetaSorting{
   178  			SentStats: newTimeStats(),
   179  			RecvStats: newTimeStats(),
   180  		},
   181  		Creation: &ShardCreation{},
   182  	}
   183  }
   184  
   185  // setAbortedTo updates aborted state of Dsort.
   186  func (m *Metrics) setAbortedTo(b bool) {
   187  	m.Aborted.Store(b)
   188  }
   189  
   190  // Lock locks all phases to make sure that all of them can be updated.
   191  func (m *Metrics) lock() {
   192  	m.Extraction.mu.Lock()
   193  	m.Sorting.mu.Lock()
   194  	m.Creation.mu.Lock()
   195  }
   196  
   197  // Unlock unlocks all phases.
   198  func (m *Metrics) unlock() {
   199  	m.Creation.mu.Unlock()
   200  	m.Sorting.mu.Unlock()
   201  	m.Extraction.mu.Unlock()
   202  }
   203  
   204  func (m *Metrics) ElapsedTime() time.Duration {
   205  	return m.Creation.End.Sub(m.Extraction.Start)
   206  }
   207  
   208  // update updates elapsed time for all the metrics.
   209  // NOTE: must be done under lock every time Metrics are about to be marshaled and sent through the network.
   210  func (m *Metrics) update() {
   211  	if m.Extraction.End.IsZero() && !m.Extraction.Start.IsZero() {
   212  		m.Extraction.Elapsed = time.Since(m.Extraction.Start)
   213  	}
   214  	if m.Sorting.End.IsZero() && !m.Sorting.Start.IsZero() {
   215  		m.Sorting.Elapsed = time.Since(m.Sorting.Start)
   216  	}
   217  	if m.Creation.End.IsZero() && !m.Creation.Start.IsZero() {
   218  		m.Creation.Elapsed = time.Since(m.Creation.Start)
   219  	}
   220  }
   221  
   222  func (m *Metrics) ToJobInfo(id string, pars *parsedReqSpec) JobInfo {
   223  	return JobInfo{
   224  		ID:                id,
   225  		SrcBck:            pars.InputBck,
   226  		DstBck:            pars.OutputBck,
   227  		StartedTime:       m.Extraction.Start,
   228  		FinishTime:        m.Creation.End,
   229  		ExtractedDuration: m.Extraction.Elapsed,
   230  		SortingDuration:   m.Sorting.Elapsed,
   231  		CreationDuration:  m.Creation.Elapsed,
   232  		Objs:              m.Extraction.ExtractedCnt,
   233  		Bytes:             m.Extraction.ExtractedSize,
   234  		Metrics:           m,
   235  		Aborted:           m.Aborted.Load(),
   236  		Archived:          m.Archived.Load(),
   237  	}
   238  }
   239  
   240  /////////////
   241  // JobInfo //
   242  /////////////
   243  
   244  func (j *JobInfo) Aggregate(other *JobInfo) {
   245  	j.StartedTime = startTime(j.StartedTime, other.StartedTime)
   246  	j.FinishTime = stopTime(j.FinishTime, other.FinishTime)
   247  
   248  	j.ExtractedDuration = max(j.ExtractedDuration, other.ExtractedDuration)
   249  	j.SortingDuration = max(j.SortingDuration, other.SortingDuration)
   250  	j.CreationDuration = max(j.CreationDuration, other.CreationDuration)
   251  
   252  	j.Aborted = j.Aborted || other.Aborted
   253  	j.Archived = j.Archived && other.Archived
   254  
   255  	j.Objs += other.Objs
   256  	j.Bytes += other.Bytes
   257  }
   258  
   259  func (j *JobInfo) IsRunning() bool {
   260  	return !j.Aborted && !j.Archived
   261  }
   262  
   263  func (j *JobInfo) IsFinished() bool {
   264  	return !j.IsRunning()
   265  }
   266  
   267  // startTime returns the start time of a,b. If either is zero, the other takes precedence.
   268  func startTime(a, b time.Time) time.Time {
   269  	if (a.Before(b) && !a.IsZero()) || b.IsZero() {
   270  		return a
   271  	}
   272  	return b
   273  }
   274  
   275  // stopTime returns the stop time of a,b. If either is zero it's unknown and returns 0.
   276  func stopTime(a, b time.Time) time.Time {
   277  	if (a.After(b) && !b.IsZero()) || a.IsZero() {
   278  		return a
   279  	}
   280  	return b
   281  }
   282  
   283  //
   284  // utility
   285  //
   286  
   287  func newTimeStats() *TimeStats {
   288  	return &TimeStats{
   289  		MinMs: math.MaxInt64,
   290  	}
   291  }
   292  
   293  func (ts *TimeStats) updateTime(newTime time.Duration) {
   294  	t := newTime.Nanoseconds() / int64(time.Millisecond)
   295  	ts.Total += t
   296  	ts.Count++
   297  	ts.MinMs = min(ts.MinMs, t)
   298  	ts.MaxMs = max(ts.MaxMs, t)
   299  	ts.AvgMs = ts.Total / ts.Count
   300  }