// github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/api_metrics.go

// Package dsort provides distributed massively parallel resharding for very large datasets.
/*
 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
 */
package dsort

import (
	"math"
	"sync"
	"time"

	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
)

// Names of the three Dsort phases.
const (
	ExtractionPhase = "extraction"
	SortingPhase    = "sorting"
	CreationPhase   = "creation"
)

// internals
type (
	// TimeStats contains statistics about the time spent on a specific task:
	// total, count, min, max and avg. All time values are in milliseconds.
	TimeStats struct {
		// Total is the total number of milliseconds spent on the task.
		Total int64 `json:"total_ms,string"`
		// Count is the number of times the task was triggered.
		Count int64 `json:"count,string"`
		// MinMs is the shortest recorded time. newTimeStats seeds it with
		// math.MaxInt64, which is therefore its value until the first update.
		MinMs int64 `json:"min_ms,string"`
		// MaxMs is the longest recorded time.
		MaxMs int64 `json:"max_ms,string"`
		// AvgMs is Total/Count (integer division), recomputed on every update.
		AvgMs int64 `json:"avg_ms,string"`
	}

	// phaseBase holds the state shared by all phases; it is embedded by the
	// 3 actual phases below.
	phaseBase struct {
		// Start is set by begin(); End is set by finish().
		Start time.Time `json:"started_time"`
		End   time.Time `json:"end_time"`
		// Elapsed is End-Start once the phase has finished. While the phase has
		// started but not yet ended, Metrics.update() refreshes it to the time
		// passed since Start. It is a time.Duration, not a number of seconds.
		Elapsed time.Duration `json:"elapsed"`
		// Running specifies if phase is in progress.
		Running bool `json:"running"`
		// Finished specifies if phase has finished. If both Running and Finished
		// are false, the phase has not started yet.
		Finished bool `json:"finished"`
		//
		// private
		//
		// mu is held by begin() and finish() while they modify the fields above,
		// and by Metrics.lock() for all phases at once.
		mu sync.Mutex `json:"-"`
	}
)

// phases
type (
	// LocalExtraction contains metrics for first phase of Dsort.
	LocalExtraction struct {
		phaseBase
		// TotalCnt is the number of shards Dsort has to process in total.
		TotalCnt int64 `json:"total_count,string"`
		// ExtractedCnt is the cumulative number of extracted shards. In the
		// end, this should be roughly equal to TotalCnt/#Targets.
		ExtractedCnt int64 `json:"extracted_count,string"`
		// ExtractedSize is uncompressed size of extracted shards.
		ExtractedSize int64 `json:"extracted_size,string"`
		// ExtractedRecordCnt - number of records extracted from all shards.
		ExtractedRecordCnt int64 `json:"extracted_record_count,string"`
		// ExtractedToDiskCnt is the number of shards extracted to disk. To
		// compute the number of shards extracted to memory, subtract it from
		// ExtractedCnt.
		ExtractedToDiskCnt int64 `json:"extracted_to_disk_count,string"`
		// ExtractedToDiskSize - uncompressed size of shards extracted to disk.
		ExtractedToDiskSize int64 `json:"extracted_to_disk_size,string"`
	}

	// MetaSorting contains metrics for second phase of Dsort.
	MetaSorting struct {
		phaseBase
		// SentStats - time statistics about records sent to another target
		SentStats *TimeStats `json:"sent_stats,omitempty"`
		// RecvStats - time statistics about records received from another target
		RecvStats *TimeStats `json:"recv_stats,omitempty"`
	}

	// ShardCreation contains metrics for third and last phase of Dsort.
	ShardCreation struct {
		phaseBase
		// ToCreate - number of shards to be created in this phase.
		ToCreate int64 `json:"to_create,string"`
		// CreatedCnt is the number of shards that have been created so far.
		// Should match ToCreate when phase finishes.
		CreatedCnt int64 `json:"created_count,string"`
		// MovedShardCnt specifies the number of shards that have migrated from this
		// to another target. Applies only when dealing with compressed
		// data. Sometimes, rather than creating at the destination, it is faster
		// to create a shard on a specific target and send it over (to the destination).
		MovedShardCnt int64 `json:"moved_shard_count,string"`
		// RequestStats - time statistics: requests to other targets.
		// NOTE: not allocated by newMetrics; nil unless set elsewhere.
		RequestStats *TimeStats `json:"req_stats,omitempty"`
		// ResponseStats - time statistics: responses to other targets.
		// NOTE: not allocated by newMetrics; nil unless set elsewhere.
		ResponseStats *TimeStats `json:"resp_stats,omitempty"`
	}
)

// main stats-and-status types
type (
	// Metrics is the general struct that contains all stats about a Dsort run:
	// one entry per phase plus description, warnings, errors and status flags.
	Metrics struct {
		Extraction *LocalExtraction `json:"local_extraction,omitempty"`
		Sorting    *MetaSorting     `json:"meta_sorting,omitempty"`
		Creation   *ShardCreation   `json:"shard_creation,omitempty"`

		// job description
		Description string `json:"description,omitempty"`

		// warnings during the run
		Warnings []string `json:"warnings,omitempty"`
		// errors, if any
		Errors []string `json:"errors,omitempty"`

		// has been aborted (updated via setAbortedTo)
		Aborted atomic.Bool `json:"aborted,omitempty"`
		// has been archived to persistent storage
		Archived atomic.Bool `json:"archived,omitempty"`
	}

	// JobInfo is a struct that contains stats that represent the Dsort run in a list.
	// It is produced by Metrics.ToJobInfo and merged across sources by Aggregate.
	//
	// NOTE(review): the JSON keys of the three *Duration fields ("started_meta_sorting",
	// "started_shard_creation", "finished_shard_creation") do not match the field names;
	// presumably retained for wire compatibility - confirm before renaming.
	JobInfo struct {
		ID                string        `json:"id"` // job ID == xact ID (aka managerUUID)
		SrcBck            cmn.Bck       `json:"src-bck"`
		DstBck            cmn.Bck       `json:"dst-bck"`
		StartedTime       time.Time     `json:"started_time,omitempty"`
		FinishTime        time.Time     `json:"finish_time,omitempty"`
		ExtractedDuration time.Duration `json:"started_meta_sorting,omitempty"`
		SortingDuration   time.Duration `json:"started_shard_creation,omitempty"`
		CreationDuration  time.Duration `json:"finished_shard_creation,omitempty"`
		Objs              int64         `json:"loc-objs,string"`  // locally processed (Extraction.ExtractedCnt)
		Bytes             int64         `json:"loc-bytes,string"` // locally processed (Extraction.ExtractedSize)
		Metrics           *Metrics
		Aborted           bool `json:"aborted"`
		Archived          bool `json:"archived"`
	}
)

///////////////
// phaseBase //
///////////////

// begin marks the phase as in progress and records its start time.
152 func (pi *phaseBase) begin() { 153 pi.mu.Lock() 154 pi.Running = true 155 pi.Start = time.Now() 156 pi.mu.Unlock() 157 } 158 159 // finish marks phase as finished. 160 func (pi *phaseBase) finish() { 161 pi.mu.Lock() 162 pi.Running = false 163 pi.Finished = true 164 pi.End = time.Now() 165 pi.Elapsed = pi.End.Sub(pi.Start) 166 pi.mu.Unlock() 167 } 168 169 ///////////// 170 // Metrics // 171 ///////////// 172 173 func newMetrics(description string) *Metrics { 174 return &Metrics{ 175 Description: description, 176 Extraction: &LocalExtraction{}, 177 Sorting: &MetaSorting{ 178 SentStats: newTimeStats(), 179 RecvStats: newTimeStats(), 180 }, 181 Creation: &ShardCreation{}, 182 } 183 } 184 185 // setAbortedTo updates aborted state of Dsort. 186 func (m *Metrics) setAbortedTo(b bool) { 187 m.Aborted.Store(b) 188 } 189 190 // Lock locks all phases to make sure that all of them can be updated. 191 func (m *Metrics) lock() { 192 m.Extraction.mu.Lock() 193 m.Sorting.mu.Lock() 194 m.Creation.mu.Lock() 195 } 196 197 // Unlock unlocks all phases. 198 func (m *Metrics) unlock() { 199 m.Creation.mu.Unlock() 200 m.Sorting.mu.Unlock() 201 m.Extraction.mu.Unlock() 202 } 203 204 func (m *Metrics) ElapsedTime() time.Duration { 205 return m.Creation.End.Sub(m.Extraction.Start) 206 } 207 208 // update updates elapsed time for all the metrics. 209 // NOTE: must be done under lock every time Metrics are about to be marshaled and sent through the network. 
210 func (m *Metrics) update() { 211 if m.Extraction.End.IsZero() && !m.Extraction.Start.IsZero() { 212 m.Extraction.Elapsed = time.Since(m.Extraction.Start) 213 } 214 if m.Sorting.End.IsZero() && !m.Sorting.Start.IsZero() { 215 m.Sorting.Elapsed = time.Since(m.Sorting.Start) 216 } 217 if m.Creation.End.IsZero() && !m.Creation.Start.IsZero() { 218 m.Creation.Elapsed = time.Since(m.Creation.Start) 219 } 220 } 221 222 func (m *Metrics) ToJobInfo(id string, pars *parsedReqSpec) JobInfo { 223 return JobInfo{ 224 ID: id, 225 SrcBck: pars.InputBck, 226 DstBck: pars.OutputBck, 227 StartedTime: m.Extraction.Start, 228 FinishTime: m.Creation.End, 229 ExtractedDuration: m.Extraction.Elapsed, 230 SortingDuration: m.Sorting.Elapsed, 231 CreationDuration: m.Creation.Elapsed, 232 Objs: m.Extraction.ExtractedCnt, 233 Bytes: m.Extraction.ExtractedSize, 234 Metrics: m, 235 Aborted: m.Aborted.Load(), 236 Archived: m.Archived.Load(), 237 } 238 } 239 240 ///////////// 241 // JobInfo // 242 ///////////// 243 244 func (j *JobInfo) Aggregate(other *JobInfo) { 245 j.StartedTime = startTime(j.StartedTime, other.StartedTime) 246 j.FinishTime = stopTime(j.FinishTime, other.FinishTime) 247 248 j.ExtractedDuration = max(j.ExtractedDuration, other.ExtractedDuration) 249 j.SortingDuration = max(j.SortingDuration, other.SortingDuration) 250 j.CreationDuration = max(j.CreationDuration, other.CreationDuration) 251 252 j.Aborted = j.Aborted || other.Aborted 253 j.Archived = j.Archived && other.Archived 254 255 j.Objs += other.Objs 256 j.Bytes += other.Bytes 257 } 258 259 func (j *JobInfo) IsRunning() bool { 260 return !j.Aborted && !j.Archived 261 } 262 263 func (j *JobInfo) IsFinished() bool { 264 return !j.IsRunning() 265 } 266 267 // startTime returns the start time of a,b. If either is zero, the other takes precedence. 
268 func startTime(a, b time.Time) time.Time { 269 if (a.Before(b) && !a.IsZero()) || b.IsZero() { 270 return a 271 } 272 return b 273 } 274 275 // stopTime returns the stop time of a,b. If either is zero it's unknown and returns 0. 276 func stopTime(a, b time.Time) time.Time { 277 if (a.After(b) && !b.IsZero()) || a.IsZero() { 278 return a 279 } 280 return b 281 } 282 283 // 284 // utility 285 // 286 287 func newTimeStats() *TimeStats { 288 return &TimeStats{ 289 MinMs: math.MaxInt64, 290 } 291 } 292 293 func (ts *TimeStats) updateTime(newTime time.Duration) { 294 t := newTime.Nanoseconds() / int64(time.Millisecond) 295 ts.Total += t 296 ts.Count++ 297 ts.MinMs = min(ts.MinMs, t) 298 ts.MaxMs = max(ts.MaxMs, t) 299 ts.AvgMs = ts.Total / ts.Count 300 }