go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/clustering/reclustering/worker.go

// Copyright 2022 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package reclustering

import (
	"context"
	"encoding/hex"
	"fmt"
	"math/big"
	"time"

	"go.opentelemetry.io/otel/attribute"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
	"google.golang.org/protobuf/types/known/timestamppb"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/retry"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/server/span"

	"go.chromium.org/luci/analysis/internal/clustering/algorithms"
	cpb "go.chromium.org/luci/analysis/internal/clustering/proto"
	"go.chromium.org/luci/analysis/internal/clustering/shards"
	"go.chromium.org/luci/analysis/internal/clustering/state"
	"go.chromium.org/luci/analysis/internal/config/compiledcfg"
	"go.chromium.org/luci/analysis/internal/tasks/taskspb"
	"go.chromium.org/luci/analysis/internal/tracing"
)

const (
	// batchSize is the number of chunks to read from Spanner at a time.
	batchSize = 10

	// TargetTaskDuration is the desired duration of a re-clustering task.
	// If a task completes before the reclustering run has completed, a
	// continuation task will be scheduled.
	//
	// Longer durations will incur lower task queuing/re-queueing overhead,
	// but limit the ability of autoscaling to move tasks between instances
	// in response to load.
	TargetTaskDuration = 2 * time.Second

	// ProgressInterval is the amount of time between progress updates.
	//
	// Note that this is the frequency at which updates should
	// be reported for a shard of work; individual tasks are usually
	// much shorter lived and consequently most will not report any progress
	// (unless it is time for the shard to report progress again).
	ProgressInterval = 5 * time.Second
)

// ChunkStore is the interface for the blob store archiving chunks of test
// results for later re-clustering.
type ChunkStore interface {
	// Get retrieves the chunk with the specified object ID and returns it.
	Get(ctx context.Context, project, objectID string) (*cpb.Chunk, error)
}
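// The indented block below is an illustrative, in-memory ChunkStore that
// could be used to exercise the worker in tests. It is a minimal sketch
// only (the type name and keying scheme are assumptions, not part of this
// package); the production store archives chunks in a blob store such as GCS.
//
//	type inMemoryChunkStore struct {
//		// chunks is keyed by "<project>/<objectID>".
//		chunks map[string]*cpb.Chunk
//	}
//
//	func (s *inMemoryChunkStore) Get(ctx context.Context, project, objectID string) (*cpb.Chunk, error) {
//		chunk, ok := s.chunks[project+"/"+objectID]
//		if !ok {
//			return nil, fmt.Errorf("chunk %s/%s not found", project, objectID)
//		}
//		return chunk, nil
//	}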
// Worker provides methods to process re-clustering tasks. It is safe to be
// used by multiple threads concurrently.
type Worker struct {
	chunkStore ChunkStore
	analysis   Analysis
}

// NewWorker initialises a new Worker.
func NewWorker(chunkStore ChunkStore, analysis Analysis) *Worker {
	return &Worker{
		chunkStore: chunkStore,
		analysis:   analysis,
	}
}

// taskContext provides objects relevant to working on a particular
// re-clustering task.
type taskContext struct {
	worker *Worker
	task   *taskspb.ReclusterChunks
	// nextReportDue is the time at which the next progress update is
	// due.
	nextReportDue time.Time
	// currentChunkID is the exclusive lower bound of the range
	// of ChunkIds still to re-cluster.
	currentChunkID string
}

// Do works on a re-clustering task for approximately duration, returning a
// continuation task (if the run end time has not been reached).
//
// Continuation tasks are used to better integrate with GAE autoscaling;
// autoscaling works best when tasks are relatively small (so that work
// can be moved between instances in real time).
func (w *Worker) Do(ctx context.Context, task *taskspb.ReclusterChunks, duration time.Duration) (*taskspb.ReclusterChunks, error) {
	if task.State == nil {
		return nil, errors.New("task does not have state")
	}
	if task.ShardNumber <= 0 {
		return nil, errors.New("task must have valid shard number")
	}
	if task.AlgorithmsVersion <= 0 {
		return nil, errors.New("task must have valid algorithms version")
	}

	runEndTime := task.AttemptTime.AsTime()

	if task.AlgorithmsVersion > algorithms.AlgorithmsVersion {
		return nil, fmt.Errorf("running out-of-date algorithms version (task requires %v, worker running %v)",
			task.AlgorithmsVersion, algorithms.AlgorithmsVersion)
	}

	tctx := &taskContext{
		worker:         w,
		task:           task,
		nextReportDue:  task.State.NextReportDue.AsTime(),
		currentChunkID: task.State.CurrentChunkId,
	}

	// softEndTime is the (soft) deadline for the run.
	softEndTime := clock.Now(ctx).Add(duration)
	if runEndTime.Before(softEndTime) {
		// Stop by the run end time.
		softEndTime = runEndTime
	}

	var done bool
	for clock.Now(ctx).Before(softEndTime) && !done {
		err := retry.Retry(ctx, transient.Only(retry.Default), func() error {
			// Stop harder if retrying after the run end time, to avoid
			// getting stuck in a retry loop if we are running in
			// parallel with another worker.
			if !clock.Now(ctx).Before(runEndTime) {
				return nil
			}
			var err error
			done, err = tctx.recluster(ctx)
			return err
		}, nil)
		if err != nil {
			return nil, err
		}
	}

	var continuation *taskspb.ReclusterChunks
	if softEndTime.Before(runEndTime) && !done {
		continuation = &taskspb.ReclusterChunks{
			ShardNumber:       task.ShardNumber,
			Project:           task.Project,
			AttemptTime:       task.AttemptTime,
			StartChunkId:      task.StartChunkId,
			EndChunkId:        task.EndChunkId,
			AlgorithmsVersion: task.AlgorithmsVersion,
			RulesVersion:      task.RulesVersion,
			ConfigVersion:     task.ConfigVersion,
			State: &taskspb.ReclusterChunkState{
				CurrentChunkId: tctx.currentChunkID,
				NextReportDue:  timestamppb.New(tctx.nextReportDue),
			},
		}
	}
	return continuation, nil
}
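// The indented block below is an illustrative sketch of how a task handler
// might drive the worker. It is an assumption for illustration only: in
// particular, scheduleContinuation is a hypothetical helper standing in for
// whatever task-queue wiring actually enqueues the follow-up task.
//
//	worker := NewWorker(chunkStore, analysis)
//	continuation, err := worker.Do(ctx, task, TargetTaskDuration)
//	if err != nil {
//		return err
//	}
//	if continuation != nil {
//		// More work remains for this shard; schedule a follow-up task.
//		if err := scheduleContinuation(ctx, continuation); err != nil {
//			return err
//		}
//	}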
// recluster tries to re-cluster some chunks, advancing currentChunkID
// as it succeeds. It returns 'true' if all chunks to be re-clustered by
// the reclustering task were completed.
func (t *taskContext) recluster(ctx context.Context) (done bool, err error) {
	ctx, s := tracing.Start(ctx, "go.chromium.org/luci/analysis/internal/clustering/reclustering.recluster",
		attribute.String("project", t.task.Project),
		attribute.String("currentChunkID", t.currentChunkID),
	)
	defer func() { tracing.End(s, err) }()

	readOpts := state.ReadNextOptions{
		StartChunkID:      t.currentChunkID,
		EndChunkID:        t.task.EndChunkId,
		AlgorithmsVersion: t.task.AlgorithmsVersion,
		ConfigVersion:     t.task.ConfigVersion.AsTime(),
		RulesVersion:      t.task.RulesVersion.AsTime(),
	}
	entries, err := state.ReadNextN(span.Single(ctx), t.task.Project, readOpts, batchSize)
	if err != nil {
		return false, errors.Annotate(err, "read next chunk state").Err()
	}
	if len(entries) == 0 {
		// We have finished re-clustering.
		err = t.updateProgress(ctx, shards.MaxProgress)
		if err != nil {
			return true, err
		}
		return true, nil
	}

	pendingUpdates := NewPendingUpdates(ctx)

	for i, entry := range entries {
		// Read the test results from GCS.
		chunk, err := t.worker.chunkStore.Get(ctx, t.task.Project, entry.ObjectID)
		if err != nil {
			return false, errors.Annotate(err, "read chunk").Err()
		}

		// Obtain a recent ruleset of at least RulesVersion.
		ruleset, err := Ruleset(ctx, t.task.Project, t.task.RulesVersion.AsTime())
		if err != nil {
			return false, errors.Annotate(err, "obtain ruleset").Err()
		}

		// Obtain a recent configuration of at least ConfigVersion.
		cfg, err := compiledcfg.Project(ctx, t.task.Project, t.task.ConfigVersion.AsTime())
		if err != nil {
			return false, errors.Annotate(err, "obtain config").Err()
		}

		// Re-cluster the test results in Spanner, then export
		// the re-clustering to BigQuery for analysis.
		update, err := PrepareUpdate(ctx, ruleset, cfg, chunk, entry)
		if err != nil {
			return false, errors.Annotate(err, "re-cluster chunk").Err()
		}

		pendingUpdates.Add(update)

		if pendingUpdates.ShouldApply(ctx) || (i == len(entries)-1) {
			if err := pendingUpdates.Apply(ctx, t.worker.analysis); err != nil {
				if err == UpdateRaceErr {
					// Our update raced with another update.
					// This is retriable if we re-read the chunk again.
					err = transient.Tag.Apply(err)
				}
				return false, err
			}
			pendingUpdates = NewPendingUpdates(ctx)

			// Advance our position only on successful commit.
			t.currentChunkID = entry.ChunkID

			if err := t.calculateAndReportProgress(ctx); err != nil {
				return false, err
			}
		}
	}

	// More to do.
	return false, nil
}

// calculateAndReportProgress reports progress on the shard, based on the current
// value of t.currentChunkID. It can only be used to report interim progress (it
// will never report a progress value of 1000).
func (t *taskContext) calculateAndReportProgress(ctx context.Context) (err error) {
	// Manage contention on the ReclusteringRun row by only periodically
	// reporting progress.
	if clock.Now(ctx).After(t.nextReportDue) {
		progress, err := calculateProgress(t.task, t.currentChunkID)
		if err != nil {
			return errors.Annotate(err, "calculate progress").Err()
		}

		err = t.updateProgress(ctx, progress)
		if err != nil {
			return err
		}
		t.nextReportDue = t.nextReportDue.Add(ProgressInterval)
	}
	return nil
}

// updateProgress sets progress on the shard.
func (t *taskContext) updateProgress(ctx context.Context, value int) (err error) {
	ctx, s := tracing.Start(ctx, "go.chromium.org/luci/analysis/internal/clustering/reclustering.updateProgress")
	defer func() { tracing.End(s, err) }()

	_, err = span.ReadWriteTransaction(ctx, func(ctx context.Context) error {
		err = shards.UpdateProgress(ctx, t.task.ShardNumber, t.task.AttemptTime.AsTime(), value)
		if err != nil {
			return errors.Annotate(err, "update progress").Err()
		}
		return nil
	})
	if err != nil {
		if status.Code(err) == codes.NotFound {
			// If the row for the shard has been deleted (i.e. because
			// we have overrun the end of our reclustering run), drop
			// the progress update.
			return nil
		}
		return err
	}
	return nil
}

// calculateProgress calculates the progress of the worker through the task.
// Progress is the proportion of the keyspace re-clustered, as a value between
// 0 and 1000 (i.e. 0 = 0%, 1000 = 100.0%).
// 1000 is never returned by this method as the value passed is the nextChunkID
// (i.e. the next chunkID to re-cluster), not the last completed chunk ID,
// which implies progress is not complete.
func calculateProgress(task *taskspb.ReclusterChunks, nextChunkID string) (int, error) {
	nextID, err := chunkIDAsBigInt(nextChunkID)
	if err != nil {
		return 0, err
	}
	startID, err := chunkIDAsBigInt(task.StartChunkId)
	if err != nil {
		return 0, err
	}
	endID, err := chunkIDAsBigInt(task.EndChunkId)
	if err != nil {
		return 0, err
	}
	if startID.Cmp(endID) >= 0 {
		return 0, fmt.Errorf("end chunk ID %q is before or equal to start %q", task.EndChunkId, task.StartChunkId)
	}
	if nextID.Cmp(startID) <= 0 {
		// Start is exclusive, not inclusive.
		return 0, fmt.Errorf("next chunk ID %q is before or equal to start %q", nextChunkID, task.StartChunkId)
	}
	if nextID.Cmp(endID) > 0 {
		return 0, fmt.Errorf("next chunk ID %q is after end %q", nextChunkID, task.EndChunkId)
	}

	// progress = (((nextID - 1) - startID) * shards.MaxProgress) / (endID - startID)
	var numerator big.Int
	numerator.Sub(nextID, big.NewInt(1))
	numerator.Sub(&numerator, startID)
	numerator.Mul(&numerator, big.NewInt(shards.MaxProgress))

	var denominator big.Int
	denominator.Sub(endID, startID)

	var result big.Int
	result.Div(&numerator, &denominator)

	return int(result.Uint64()), nil
}
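// Worked example of the arithmetic above (illustrative values only, taking
// shards.MaxProgress to be 1000 as the doc comment implies): with a
// StartChunkId of "" (treated as -1 by chunkIDAsBigInt below), an EndChunkId
// that decodes to 15 and a nextChunkID that decodes to 8, progress is
// ((8-1) - (-1)) * 1000 / (15 - (-1)) = 8000 / 16 = 500, i.e. 50.0%.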
// chunkIDAsBigInt represents a 128-bit chunk ID
// (normally represented as 32 lowercase hexadecimal characters)
// as a big.Int.
func chunkIDAsBigInt(chunkID string) (*big.Int, error) {
	if chunkID == "" {
		// "" indicates start of table. This is one before
		// ID 00000 .... 00000.
		return big.NewInt(-1), nil
	}
	idBytes, err := hex.DecodeString(chunkID)
	if err != nil {
		return nil, err
	}
	id := big.NewInt(0)
	id.SetBytes(idBytes)
	return id, nil
}
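// For illustration, chunkIDAsBigInt maps the empty string to -1 (one before
// the first possible ID), the all-zero 32-character ID to 0, and the
// all-'f' 32-character ID to 2^128 - 1.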