go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/clustering/reclustering/worker.go

// Copyright 2022 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package reclustering

import (
	"context"
	"encoding/hex"
	"fmt"
	"math/big"
	"time"

	"go.opentelemetry.io/otel/attribute"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
	"google.golang.org/protobuf/types/known/timestamppb"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/retry"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/server/span"

	"go.chromium.org/luci/analysis/internal/clustering/algorithms"
	cpb "go.chromium.org/luci/analysis/internal/clustering/proto"
	"go.chromium.org/luci/analysis/internal/clustering/shards"
	"go.chromium.org/luci/analysis/internal/clustering/state"
	"go.chromium.org/luci/analysis/internal/config/compiledcfg"
	"go.chromium.org/luci/analysis/internal/tasks/taskspb"
	"go.chromium.org/luci/analysis/internal/tracing"
)

const (
	// batchSize is the number of chunks to read from Spanner at a time.
	batchSize = 10

	// TargetTaskDuration is the desired duration of a re-clustering task.
	// If a task completes before the reclustering run has completed, a
	// continuation task will be scheduled.
	//
	// Longer durations will incur lower task queuing/re-queueing overhead,
	// but limit the ability of autoscaling to move tasks between instances
	// in response to load.
	TargetTaskDuration = 2 * time.Second

	// ProgressInterval is the amount of time between progress updates.
	//
	// Note that this is the frequency at which updates should
	// be reported for a shard of work; individual tasks are usually
	// much shorter lived and consequently most will not report any progress
	// (unless it is time for the shard to report progress again).
	ProgressInterval = 5 * time.Second
)

// ChunkStore is the interface for the blob store archiving chunks of test
// results for later re-clustering.
type ChunkStore interface {
	// Get retrieves the chunk with the specified object ID and returns it.
	Get(ctx context.Context, project, objectID string) (*cpb.Chunk, error)
}

// Worker provides methods to process re-clustering tasks. It is safe
// for concurrent use by multiple goroutines.
type Worker struct {
	chunkStore ChunkStore
	analysis   Analysis
}

// NewWorker initialises a new Worker.
func NewWorker(chunkStore ChunkStore, analysis Analysis) *Worker {
	return &Worker{
		chunkStore: chunkStore,
		analysis:   analysis,
	}
}
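
// A minimal usage sketch (illustrative only; ctx, chunkStore, analysis and
// task are assumed to be supplied by the caller, e.g. a task queue handler):
//
//	worker := NewWorker(chunkStore, analysis)
//	continuation, err := worker.Do(ctx, task, TargetTaskDuration)
//	if err != nil {
//		// Handle or retry the task.
//	}
//	if continuation != nil {
//		// Schedule the continuation task to resume where this one left off.
//	}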

// taskContext provides objects relevant to working on a particular
// re-clustering task.
type taskContext struct {
	worker *Worker
	task   *taskspb.ReclusterChunks
	// nextReportDue is the time at which the next progress update is
	// due.
	nextReportDue time.Time
	// currentChunkID is the exclusive lower bound of the range
	// of ChunkIds still to re-cluster.
	currentChunkID string
}

// Do works on a re-clustering task for approximately duration, returning a
// continuation task (if the run end time has not been reached).
//
// Continuation tasks are used to better integrate with GAE autoscaling;
// autoscaling works best when tasks are relatively small (so that work
// can be moved between instances in real time).
func (w *Worker) Do(ctx context.Context, task *taskspb.ReclusterChunks, duration time.Duration) (*taskspb.ReclusterChunks, error) {
	if task.State == nil {
		return nil, errors.New("task does not have state")
	}
	if task.ShardNumber <= 0 {
		return nil, errors.New("task must have valid shard number")
	}
	if task.AlgorithmsVersion <= 0 {
		return nil, errors.New("task must have valid algorithms version")
	}

	runEndTime := task.AttemptTime.AsTime()

	if task.AlgorithmsVersion > algorithms.AlgorithmsVersion {
		return nil, fmt.Errorf("running out-of-date algorithms version (task requires %v, worker running %v)",
			task.AlgorithmsVersion, algorithms.AlgorithmsVersion)
	}

	tctx := &taskContext{
		worker:         w,
		task:           task,
		nextReportDue:  task.State.NextReportDue.AsTime(),
		currentChunkID: task.State.CurrentChunkId,
	}

	// softEndTime is the (soft) deadline for this task.
	softEndTime := clock.Now(ctx).Add(duration)
	if runEndTime.Before(softEndTime) {
		// Stop by the run end time.
		softEndTime = runEndTime
	}

	var done bool
	for clock.Now(ctx).Before(softEndTime) && !done {
		err := retry.Retry(ctx, transient.Only(retry.Default), func() error {
			// Stop (even mid-retry) once the run end time has passed, to
			// avoid getting stuck in a retry loop if we are running in
			// parallel with another worker.
			if !clock.Now(ctx).Before(runEndTime) {
				return nil
			}
			var err error
			done, err = tctx.recluster(ctx)
			return err
		}, nil)
		if err != nil {
			return nil, err
		}
	}

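	// A continuation task is only needed if we stopped because this task's
	// soft deadline elapsed (rather than the run ending) and there are
	// still chunks left to re-cluster.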
	var continuation *taskspb.ReclusterChunks
	if softEndTime.Before(runEndTime) && !done {
		continuation = &taskspb.ReclusterChunks{
			ShardNumber:       task.ShardNumber,
			Project:           task.Project,
			AttemptTime:       task.AttemptTime,
			StartChunkId:      task.StartChunkId,
			EndChunkId:        task.EndChunkId,
			AlgorithmsVersion: task.AlgorithmsVersion,
			RulesVersion:      task.RulesVersion,
			ConfigVersion:     task.ConfigVersion,
			State: &taskspb.ReclusterChunkState{
				CurrentChunkId: tctx.currentChunkID,
				NextReportDue:  timestamppb.New(tctx.nextReportDue),
			},
		}
	}
	return continuation, nil
}

// recluster tries to re-cluster some chunks, advancing currentChunkID
// as it succeeds. It returns 'true' if all chunks to be re-clustered by
// the reclustering task were completed.
func (t *taskContext) recluster(ctx context.Context) (done bool, err error) {
	ctx, s := tracing.Start(ctx, "go.chromium.org/luci/analysis/internal/clustering/reclustering.recluster",
		attribute.String("project", t.task.Project),
		attribute.String("currentChunkID", t.currentChunkID),
	)
	defer func() { tracing.End(s, err) }()

	readOpts := state.ReadNextOptions{
		StartChunkID:      t.currentChunkID,
		EndChunkID:        t.task.EndChunkId,
		AlgorithmsVersion: t.task.AlgorithmsVersion,
		ConfigVersion:     t.task.ConfigVersion.AsTime(),
		RulesVersion:      t.task.RulesVersion.AsTime(),
	}
	entries, err := state.ReadNextN(span.Single(ctx), t.task.Project, readOpts, batchSize)
	if err != nil {
		return false, errors.Annotate(err, "read next chunk state").Err()
	}
	if len(entries) == 0 {
		// We have finished re-clustering.
		err = t.updateProgress(ctx, shards.MaxProgress)
		if err != nil {
			return true, err
		}
		return true, nil
	}

	pendingUpdates := NewPendingUpdates(ctx)

	for i, entry := range entries {
		// Read the test results from GCS.
		chunk, err := t.worker.chunkStore.Get(ctx, t.task.Project, entry.ObjectID)
		if err != nil {
			return false, errors.Annotate(err, "read chunk").Err()
		}

		// Obtain a recent ruleset of at least RulesVersion.
		ruleset, err := Ruleset(ctx, t.task.Project, t.task.RulesVersion.AsTime())
		if err != nil {
			return false, errors.Annotate(err, "obtain ruleset").Err()
		}

		// Obtain a recent configuration of at least ConfigVersion.
		cfg, err := compiledcfg.Project(ctx, t.task.Project, t.task.ConfigVersion.AsTime())
		if err != nil {
			return false, errors.Annotate(err, "obtain config").Err()
		}

		// Re-cluster the test results in Spanner, then export
		// the re-clustering to BigQuery for analysis.
		update, err := PrepareUpdate(ctx, ruleset, cfg, chunk, entry)
		if err != nil {
			return false, errors.Annotate(err, "re-cluster chunk").Err()
		}

		pendingUpdates.Add(update)

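		// Apply accumulated updates periodically, and always for the last
		// entry in the batch, so that no pending work is left uncommitted
		// when the loop exits.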
		if pendingUpdates.ShouldApply(ctx) || (i == len(entries)-1) {
			if err := pendingUpdates.Apply(ctx, t.worker.analysis); err != nil {
				if err == UpdateRaceErr {
					// Our update raced with another update.
					// This is retriable if we re-read the chunk.
					err = transient.Tag.Apply(err)
				}
				return false, err
			}
			pendingUpdates = NewPendingUpdates(ctx)

			// Advance our position only on successful commit.
			t.currentChunkID = entry.ChunkID

			if err := t.calculateAndReportProgress(ctx); err != nil {
				return false, err
			}
		}
	}

	// More to do.
	return false, nil
}

// calculateAndReportProgress reports progress on the shard, based on the current
// value of t.currentChunkID. It can only be used to report interim progress (it
// will never report a progress value of 1000).
func (t *taskContext) calculateAndReportProgress(ctx context.Context) (err error) {
	// Manage contention on the ReclusteringRun row by only periodically
	// reporting progress.
	if clock.Now(ctx).After(t.nextReportDue) {
		progress, err := calculateProgress(t.task, t.currentChunkID)
		if err != nil {
			return errors.Annotate(err, "calculate progress").Err()
		}

		err = t.updateProgress(ctx, progress)
		if err != nil {
			return err
		}
		t.nextReportDue = t.nextReportDue.Add(ProgressInterval)
	}
	return nil
}

// updateProgress sets progress on the shard.
func (t *taskContext) updateProgress(ctx context.Context, value int) (err error) {
	ctx, s := tracing.Start(ctx, "go.chromium.org/luci/analysis/internal/clustering/reclustering.updateProgress")
	defer func() { tracing.End(s, err) }()

	_, err = span.ReadWriteTransaction(ctx, func(ctx context.Context) error {
		err = shards.UpdateProgress(ctx, t.task.ShardNumber, t.task.AttemptTime.AsTime(), value)
		if err != nil {
			return errors.Annotate(err, "update progress").Err()
		}
		return nil
	})
	if err != nil {
		if status.Code(err) == codes.NotFound {
			// If the row for the shard has been deleted (i.e. because
			// we have overrun the end of our reclustering run), drop
			// the progress update.
			return nil
		}
		return err
	}
	return nil
}

// calculateProgress calculates the progress of the worker through the task.
// Progress is the proportion of the keyspace re-clustered, as a value between
// 0 and 1000 (i.e. 0 = 0%, 1000 = 100.0%).
// This method never returns 1000, because the value passed is nextChunkID
// (i.e. the next chunk ID to re-cluster), not the last completed chunk ID,
// which implies there is always at least one chunk still to re-cluster.
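//
// For example (illustrative, using short chunk IDs for brevity): for a task
// with StartChunkId "" (treated as one before chunk ID zero) and EndChunkId
// "09", a nextChunkID of "05" yields ((5-1) - (-1)) * 1000 / (9 - (-1)) = 500,
// i.e. 50.0% progress.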
func calculateProgress(task *taskspb.ReclusterChunks, nextChunkID string) (int, error) {
	nextID, err := chunkIDAsBigInt(nextChunkID)
	if err != nil {
		return 0, err
	}
	startID, err := chunkIDAsBigInt(task.StartChunkId)
	if err != nil {
		return 0, err
	}
	endID, err := chunkIDAsBigInt(task.EndChunkId)
	if err != nil {
		return 0, err
	}
	if startID.Cmp(endID) >= 0 {
		return 0, fmt.Errorf("end chunk ID %q is before or equal to start %q", task.EndChunkId, task.StartChunkId)
	}
	if nextID.Cmp(startID) <= 0 {
		// Start is exclusive, not inclusive.
		return 0, fmt.Errorf("next chunk ID %q is before or equal to start %q", nextChunkID, task.StartChunkId)
	}
	if nextID.Cmp(endID) > 0 {
		return 0, fmt.Errorf("next chunk ID %q is after end %q", nextChunkID, task.EndChunkId)
	}

	// progress = (((nextID - 1) - startID) * shards.MaxProgress) / (endID - startID)
	var numerator big.Int
	numerator.Sub(nextID, big.NewInt(1))
	numerator.Sub(&numerator, startID)
	numerator.Mul(&numerator, big.NewInt(shards.MaxProgress))

	var denominator big.Int
	denominator.Sub(endID, startID)

	var result big.Int
	result.Div(&numerator, &denominator)

	return int(result.Uint64()), nil
}

// chunkIDAsBigInt represents a 128-bit chunk ID
// (normally represented as 32 lowercase hexadecimal characters)
// as a big.Int.
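//
// For example (illustrative): the empty string maps to -1 (one before the
// start of the keyspace), and a short ID like "ff" would decode to 255.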
func chunkIDAsBigInt(chunkID string) (*big.Int, error) {
	if chunkID == "" {
		// "" indicates start of table. This is one before
		// ID 00000 .... 00000.
		return big.NewInt(-1), nil
	}
	idBytes, err := hex.DecodeString(chunkID)
	if err != nil {
		return nil, err
	}
	id := big.NewInt(0)
	id.SetBytes(idBytes)
	return id, nil
}