go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/services/reclustering/reclustering.go (about)

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package reclustering
    16  
    17  import (
    18  	"context"
    19  	"crypto/rand"
    20  	"encoding/hex"
    21  	"fmt"
    22  	"time"
    23  
    24  	"google.golang.org/protobuf/proto"
    25  
    26  	"go.chromium.org/luci/common/errors"
    27  	"go.chromium.org/luci/common/logging"
    28  	"go.chromium.org/luci/common/retry"
    29  	"go.chromium.org/luci/common/retry/transient"
    30  	"go.chromium.org/luci/server"
    31  	"go.chromium.org/luci/server/tq"
    32  
    33  	"go.chromium.org/luci/analysis/internal/analysis"
    34  	"go.chromium.org/luci/analysis/internal/analysis/clusteredfailures"
    35  	"go.chromium.org/luci/analysis/internal/clustering/chunkstore"
    36  	"go.chromium.org/luci/analysis/internal/clustering/reclustering"
    37  	"go.chromium.org/luci/analysis/internal/config"
    38  	"go.chromium.org/luci/analysis/internal/tasks/taskspb"
    39  )
    40  
    41  const (
    42  	taskClass = "reclustering"
    43  	queue     = "reclustering"
    44  )
    45  
    46  var tc = tq.RegisterTaskClass(tq.TaskClass{
    47  	ID:        taskClass,
    48  	Prototype: &taskspb.ReclusterChunks{},
    49  	Queue:     queue,
    50  	Kind:      tq.NonTransactional,
    51  })
    52  
    53  // RegisterTaskHandler registers the handler for reclustering tasks.
    54  func RegisterTaskHandler(srv *server.Server) error {
    55  	ctx := srv.Context
    56  	cfg, err := config.Get(ctx)
    57  	if err != nil {
    58  		return err
    59  	}
    60  	chunkStore, err := chunkstore.NewClient(ctx, cfg.ChunkGcsBucket)
    61  	if err != nil {
    62  		return err
    63  	}
    64  	srv.RegisterCleanup(func(context.Context) {
    65  		chunkStore.Close()
    66  	})
    67  
    68  	cf, err := clusteredfailures.NewClient(ctx, srv.Options.CloudProject)
    69  	if err != nil {
    70  		return err
    71  	}
    72  	srv.RegisterCleanup(func(context.Context) {
    73  		cf.Close()
    74  	})
    75  
    76  	analysis := analysis.NewClusteringHandler(cf)
    77  	worker := reclustering.NewWorker(chunkStore, analysis)
    78  
    79  	handler := func(ctx context.Context, payload proto.Message) error {
    80  		task := payload.(*taskspb.ReclusterChunks)
    81  		return reclusterTestResults(ctx, worker, task)
    82  	}
    83  	tc.AttachHandler(handler)
    84  	return nil
    85  }
    86  
    87  // Schedule enqueues a task to recluster a range of chunks in a LUCI
    88  // Project.
    89  func Schedule(ctx context.Context, task *taskspb.ReclusterChunks) error {
    90  	title := fmt.Sprintf("%s-%s-shard-%v", task.Project, task.AttemptTime.AsTime().Format("20060102-150405"), task.EndChunkId)
    91  
    92  	dedupKey, err := randomDeduplicationKey()
    93  	if err != nil {
    94  		return errors.Annotate(err, "obtain deduplication key").Err()
    95  	}
    96  	taskProto := &tq.Task{
    97  		Title: title,
    98  		// Copy the task to avoid the caller retaining an alias to
    99  		// the task proto passed to tq.AddTask.
   100  		Payload: proto.Clone(task).(*taskspb.ReclusterChunks),
   101  		// Use a deduplication key to avoid retried task creations
   102  		// accidentally resulting in two tasks being created, in case
   103  		// of failure to receive CreateTask response.
   104  		// Note that this is only a best-effort deduplication, the
   105  		// task should still assume the possibility of multiple
   106  		// tasks being created and avoid data correctness issues
   107  		// in this case.
   108  		DeduplicationKey: dedupKey,
   109  	}
   110  
   111  	// After 50 seconds, task creation is probably pointless as
   112  	// each reclustering run takes 1 minute.
   113  	ctx, cancel := context.WithTimeout(ctx, 50*time.Second)
   114  	defer cancel()
   115  
   116  	// Manually retry transient errors. The Cloud Tasks client
   117  	// does not automatically retry CreateTask RPCs, presumably
   118  	// as the RPC does not offer strong guarantees against multiple
   119  	// task creation in case of retry.
   120  	err = retry.Retry(ctx, transient.Only(retry.Default), func() error {
   121  		err := tq.AddTask(ctx, taskProto)
   122  		if err != nil {
   123  			return errors.Annotate(err, "create task").Err()
   124  		}
   125  		return nil
   126  	}, nil)
   127  	return err
   128  }
   129  
   130  func randomDeduplicationKey() (string, error) {
   131  	var b [16]byte
   132  	_, err := rand.Read(b[:])
   133  	if err != nil {
   134  		return "", errors.Annotate(err, "read random bytes").Err()
   135  	}
   136  	return hex.EncodeToString(b[:]), nil
   137  }
   138  
   139  func reclusterTestResults(ctx context.Context, worker *reclustering.Worker, task *taskspb.ReclusterChunks) error {
   140  	next, err := worker.Do(ctx, task, reclustering.TargetTaskDuration)
   141  	if err != nil {
   142  		logging.Errorf(ctx, "Error re-clustering: %s", err)
   143  		return err
   144  	}
   145  	if next != nil {
   146  		if err := Schedule(ctx, next); err != nil {
   147  			logging.Errorf(ctx, "Error scheduling continuation: %s", err)
   148  			return err
   149  		}
   150  	}
   151  	return nil
   152  }