go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/services/reclustering/reclustering.go (about) 1 // Copyright 2022 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package reclustering 16 17 import ( 18 "context" 19 "crypto/rand" 20 "encoding/hex" 21 "fmt" 22 "time" 23 24 "google.golang.org/protobuf/proto" 25 26 "go.chromium.org/luci/common/errors" 27 "go.chromium.org/luci/common/logging" 28 "go.chromium.org/luci/common/retry" 29 "go.chromium.org/luci/common/retry/transient" 30 "go.chromium.org/luci/server" 31 "go.chromium.org/luci/server/tq" 32 33 "go.chromium.org/luci/analysis/internal/analysis" 34 "go.chromium.org/luci/analysis/internal/analysis/clusteredfailures" 35 "go.chromium.org/luci/analysis/internal/clustering/chunkstore" 36 "go.chromium.org/luci/analysis/internal/clustering/reclustering" 37 "go.chromium.org/luci/analysis/internal/config" 38 "go.chromium.org/luci/analysis/internal/tasks/taskspb" 39 ) 40 41 const ( 42 taskClass = "reclustering" 43 queue = "reclustering" 44 ) 45 46 var tc = tq.RegisterTaskClass(tq.TaskClass{ 47 ID: taskClass, 48 Prototype: &taskspb.ReclusterChunks{}, 49 Queue: queue, 50 Kind: tq.NonTransactional, 51 }) 52 53 // RegisterTaskHandler registers the handler for reclustering tasks. 54 func RegisterTaskHandler(srv *server.Server) error { 55 ctx := srv.Context 56 cfg, err := config.Get(ctx) 57 if err != nil { 58 return err 59 } 60 chunkStore, err := chunkstore.NewClient(ctx, cfg.ChunkGcsBucket) 61 if err != nil { 62 return err 63 } 64 srv.RegisterCleanup(func(context.Context) { 65 chunkStore.Close() 66 }) 67 68 cf, err := clusteredfailures.NewClient(ctx, srv.Options.CloudProject) 69 if err != nil { 70 return err 71 } 72 srv.RegisterCleanup(func(context.Context) { 73 cf.Close() 74 }) 75 76 analysis := analysis.NewClusteringHandler(cf) 77 worker := reclustering.NewWorker(chunkStore, analysis) 78 79 handler := func(ctx context.Context, payload proto.Message) error { 80 task := payload.(*taskspb.ReclusterChunks) 81 return reclusterTestResults(ctx, worker, task) 82 } 83 tc.AttachHandler(handler) 84 return nil 85 } 86 87 // Schedule enqueues a task to recluster a range of chunks in a LUCI 88 // Project. 89 func Schedule(ctx context.Context, task *taskspb.ReclusterChunks) error { 90 title := fmt.Sprintf("%s-%s-shard-%v", task.Project, task.AttemptTime.AsTime().Format("20060102-150405"), task.EndChunkId) 91 92 dedupKey, err := randomDeduplicationKey() 93 if err != nil { 94 return errors.Annotate(err, "obtain deduplication key").Err() 95 } 96 taskProto := &tq.Task{ 97 Title: title, 98 // Copy the task to avoid the caller retaining an alias to 99 // the task proto passed to tq.AddTask. 100 Payload: proto.Clone(task).(*taskspb.ReclusterChunks), 101 // Use a deduplication key to avoid retried task creations 102 // accidentally resulting in two tasks being created, in case 103 // of failure to receive CreateTask response. 104 // Note that this is only a best-effort deduplication, the 105 // task should still assume the possibility of multiple 106 // tasks being created and avoid data correctness issues 107 // in this case. 108 DeduplicationKey: dedupKey, 109 } 110 111 // After 50 seconds, task creation is probably pointless as 112 // each reclustering run takes 1 minute. 113 ctx, cancel := context.WithTimeout(ctx, 50*time.Second) 114 defer cancel() 115 116 // Manually retry transient errors. The Cloud Tasks client 117 // does not automatically retry CreateTask RPCs, presumably 118 // as the RPC does not offer strong guarantees against multiple 119 // task creation in case of retry. 120 err = retry.Retry(ctx, transient.Only(retry.Default), func() error { 121 err := tq.AddTask(ctx, taskProto) 122 if err != nil { 123 return errors.Annotate(err, "create task").Err() 124 } 125 return nil 126 }, nil) 127 return err 128 } 129 130 func randomDeduplicationKey() (string, error) { 131 var b [16]byte 132 _, err := rand.Read(b[:]) 133 if err != nil { 134 return "", errors.Annotate(err, "read random bytes").Err() 135 } 136 return hex.EncodeToString(b[:]), nil 137 } 138 139 func reclusterTestResults(ctx context.Context, worker *reclustering.Worker, task *taskspb.ReclusterChunks) error { 140 next, err := worker.Do(ctx, task, reclustering.TargetTaskDuration) 141 if err != nil { 142 logging.Errorf(ctx, "Error re-clustering: %s", err) 143 return err 144 } 145 if next != nil { 146 if err := Schedule(ctx, next); err != nil { 147 logging.Errorf(ctx, "Error scheduling continuation: %s", err) 148 return err 149 } 150 } 151 return nil 152 }