go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/logdog/server/cmd/logdog_archivist/main.go

// Copyright 2016 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"context"
	"flag"
	"time"

	cloudlogging "cloud.google.com/go/logging"
	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
	"google.golang.org/api/option"
	"google.golang.org/grpc"
	"google.golang.org/protobuf/types/known/durationpb"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/gcloud/gs"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/common/sync/dispatcher"
	"go.chromium.org/luci/common/sync/dispatcher/buffer"
	"go.chromium.org/luci/common/tsmon/distribution"
	"go.chromium.org/luci/common/tsmon/field"
	"go.chromium.org/luci/common/tsmon/metric"
	"go.chromium.org/luci/common/tsmon/types"
	"go.chromium.org/luci/grpc/grpcmon"
	"go.chromium.org/luci/server"
	"go.chromium.org/luci/server/auth"

	logdog "go.chromium.org/luci/logdog/api/endpoints/coordinator/services/v1"
	"go.chromium.org/luci/logdog/server/archivist"
	"go.chromium.org/luci/logdog/server/service"
)

var (
	// leaseRetryParams configures retries of LeaseArchiveTasks RPCs: an
	// exponential backoff that never gives up (Retries: -1 means unlimited).
	leaseRetryParams = func() retry.Iterator {
		return &retry.ExponentialBackoff{
			Limited: retry.Limited{
				Delay:   time.Second,
				Retries: -1,
			},
			Multiplier: 1.25,
			MaxDelay:   time.Minute * 10,
		}
	}

	// ackChannelOptions configures the dispatcher channel that collects
	// completed tasks and batches them into DeleteArchiveTasks RPCs.
	ackChannelOptions = &dispatcher.Options{
		Buffer: buffer.Options{
			MaxLeases:     10,
			BatchItemsMax: 500,
			BatchAgeMax:   10 * time.Minute,
			FullBehavior: &buffer.BlockNewItems{
				MaxItems: 10 * 500,
			},
			Retry: func() retry.Iterator {
				return &retry.ExponentialBackoff{
					Limited: retry.Limited{
						Delay:   time.Second,
						Retries: 10,
					},
					Multiplier: 1.25,
					MaxDelay:   time.Minute * 10,
				}
			},
		},
	}

	// mkJobChannelOptions returns options for the dispatcher channel that
	// feeds leased tasks to a pool of maxWorkers archival workers.
	mkJobChannelOptions = func(maxWorkers int) *dispatcher.Options {
		return &dispatcher.Options{
			Buffer: buffer.Options{
				MaxLeases:     maxWorkers,
				BatchItemsMax: 1,
				FullBehavior: &buffer.BlockNewItems{
					// Currently (2020Q2) it takes ~4s on average to process a task, and
					// ~60s to lease a new batch. We never want to starve our job workers
					// here, so we need at least 60s worth of tasks. We add 20% margin,
					// just to be safe.
					MaxItems: int(((60. / 4.) * float64(maxWorkers)) * 1.2),
				},
			},
		}
	}
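	// Illustrative sizing (a sketch, not from the original source): with a
	// hypothetical maxWorkers of 10, the formula above yields
	// MaxItems = int((60/4) * 10 * 1.2) = 180 buffered jobs, i.e. enough
	// queued work to keep ten workers busy through one ~60s lease cycle,
	// plus the 20% safety margin.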

	// maxSleepTime is the max amount of time (in seconds) to sleep in-between
	// lease attempts that return no work.
	maxSleepTime = 32

	// tsTaskProcessingTime measures the amount of time spent processing a single
	// task.
	//
	// The "consumed" field is true if the underlying task was consumed and
	// false if it was not.
	tsTaskProcessingTime = metric.NewCumulativeDistribution("logdog/archivist/task_processing_time_ms_ng",
		"The amount of time (in milliseconds) that a single task takes to process in the new pipeline.",
		&types.MetricMetadata{Units: types.Milliseconds},
		distribution.DefaultBucketer,
		field.Bool("consumed"))

	tsLoopCycleTime = metric.NewCumulativeDistribution("logdog/archivist/loop_cycle_time_ms",
		"The amount of time a single batch of leases takes to process.",
		&types.MetricMetadata{Units: types.Milliseconds},
		distribution.DefaultBucketer)

	tsLeaseCount = metric.NewCounter("logdog/archivist/tasks_leased",
		"Number of tasks leased.",
		nil)

	tsNackCount = metric.NewCounter("logdog/archivist/tasks_not_acked",
		"Number of tasks leased but failed.",
		nil)

	tsAckCount = metric.NewCounter("logdog/archivist/tasks_acked",
		"Number of tasks successfully completed and acked.",
		nil)
)

// runForever runs the archivist loop forever.
func runForever(ctx context.Context, ar *archivist.Archivist, flags *CommandLineFlags) {
	type archiveJob struct {
		deadline time.Time
		task     *logdog.ArchiveTask
	}

	ackChan, err := dispatcher.NewChannel(ctx, ackChannelOptions, func(batch *buffer.Batch) error {
		// Build the DeleteRequest once and cache it in batch.Meta, so retries
		// of the same batch reuse it instead of rebuilding the task list.
		var req *logdog.DeleteRequest
		if batch.Meta != nil {
			req = batch.Meta.(*logdog.DeleteRequest)
		} else {
			tasks := make([]*logdog.ArchiveTask, len(batch.Data))
			for i, datum := range batch.Data {
				tasks[i] = datum.Item.(*logdog.ArchiveTask)
				batch.Data[i].Item = nil
			}
			req = &logdog.DeleteRequest{Tasks: tasks}
			batch.Meta = req
		}
		_, err := ar.Service.DeleteArchiveTasks(ctx, req)
		if err == nil {
			tsAckCount.Add(ctx, int64(len(req.Tasks)))
		}
		return transient.Tag.Apply(err)
	})
	if err != nil {
		panic(err) // only occurs if Options is invalid
	}
	defer func() {
		logging.Infof(ctx, "draining ACK channel")
		ackChan.CloseAndDrain(ctx)
		logging.Infof(ctx, "ACK channel drained")
	}()

	jobChanOpts := mkJobChannelOptions(flags.MaxConcurrentTasks)
	jobChan, err := dispatcher.NewChannel(ctx, jobChanOpts, func(data *buffer.Batch) error {
		job := data.Data[0].Item.(*archiveJob)

		nc, cancel := context.WithDeadline(ctx, job.deadline)
		defer cancel()

		nc = logging.SetFields(nc, logging.Fields{
			"project": job.task.Project,
			"id":      job.task.Id,
		})

		startTime := clock.Now(nc)
		err := ar.ArchiveTask(nc, job.task)
		duration := clock.Now(nc).Sub(startTime)

		tsTaskProcessingTime.Add(ctx, float64(duration.Nanoseconds())/1000000, err == nil)

		if err == nil {
			select {
			case ackChan.C <- job.task:
			case <-ctx.Done():
				logging.Errorf(nc, "Failed to ACK task %v due to context: %s", job.task, ctx.Err())
			}
		} else {
			tsNackCount.Add(ctx, 1)
			logging.Errorf(nc, "Failed to archive task %v: %s", job.task, err)
		}

		return nil
	})
	if err != nil {
		panic(err) // only occurs if Options is invalid
	}
	defer func() {
		logging.Infof(ctx, "Job channel draining")
		jobChan.CloseAndDrain(ctx)
		logging.Infof(ctx, "Job channel drained")
	}()
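	// Dataflow sketch (descriptive comment only): the loop below leases a
	// batch of tasks from the Coordinator and pushes each task into jobChan,
	// where up to flags.MaxConcurrentTasks workers run ArchiveTask on it.
	// Workers forward successes to ackChan, which batches them into
	// DeleteArchiveTasks RPCs. Failed tasks are never ACKed, so their leases
	// are left to expire and the tasks can be leased again later.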

	// now we spin forever, pushing items into jobChan.
	sleepTime := 1
	var previousCycle time.Time
	for ctx.Err() == nil {
		var tasks *logdog.LeaseResponse
		var deadline time.Time

		err := retry.Retry(ctx, leaseRetryParams, func() (err error) {
			logging.Infof(ctx, "Leasing max %d tasks for %s", flags.LeaseBatchSize, flags.LeaseTime)
			deadline = clock.Now(ctx).Add(flags.LeaseTime)
			tasks, err = ar.Service.LeaseArchiveTasks(ctx, &logdog.LeaseRequest{
				MaxTasks:  int64(flags.LeaseBatchSize),
				LeaseTime: durationpb.New(flags.LeaseTime),
			})
			return
		}, retry.LogCallback(ctx, "LeaseArchiveTasks"))
		if ctx.Err() != nil {
			logging.Infof(ctx, "lease thread got context err in RPC: %s", ctx.Err())
			break
		}
		if err != nil {
			panic("impossible: infinite retry stopped: " + err.Error())
		}

		now := clock.Now(ctx)
		if !previousCycle.IsZero() {
			tsLoopCycleTime.Add(ctx, float64(now.Sub(previousCycle).Nanoseconds()/1000000))
		}
		previousCycle = now

		if len(tasks.Tasks) == 0 {
			// No work: back off exponentially, capped at maxSleepTime.
			sleepTime *= 2
			if sleepTime > maxSleepTime {
				sleepTime = maxSleepTime
			}
			logging.Infof(ctx, "no work to do, sleeping for %d seconds", sleepTime)
			clock.Sleep(ctx, time.Duration(sleepTime)*time.Second)
			continue
		}
		tsLeaseCount.Add(ctx, int64(len(tasks.Tasks)))
		sleepTime = 1

	dispatch:
		for _, task := range tasks.Tasks {
			select {
			case jobChan.C <- &archiveJob{deadline, task}:
			case <-ctx.Done():
				logging.Infof(ctx, "lease thread got context err in jobChan push: %s", ctx.Err())
				// A bare "break" here would only exit the select statement,
				// so break the task loop explicitly; the outer loop then
				// observes ctx.Err() and terminates.
				break dispatch
			}
		}
	}
}

// googleStorageClient returns an authenticated Google Storage client instance.
func googleStorageClient(ctx context.Context, luciProject string) (gs.Client, error) {
	// TODO(vadimsh): Switch to AsProject + WithProject(project) once
	// we are ready to roll out project scoped service accounts in Logdog.
	tr, err := auth.GetRPCTransport(ctx, auth.AsSelf, auth.WithScopes(auth.CloudOAuthScopes...))
	if err != nil {
		return nil, errors.Annotate(err, "failed to get the authenticating transport").Err()
	}
	client, err := gs.NewProdClient(ctx, tr)
	if err != nil {
		return nil, errors.Annotate(err, "failed to create Google Storage client").Err()
	}
	return client, nil
}

// cloudLoggingClient returns an authenticated Cloud Logging client instance.
func cloudLoggingClient(ctx context.Context, luciProject, cloudProject string, onError func(err error)) (archivist.CLClient, error) {
	cred, err := auth.GetPerRPCCredentials(
		ctx, auth.AsProject,
		auth.WithScopes(auth.CloudOAuthScopes...),
		auth.WithProject(luciProject),
	)
	if err != nil {
		return nil, errors.Annotate(err, "failed to get per RPC credentials").Err()
	}
	cl, err := cloudlogging.NewClient(
		ctx, cloudProject,
		option.WithGRPCDialOption(grpc.WithPerRPCCredentials(cred)),
		option.WithGRPCDialOption(grpc.WithStatsHandler(&grpcmon.ClientRPCStatsMonitor{})),
		option.WithGRPCDialOption(grpc.WithUnaryInterceptor(otelgrpc.UnaryClientInterceptor())),
		option.WithGRPCDialOption(grpc.WithStreamInterceptor(otelgrpc.StreamClientInterceptor())),
	)
	if err != nil {
		return nil, err
	}
	cl.OnError = onError
	return cl, nil
}
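
// Note the asymmetry between the two factories above: googleStorageClient
// still authenticates as the service itself (auth.AsSelf; see the TODO),
// while cloudLoggingClient already uses project-scoped credentials
// (auth.AsProject plus auth.WithProject), so Cloud Logging writes are
// attributed to the individual LUCI project.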

// Entry point.
func main() {
	flags := DefaultCommandLineFlags()
	flags.Register(flag.CommandLine)

	cfg := service.MainCfg{BigTableAppProfile: "archivist"}
	service.Main(cfg, func(srv *server.Server, impl *service.Implementations) error {
		if err := flags.Validate(); err != nil {
			return err
		}

		// Initialize the archivist.
		ar := &archivist.Archivist{
			Service:         impl.Coordinator,
			SettingsLoader:  GetSettingsLoader(srv.Options.CloudProject, &flags),
			Storage:         impl.Storage,
			GSClientFactory: googleStorageClient,
			CLClientFactory: cloudLoggingClient,
		}

		// Run the archivist loop until the server closes.
		srv.RunInBackground("archivist", func(ctx context.Context) {
			runForever(ctx, ar, &flags)
		})
		return nil
	})
}