// Copyright 2016 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"context"
	"flag"
	"time"

	cloudlogging "cloud.google.com/go/logging"
	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
	"google.golang.org/api/option"
	"google.golang.org/grpc"
	"google.golang.org/protobuf/types/known/durationpb"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/gcloud/gs"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/common/sync/dispatcher"
	"go.chromium.org/luci/common/sync/dispatcher/buffer"
	"go.chromium.org/luci/common/tsmon/distribution"
	"go.chromium.org/luci/common/tsmon/field"
	"go.chromium.org/luci/common/tsmon/metric"
	"go.chromium.org/luci/common/tsmon/types"
	"go.chromium.org/luci/grpc/grpcmon"
	"go.chromium.org/luci/server"
	"go.chromium.org/luci/server/auth"

	logdog "go.chromium.org/luci/logdog/api/endpoints/coordinator/services/v1"
	"go.chromium.org/luci/logdog/server/archivist"
	"go.chromium.org/luci/logdog/server/service"
)

var (
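	// leaseRetryParams configures how LeaseArchiveTasks calls are retried.
	//
	// Retries is set to -1 so the iterator never gives up on its own: the
	// lease loop in runForever keeps retrying with exponential backoff
	// (capped at 10 minutes) until its context is canceled. The
	// "impossible: infinite retry stopped" panic there relies on this.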
	leaseRetryParams = func() retry.Iterator {
		return &retry.ExponentialBackoff{
			Limited: retry.Limited{
				Delay:   time.Second,
				Retries: -1,
			},
			Multiplier: 1.25,
			MaxDelay:   time.Minute * 10,
		}
	}

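	// ackChannelOptions batches successfully archived tasks into
	// DeleteArchiveTasks RPCs: up to 500 task IDs per batch, at most 10
	// batches in flight, and at most 10*500 pending ACKs before the job
	// workers block on the channel.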
	ackChannelOptions = &dispatcher.Options{
		Buffer: buffer.Options{
			MaxLeases:     10,
			BatchItemsMax: 500,
			BatchAgeMax:   10 * time.Minute,
			FullBehavior: &buffer.BlockNewItems{
				MaxItems: 10 * 500,
			},
			Retry: func() retry.Iterator {
				return &retry.ExponentialBackoff{
					Limited: retry.Limited{
						Delay:   time.Second,
						Retries: 10,
					},
					Multiplier: 1.25,
					MaxDelay:   time.Minute * 10,
				}
			},
		},
	}

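	// mkJobChannelOptions sizes the job buffer from the worker count. As a
	// rough worked example (using the ~4s/task and ~60s/lease figures from
	// the comment below), maxWorkers == 10 yields
	// MaxItems == int((60/4)*10*1.2) == 180 queued tasks before
	// BlockNewItems starts pushing back on the lease loop.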
	mkJobChannelOptions = func(maxWorkers int) *dispatcher.Options {
		return &dispatcher.Options{
			Buffer: buffer.Options{
				MaxLeases:     maxWorkers,
				BatchItemsMax: 1,
				FullBehavior: &buffer.BlockNewItems{
					// Currently (2020Q2) it takes ~4s on average to process a task, and
					// ~60s to lease a new batch. We never want to starve our job workers
					// here, so we need at least 60s worth of tasks. We add 20% margin,
					// just to be safe.
					MaxItems: int(((60. / 4.) * float64(maxWorkers)) * 1.2),
				},
			},
		}
	}

	// maxSleepTime is the maximum amount of time to sleep between lease
	// attempts that return no work, in seconds.
	maxSleepTime = 32

	// tsTaskProcessingTime measures the amount of time spent processing a single
	// task.
	//
	// The "consumed" field is true if the underlying task was consumed and
	// false if it was not.
	tsTaskProcessingTime = metric.NewCumulativeDistribution("logdog/archivist/task_processing_time_ms_ng",
		"The amount of time (in milliseconds) that a single task takes to process in the new pipeline.",
		&types.MetricMetadata{Units: types.Milliseconds},
		distribution.DefaultBucketer,
		field.Bool("consumed"))

	tsLoopCycleTime = metric.NewCumulativeDistribution("logdog/archivist/loop_cycle_time_ms",
		"The amount of time a single batch of leases takes to process.",
		&types.MetricMetadata{Units: types.Milliseconds},
		distribution.DefaultBucketer)

	tsLeaseCount = metric.NewCounter("logdog/archivist/tasks_leased",
		"Number of tasks leased.",
		nil)

	tsNackCount = metric.NewCounter("logdog/archivist/tasks_not_acked",
		"Number of tasks leased but failed.",
		nil)

	tsAckCount = metric.NewCounter("logdog/archivist/tasks_acked",
		"Number of tasks successfully completed and acked.",
		nil)
)

// runForever runs the archivist loop forever.
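//
// Each cycle it leases a batch of archival tasks from the Coordinator
// (LeaseArchiveTasks), fans them out to up to MaxConcurrentTasks workers via
// jobChan, archives each one (ar.ArchiveTask), and pushes successfully
// archived tasks into ackChan, where they are batched into
// DeleteArchiveTasks RPCs. Failed tasks are only logged and counted; they
// are not ACKed.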
func runForever(ctx context.Context, ar *archivist.Archivist, flags *CommandLineFlags) {
	type archiveJob struct {
		deadline time.Time
		task     *logdog.ArchiveTask
	}

	ackChan, err := dispatcher.NewChannel(ctx, ackChannelOptions, func(batch *buffer.Batch) error {
		// The assembled DeleteRequest is cached in batch.Meta so that a
		// retried batch (see ackChannelOptions.Retry) reuses the same request
		// instead of rebuilding it from Data, whose items are nilled out
		// below.
		var req *logdog.DeleteRequest
		if batch.Meta != nil {
			req = batch.Meta.(*logdog.DeleteRequest)
		} else {
			tasks := make([]*logdog.ArchiveTask, len(batch.Data))
			for i, datum := range batch.Data {
				tasks[i] = datum.Item.(*logdog.ArchiveTask)
				batch.Data[i].Item = nil
			}
			req = &logdog.DeleteRequest{Tasks: tasks}
			batch.Meta = req
		}
		_, err := ar.Service.DeleteArchiveTasks(ctx, req)
		if err == nil {
			tsAckCount.Add(ctx, int64(len(req.Tasks)))
		}
		return transient.Tag.Apply(err)
	})
	if err != nil {
		panic(err) // only occurs if Options is invalid
	}
	defer func() {
		logging.Infof(ctx, "draining ACK channel")
		ackChan.CloseAndDrain(ctx)
		logging.Infof(ctx, "ACK channel drained")
	}()

	jobChanOpts := mkJobChannelOptions(flags.MaxConcurrentTasks)
	jobChan, err := dispatcher.NewChannel(ctx, jobChanOpts, func(data *buffer.Batch) error {
		job := data.Data[0].Item.(*archiveJob)

		nc, cancel := context.WithDeadline(ctx, job.deadline)
		defer cancel()

		nc = logging.SetFields(nc, logging.Fields{
			"project": job.task.Project,
			"id":      job.task.Id,
		})

		startTime := clock.Now(nc)
		err := ar.ArchiveTask(nc, job.task)
		duration := clock.Now(nc).Sub(startTime)

		tsTaskProcessingTime.Add(ctx, float64(duration.Nanoseconds())/1000000, err == nil)

		if err == nil {
			select {
			case ackChan.C <- job.task:
			case <-ctx.Done():
				logging.Errorf(nc, "Failed to ACK task %v due to context: %s", job.task, ctx.Err())
			}
		} else {
			tsNackCount.Add(ctx, 1)
			logging.Errorf(nc, "Failed to archive task %v: %s", job.task, err)
		}

		// Always return nil: the dispatcher should not retry the job itself.
		// A failed task is simply never ACKed.
		return nil
	})
	if err != nil {
		panic(err) // only occurs if Options is invalid
	}
	defer func() {
		logging.Infof(ctx, "Job channel draining")
		jobChan.CloseAndDrain(ctx)
		logging.Infof(ctx, "Job channel drained")
	}()

	// Now spin forever, pushing leased tasks into jobChan.
	sleepTime := 1
	var previousCycle time.Time
	for ctx.Err() == nil {
		var tasks *logdog.LeaseResponse
		var deadline time.Time

		err := retry.Retry(ctx, leaseRetryParams, func() (err error) {
			logging.Infof(ctx, "Leasing max %d tasks for %s", flags.LeaseBatchSize, flags.LeaseTime)
			deadline = clock.Now(ctx).Add(flags.LeaseTime)
			tasks, err = ar.Service.LeaseArchiveTasks(ctx, &logdog.LeaseRequest{
				MaxTasks:  int64(flags.LeaseBatchSize),
				LeaseTime: durationpb.New(flags.LeaseTime),
			})
			return
		}, retry.LogCallback(ctx, "LeaseArchiveTasks"))
		if ctx.Err() != nil {
			logging.Infof(ctx, "lease thread got context err in RPC: %s", ctx.Err())
			break
		}
		if err != nil {
			panic("impossible: infinite retry stopped: " + err.Error())
		}

		now := clock.Now(ctx)
		if !previousCycle.IsZero() {
			tsLoopCycleTime.Add(ctx, float64(now.Sub(previousCycle).Nanoseconds()/1000000))
		}
		previousCycle = now

		if len(tasks.Tasks) == 0 {
			sleepTime *= 2
			if sleepTime > maxSleepTime {
				sleepTime = maxSleepTime
			}
			logging.Infof(ctx, "no work to do, sleeping for %d seconds", sleepTime)
			clock.Sleep(ctx, time.Duration(sleepTime)*time.Second)
			continue
		} else {
			tsLeaseCount.Add(ctx, int64(len(tasks.Tasks)))
			sleepTime = 1
		}

		for _, task := range tasks.Tasks {
			select {
			case jobChan.C <- &archiveJob{deadline, task}:
			case <-ctx.Done():
				logging.Infof(ctx, "lease thread got context err in jobChan push: %s", ctx.Err())
				// Note: this break only exits the select; the surrounding
				// for ctx.Err() == nil loop is what actually terminates the
				// lease thread.
				break
			}
		}
	}
}

// googleStorageClient returns an authenticated Google Storage client instance.
func googleStorageClient(ctx context.Context, luciProject string) (gs.Client, error) {
	// TODO(vadimsh): Switch to AsProject + WithProject(project) once
	// we are ready to roll out project scoped service accounts in Logdog.
	tr, err := auth.GetRPCTransport(ctx, auth.AsSelf, auth.WithScopes(auth.CloudOAuthScopes...))
	if err != nil {
		return nil, errors.Annotate(err, "failed to get the authenticating transport").Err()
	}
	client, err := gs.NewProdClient(ctx, tr)
	if err != nil {
		return nil, errors.Annotate(err, "failed to create Google Storage client").Err()
	}
	return client, nil
}

   275  func cloudLoggingClient(ctx context.Context, luciProject, cloudProject string, onError func(err error)) (archivist.CLClient, error) {
   276  	cred, err := auth.GetPerRPCCredentials(
   277  		ctx, auth.AsProject,
   278  		auth.WithScopes(auth.CloudOAuthScopes...),
   279  		auth.WithProject(luciProject),
   280  	)
   281  	if err != nil {
   282  		return nil, errors.Annotate(err, "failed to get per RPC credentials").Err()
   283  	}
   284  	cl, err := cloudlogging.NewClient(
   285  		ctx, cloudProject,
   286  		option.WithGRPCDialOption(grpc.WithPerRPCCredentials(cred)),
   287  		option.WithGRPCDialOption(grpc.WithStatsHandler(&grpcmon.ClientRPCStatsMonitor{})),
   288  		option.WithGRPCDialOption(grpc.WithUnaryInterceptor(otelgrpc.UnaryClientInterceptor())),
   289  		option.WithGRPCDialOption(grpc.WithStreamInterceptor(otelgrpc.StreamClientInterceptor())),
   290  	)
   291  	if err != nil {
   292  		return nil, err
   293  	}
   294  	cl.OnError = onError
   295  	return cl, nil
   296  }
   297  
   298  // Entry point.
   299  func main() {
   300  	flags := DefaultCommandLineFlags()
   301  	flags.Register(flag.CommandLine)
   302  
   303  	cfg := service.MainCfg{BigTableAppProfile: "archivist"}
   304  	service.Main(cfg, func(srv *server.Server, impl *service.Implementations) error {
   305  		if err := flags.Validate(); err != nil {
   306  			return err
   307  		}
   308  
   309  		// Initialize the archivist.
   310  		ar := &archivist.Archivist{
   311  			Service:         impl.Coordinator,
   312  			SettingsLoader:  GetSettingsLoader(srv.Options.CloudProject, &flags),
   313  			Storage:         impl.Storage,
   314  			GSClientFactory: googleStorageClient,
   315  			CLClientFactory: cloudLoggingClient,
   316  		}
   317  
   318  		// Run the archivist loop until the server closes.
   319  		srv.RunInBackground("archivist", func(ctx context.Context) {
   320  			runForever(ctx, ar, &flags)
   321  		})
   322  		return nil
   323  	})
   324  }