go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/logdog/server/cmd/logdog_collector/main.go (about)

     1  // Copyright 2016 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package main
    16  
    17  import (
    18  	"context"
    19  	"flag"
    20  	"time"
    21  
    22  	"cloud.google.com/go/pubsub"
    23  	"google.golang.org/api/option"
    24  
    25  	"go.chromium.org/luci/common/clock"
    26  	"go.chromium.org/luci/common/errors"
    27  	"go.chromium.org/luci/common/logging"
    28  	"go.chromium.org/luci/common/retry"
    29  	"go.chromium.org/luci/common/retry/transient"
    30  	"go.chromium.org/luci/common/tsmon/distribution"
    31  	"go.chromium.org/luci/common/tsmon/field"
    32  	"go.chromium.org/luci/common/tsmon/metric"
    33  	"go.chromium.org/luci/common/tsmon/types"
    34  	"go.chromium.org/luci/server"
    35  	"go.chromium.org/luci/server/auth"
    36  
    37  	"go.chromium.org/luci/logdog/server/collector"
    38  	"go.chromium.org/luci/logdog/server/collector/coordinator"
    39  	"go.chromium.org/luci/logdog/server/service"
    40  )
    41  
    42  var (
    43  	// tsPubsubCount counts the number of Pub/Sub messages processed by the
    44  	// Archivist.
    45  	//
    46  	// Result tracks the outcome of each message, either "success", "failure", or
    47  	// "transient_failure".
    48  	tsPubsubCount = metric.NewCounter("logdog/collector/subscription/count",
    49  		"The number of Pub/Sub messages pulled.",
    50  		nil,
    51  		field.String("result"))
    52  
    53  	// tsTaskProcessingTime tracks the amount of time a single subscription
    54  	// message takes to process, in milliseconds.
    55  	tsTaskProcessingTime = metric.NewCumulativeDistribution("logdog/collector/subscription/processing_time_ms",
    56  		"Amount of time in milliseconds that a single Pub/Sub message takes to process.",
    57  		&types.MetricMetadata{Units: types.Milliseconds},
    58  		distribution.DefaultBucketer)
    59  )
    60  
    61  // runForever runs the collector loop until the context closes.
    62  func runForever(ctx context.Context, coll *collector.Collector, sub *pubsub.Subscription) {
    63  	retryForever := func() retry.Iterator {
    64  		return &retry.ExponentialBackoff{
    65  			Limited: retry.Limited{
    66  				Delay:   200 * time.Millisecond,
    67  				Retries: -1, // Unlimited.
    68  			},
    69  			MaxDelay:   10 * time.Second,
    70  			Multiplier: 2,
    71  		}
    72  	}
    73  
    74  	retry.Retry(ctx, retryForever, func() error {
    75  		return sub.Receive(ctx, func(ctx context.Context, msg *pubsub.Message) {
    76  			ctx = logging.SetField(ctx, "messageID", msg.ID)
    77  			if processMessage(ctx, coll, msg) {
    78  				// ACK the message, removing it from Pub/Sub.
    79  				msg.Ack()
    80  			} else {
    81  				// NACK the message. It will be redelivered and processed.
    82  				msg.Nack()
    83  			}
    84  		})
    85  	}, func(err error, d time.Duration) {
    86  		logging.Fields{
    87  			"error": err,
    88  			"delay": d,
    89  		}.Errorf(ctx, "Error during subscription Receive loop; retrying...")
    90  	})
    91  }
    92  
    93  // processMessage returns true if the message should be ACK'd (deleted from
    94  // Pub/Sub) or false if the message should not be ACK'd.
    95  func processMessage(ctx context.Context, coll *collector.Collector, msg *pubsub.Message) bool {
    96  	startTime := clock.Now(ctx)
    97  	err := coll.Process(ctx, msg.Data)
    98  	duration := clock.Now(ctx).Sub(startTime)
    99  
   100  	// We track processing time in milliseconds.
   101  	tsTaskProcessingTime.Add(ctx, duration.Seconds()*1000)
   102  
   103  	switch {
   104  	case transient.Tag.In(err) || errors.Contains(err, context.Canceled):
   105  		// Do not consume
   106  		logging.Fields{
   107  			"error":    err,
   108  			"duration": duration,
   109  		}.Warningf(ctx, "TRANSIENT error ingesting Pub/Sub message.")
   110  		tsPubsubCount.Add(ctx, 1, "transient_failure")
   111  		return false
   112  
   113  	case err == nil:
   114  		tsPubsubCount.Add(ctx, 1, "success")
   115  		return true
   116  
   117  	default:
   118  		logging.Fields{
   119  			"error":    err,
   120  			"size":     len(msg.Data),
   121  			"duration": duration,
   122  		}.Errorf(ctx, "Non-transient error ingesting Pub/Sub message; ACKing.")
   123  		tsPubsubCount.Add(ctx, 1, "failure")
   124  		return true
   125  	}
   126  }
   127  
   128  // pubSubClient returns an authenticated Google PubSub client instance.
   129  func pubSubClient(ctx context.Context, cloudProject string) (*pubsub.Client, error) {
   130  	ts, err := auth.GetTokenSource(ctx, auth.AsSelf, auth.WithScopes(auth.CloudOAuthScopes...))
   131  	if err != nil {
   132  		return nil, errors.Annotate(err, "failed to get the token source").Err()
   133  	}
   134  	client, err := pubsub.NewClient(ctx, cloudProject, option.WithTokenSource(ts))
   135  	if err != nil {
   136  		return nil, errors.Annotate(err, "failed to create the PubSub client").Err()
   137  	}
   138  	return client, nil
   139  }
   140  
   141  // Entry point.
   142  func main() {
   143  	flags := CommandLineFlags{}
   144  	flags.Register(flag.CommandLine)
   145  
   146  	cfg := service.MainCfg{BigTableAppProfile: "collector"}
   147  	service.Main(cfg, func(srv *server.Server, impl *service.Implementations) error {
   148  		if err := flags.Validate(); err != nil {
   149  			return err
   150  		}
   151  
   152  		// Initialize our Collector service object using a caching Coordinator
   153  		// interface.
   154  		coll := &collector.Collector{
   155  			Coordinator: coordinator.NewCache(
   156  				coordinator.NewCoordinator(impl.Coordinator),
   157  				flags.StateCacheSize,
   158  				flags.StateCacheExpiration,
   159  			),
   160  			Storage:           impl.Storage,
   161  			MaxMessageWorkers: flags.MaxMessageWorkers,
   162  		}
   163  		srv.RegisterCleanup(func(context.Context) { coll.Close() })
   164  
   165  		// Initialize a Subscription object ready to pull messages.
   166  		psClient, err := pubSubClient(srv.Context, flags.PubSubProject)
   167  		if err != nil {
   168  			return err
   169  		}
   170  		psSub := psClient.Subscription(flags.PubSubSubscription)
   171  		psSub.ReceiveSettings = pubsub.ReceiveSettings{
   172  			MaxExtension:           24 * time.Hour,
   173  			MaxOutstandingMessages: flags.MaxConcurrentMessages, // If < 1, default.
   174  			MaxOutstandingBytes:    0,                           // Default.
   175  		}
   176  
   177  		// Run the collector loop until the server closes.
   178  		srv.RunInBackground("collector", func(ctx context.Context) {
   179  			runForever(ctx, coll, psSub)
   180  		})
   181  		return nil
   182  	})
   183  }