go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/logdog/server/cmd/logdog_collector/main.go (about) 1 // Copyright 2016 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package main 16 17 import ( 18 "context" 19 "flag" 20 "time" 21 22 "cloud.google.com/go/pubsub" 23 "google.golang.org/api/option" 24 25 "go.chromium.org/luci/common/clock" 26 "go.chromium.org/luci/common/errors" 27 "go.chromium.org/luci/common/logging" 28 "go.chromium.org/luci/common/retry" 29 "go.chromium.org/luci/common/retry/transient" 30 "go.chromium.org/luci/common/tsmon/distribution" 31 "go.chromium.org/luci/common/tsmon/field" 32 "go.chromium.org/luci/common/tsmon/metric" 33 "go.chromium.org/luci/common/tsmon/types" 34 "go.chromium.org/luci/server" 35 "go.chromium.org/luci/server/auth" 36 37 "go.chromium.org/luci/logdog/server/collector" 38 "go.chromium.org/luci/logdog/server/collector/coordinator" 39 "go.chromium.org/luci/logdog/server/service" 40 ) 41 42 var ( 43 // tsPubsubCount counts the number of Pub/Sub messages processed by the 44 // Archivist. 45 // 46 // Result tracks the outcome of each message, either "success", "failure", or 47 // "transient_failure". 48 tsPubsubCount = metric.NewCounter("logdog/collector/subscription/count", 49 "The number of Pub/Sub messages pulled.", 50 nil, 51 field.String("result")) 52 53 // tsTaskProcessingTime tracks the amount of time a single subscription 54 // message takes to process, in milliseconds. 55 tsTaskProcessingTime = metric.NewCumulativeDistribution("logdog/collector/subscription/processing_time_ms", 56 "Amount of time in milliseconds that a single Pub/Sub message takes to process.", 57 &types.MetricMetadata{Units: types.Milliseconds}, 58 distribution.DefaultBucketer) 59 ) 60 61 // runForever runs the collector loop until the context closes. 62 func runForever(ctx context.Context, coll *collector.Collector, sub *pubsub.Subscription) { 63 retryForever := func() retry.Iterator { 64 return &retry.ExponentialBackoff{ 65 Limited: retry.Limited{ 66 Delay: 200 * time.Millisecond, 67 Retries: -1, // Unlimited. 68 }, 69 MaxDelay: 10 * time.Second, 70 Multiplier: 2, 71 } 72 } 73 74 retry.Retry(ctx, retryForever, func() error { 75 return sub.Receive(ctx, func(ctx context.Context, msg *pubsub.Message) { 76 ctx = logging.SetField(ctx, "messageID", msg.ID) 77 if processMessage(ctx, coll, msg) { 78 // ACK the message, removing it from Pub/Sub. 79 msg.Ack() 80 } else { 81 // NACK the message. It will be redelivered and processed. 82 msg.Nack() 83 } 84 }) 85 }, func(err error, d time.Duration) { 86 logging.Fields{ 87 "error": err, 88 "delay": d, 89 }.Errorf(ctx, "Error during subscription Receive loop; retrying...") 90 }) 91 } 92 93 // processMessage returns true if the message should be ACK'd (deleted from 94 // Pub/Sub) or false if the message should not be ACK'd. 95 func processMessage(ctx context.Context, coll *collector.Collector, msg *pubsub.Message) bool { 96 startTime := clock.Now(ctx) 97 err := coll.Process(ctx, msg.Data) 98 duration := clock.Now(ctx).Sub(startTime) 99 100 // We track processing time in milliseconds. 101 tsTaskProcessingTime.Add(ctx, duration.Seconds()*1000) 102 103 switch { 104 case transient.Tag.In(err) || errors.Contains(err, context.Canceled): 105 // Do not consume 106 logging.Fields{ 107 "error": err, 108 "duration": duration, 109 }.Warningf(ctx, "TRANSIENT error ingesting Pub/Sub message.") 110 tsPubsubCount.Add(ctx, 1, "transient_failure") 111 return false 112 113 case err == nil: 114 tsPubsubCount.Add(ctx, 1, "success") 115 return true 116 117 default: 118 logging.Fields{ 119 "error": err, 120 "size": len(msg.Data), 121 "duration": duration, 122 }.Errorf(ctx, "Non-transient error ingesting Pub/Sub message; ACKing.") 123 tsPubsubCount.Add(ctx, 1, "failure") 124 return true 125 } 126 } 127 128 // pubSubClient returns an authenticated Google PubSub client instance. 129 func pubSubClient(ctx context.Context, cloudProject string) (*pubsub.Client, error) { 130 ts, err := auth.GetTokenSource(ctx, auth.AsSelf, auth.WithScopes(auth.CloudOAuthScopes...)) 131 if err != nil { 132 return nil, errors.Annotate(err, "failed to get the token source").Err() 133 } 134 client, err := pubsub.NewClient(ctx, cloudProject, option.WithTokenSource(ts)) 135 if err != nil { 136 return nil, errors.Annotate(err, "failed to create the PubSub client").Err() 137 } 138 return client, nil 139 } 140 141 // Entry point. 142 func main() { 143 flags := CommandLineFlags{} 144 flags.Register(flag.CommandLine) 145 146 cfg := service.MainCfg{BigTableAppProfile: "collector"} 147 service.Main(cfg, func(srv *server.Server, impl *service.Implementations) error { 148 if err := flags.Validate(); err != nil { 149 return err 150 } 151 152 // Initialize our Collector service object using a caching Coordinator 153 // interface. 154 coll := &collector.Collector{ 155 Coordinator: coordinator.NewCache( 156 coordinator.NewCoordinator(impl.Coordinator), 157 flags.StateCacheSize, 158 flags.StateCacheExpiration, 159 ), 160 Storage: impl.Storage, 161 MaxMessageWorkers: flags.MaxMessageWorkers, 162 } 163 srv.RegisterCleanup(func(context.Context) { coll.Close() }) 164 165 // Initialize a Subscription object ready to pull messages. 166 psClient, err := pubSubClient(srv.Context, flags.PubSubProject) 167 if err != nil { 168 return err 169 } 170 psSub := psClient.Subscription(flags.PubSubSubscription) 171 psSub.ReceiveSettings = pubsub.ReceiveSettings{ 172 MaxExtension: 24 * time.Hour, 173 MaxOutstandingMessages: flags.MaxConcurrentMessages, // If < 1, default. 174 MaxOutstandingBytes: 0, // Default. 175 } 176 177 // Run the collector loop until the server closes. 178 srv.RunInBackground("collector", func(ctx context.Context) { 179 runForever(ctx, coll, psSub) 180 }) 181 return nil 182 }) 183 }