go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/buildbucket/listener/listener.go (about) 1 // Copyright 2021 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package bblistener 16 17 import ( 18 "bytes" 19 "compress/zlib" 20 "context" 21 "encoding/json" 22 "fmt" 23 "io" 24 "regexp" 25 "sync/atomic" 26 "time" 27 28 "cloud.google.com/go/pubsub" 29 "google.golang.org/protobuf/encoding/protojson" 30 "google.golang.org/protobuf/proto" 31 "google.golang.org/protobuf/types/known/durationpb" 32 33 buildbucketpb "go.chromium.org/luci/buildbucket/proto" 34 "go.chromium.org/luci/common/clock" 35 "go.chromium.org/luci/common/errors" 36 "go.chromium.org/luci/common/logging" 37 "go.chromium.org/luci/common/retry/transient" 38 "go.chromium.org/luci/common/sync/parallel" 39 "go.chromium.org/luci/server/tq" 40 41 "go.chromium.org/luci/cv/internal/buildbucket" 42 "go.chromium.org/luci/cv/internal/common" 43 "go.chromium.org/luci/cv/internal/tryjob" 44 ) 45 46 const ( 47 // NumConcurrentListeners defines the number of Buildbucket Pub/Sub 48 // listeners that run concurrently. 49 // 50 // Increase this value if the notification processing speed can't keep up 51 // with the incoming speed. 52 NumConcurrentListeners = 5 53 // SubscriptionID is the default subscription ID for listening to 54 // Buildbucket build updates. 55 SubscriptionID = "buildbucket-builds" 56 // ListenDuration is how long each listener will running for. 57 // 58 // This should be in sync with the interval of the cron job that kicks the 59 // listener to ensure continuous processing of Buildbucket Pub/Sub events. 60 ListenDuration = 5 * time.Minute 61 ) 62 63 // topicNameRegexp - the Cloud project Pub/Sub topic name regex expression. 64 var topicNameRegexp = regexp.MustCompile(`^projects/(.*)/topics/(.*)$`) 65 66 // This interface encapsulate the communication with tryjob component. 67 type tryjobNotifier interface { 68 ScheduleUpdate(context.Context, common.TryjobID, tryjob.ExternalID) error 69 } 70 71 type tryjobUpdater interface { 72 // Update updates the Tryjob entity associated with the given `eid`. 73 // 74 // `data` should contain the latest information of the Tryjob from the 75 // Tryjob backend system (e.g. Build proto from Buildbucket pubsub). 76 // 77 // No-op if the Tryjob data stored in CV appears to be newer than the provided 78 // data (e.g. has newer Tryjob.Result.UpdateTime) 79 Update(ctx context.Context, eid tryjob.ExternalID, data any) error 80 } 81 82 // Register registers tasks for listener and returns a function to kick off 83 // `NumConcurrentListeners` listeners. 84 func Register(tqd *tq.Dispatcher, projectID string, tjNotifier tryjobNotifier, tjUpdater tryjobUpdater) func(context.Context) error { 85 _ = tqd.RegisterTaskClass(tq.TaskClass{ 86 ID: "listen-bb-pubsub", 87 Prototype: &ListenBBPubsubTask{}, 88 Queue: "listen-bb-pubsub", 89 Kind: tq.NonTransactional, 90 Quiet: true, 91 QuietOnError: true, 92 Handler: func(ctx context.Context, payload proto.Message) error { 93 client, err := pubsub.NewClient(ctx, projectID) 94 if err != nil { 95 return err 96 } 97 defer func() { 98 if err := client.Close(); err != nil { 99 logging.Errorf(ctx, "failed to close PubSub client: %s", err) 100 } 101 }() 102 103 sub := client.Subscription(SubscriptionID) 104 subConfig, err := sub.Config(ctx) 105 if err != nil { 106 return errors.Annotate(err, "failed to get configuration for the subscription %s", sub.String()).Err() 107 } 108 subscribedProj, err := extractTopicProject(subConfig.Topic.String()) 109 if err != nil { 110 return errors.Annotate(err, "for subscription %s", sub.String()).Err() 111 } 112 l := &listener{ 113 bbHost: fmt.Sprintf("%s.appspot.com", subscribedProj), 114 subscription: sub, 115 tjNotifier: tjNotifier, 116 tjUpdater: tjUpdater, 117 } 118 defer l.reportStats(ctx) 119 duration := payload.(*ListenBBPubsubTask).GetDuration().AsDuration() 120 if duration == 0 { 121 duration = ListenDuration 122 } 123 cctx, cancel := clock.WithTimeout(ctx, duration) 124 defer cancel() 125 if err := l.start(cctx); err != nil { 126 // Never retry the tasks because the listener will be started 127 // periodically by the Cron. 128 return common.TQIfy{NeverRetry: true}.Error(ctx, err) 129 } 130 return nil 131 }, 132 }) 133 return func(ctx context.Context) error { 134 return parallel.FanOutIn(func(workCh chan<- func() error) { 135 for i := 0; i < NumConcurrentListeners; i++ { 136 i := i 137 workCh <- func() error { 138 return tqd.AddTask(ctx, &tq.Task{ 139 Title: fmt.Sprintf("listener-%d", i), 140 Payload: &ListenBBPubsubTask{ 141 Duration: durationpb.New(ListenDuration), 142 }, 143 }) 144 } 145 } 146 }) 147 } 148 } 149 150 // StartListenerForTest starts a buildbucket listener for testing purpose. 151 // 152 // Returns a callback function to stop the listener. 153 func StartListenerForTest(ctx context.Context, sub *pubsub.Subscription, tjNotifier tryjobNotifier) func() { 154 cctx, cancel := context.WithCancel(ctx) 155 go func() { 156 l := &listener{ 157 subscription: sub, 158 tjNotifier: tjNotifier, 159 } 160 if err := l.start(cctx); err != nil { 161 logging.Errorf(ctx, "encounter error in buildbucket listener: %s", err) 162 } 163 }() 164 return cancel 165 } 166 167 type listener struct { 168 bbHost string // Buildbucket host that the subscription subscribes to. e.g. cr-buildbucket.appspot.com 169 subscription *pubsub.Subscription 170 tjNotifier tryjobNotifier 171 tjUpdater tryjobUpdater 172 173 stats listenerStats 174 processedCh chan string // for testing only 175 } 176 177 type listenerStats struct { 178 totalProcessedCount, transientErrCount, permanentErrCount int64 179 } 180 181 func (l *listener) start(ctx context.Context) error { 182 return l.subscription.Receive(ctx, func(ctx context.Context, msg *pubsub.Message) { 183 switch err := l.processMsg(ctx, msg); { 184 case err == nil: 185 msg.Ack() 186 case transient.Tag.In(err): 187 logging.Warningf(ctx, "encounter transient error when processing buildbucket pubsub message %q; Reason: %s", string(msg.Data), err) 188 msg.Nack() 189 atomic.AddInt64(&l.stats.totalProcessedCount, 1) 190 default: 191 logging.Errorf(ctx, "encounter non-transient error when processing buildbucket pubsub message: %q; Reason: %s", string(msg.Data), err) 192 // Dismiss non-transient failure. 193 msg.Ack() 194 atomic.AddInt64(&l.stats.permanentErrCount, 1) 195 } 196 atomic.AddInt64(&l.stats.transientErrCount, 1) 197 if l.processedCh != nil { 198 l.processedCh <- msg.ID 199 } 200 }) 201 } 202 203 func (l *listener) processMsg(ctx context.Context, msg *pubsub.Message) error { 204 var isV2Msg bool 205 var buildID int64 206 var hostname string 207 var build *buildbucketpb.Build 208 var err error 209 if v, ok := msg.Attributes["version"]; ok && v == "v2" { 210 isV2Msg = true 211 if build, err = parseV2Data(msg); err != nil { 212 return err 213 } 214 hostname, buildID = build.GetInfra().GetBuildbucket().GetHostname(), build.GetId() 215 } else { 216 // TODO(crbug.com/1406393): delete it once the migration is done. And the 217 // above pre-declared variables can also be deleted to make code more clean. 218 parsedMsg, err := parseV1Data(ctx, msg.Data) 219 if err != nil { 220 return err 221 } 222 hostname, buildID = parsedMsg.Hostname, parsedMsg.Build.ID 223 } 224 225 if hostname == "" { 226 logging.Warningf(ctx, "received pubsub message with empty hostname for build %d, using the computed one %s", build.GetId(), l.bbHost) 227 hostname = l.bbHost 228 } 229 eid, err := tryjob.BuildbucketID(hostname, buildID) 230 if err != nil { 231 return err 232 } 233 switch ids, err := tryjob.Resolve(ctx, eid); { 234 case err != nil: 235 return err 236 case len(ids) != 1: 237 panic(fmt.Errorf("impossible; requested to resolve 1 external ID %s, got %d", eid, len(ids))) 238 case ids[0] != 0: 239 // Build that is tracked by LUCI CV. 240 if !isV2Msg { 241 return l.tjNotifier.ScheduleUpdate(ctx, ids[0], eid) 242 } 243 // TODO(crbug.com/1406393): remove the debugging once the migration is done. 244 logging.Debugf(ctx, "builds_v2 pubsub listener: updating tryjob %s", eid) 245 return l.tjUpdater.Update(ctx, eid, build) 246 } 247 return nil 248 } 249 250 // parseV2Data parses Buildbucket new `builds_v2` pubsub message data. 251 func parseV2Data(msg *pubsub.Message) (*buildbucketpb.Build, error) { 252 buildsV2Msg := &buildbucketpb.BuildsV2PubSub{} 253 if err := (protojson.UnmarshalOptions{DiscardUnknown: true}).Unmarshal(msg.Data, buildsV2Msg); err != nil { 254 return nil, errors.Annotate(err, "failed to unmarshal pubsub message into BuildsV2PubSub proto").Err() 255 } 256 largeFieldsData, err := zlibDecompress(buildsV2Msg.BuildLargeFields) 257 if err != nil { 258 return nil, errors.Annotate(err, "failed to decompress build_large_fields for build %d", buildsV2Msg.Build.GetId()).Err() 259 } 260 largeFields := &buildbucketpb.Build{} 261 if err := (proto.UnmarshalOptions{DiscardUnknown: true}).Unmarshal(largeFieldsData, largeFields); err != nil { 262 return nil, errors.Annotate(err, "failed to unmarshal build_large_fields for build %d", buildsV2Msg.Build.GetId()).Err() 263 } 264 proto.Merge(buildsV2Msg.Build, largeFields) 265 266 return buildsV2Msg.Build, nil 267 } 268 269 // parseV1Data extracts the relevant information from Buildbucket old `builds` 270 // topic Pub/Sub message data. 271 func parseV1Data(ctx context.Context, data []byte) (buildbucket.PubsubMessage, error) { 272 message := buildbucket.PubsubMessage{} 273 // Extra fields that are not in the struct are ignored by json.Unmarshal. 274 if err := json.Unmarshal(data, &message); err != nil { 275 return buildbucket.PubsubMessage{}, errors.Annotate(err, "while unmarshalling build pubsub message").Err() 276 } 277 if message.Hostname == "" || message.Build.ID == 0 { 278 return buildbucket.PubsubMessage{}, errors.Reason("missing build details in pubsub message: %s", data).Err() 279 } 280 return message, nil 281 } 282 283 func (l *listener) reportStats(ctx context.Context) { 284 logging.Infof(ctx, "processed %d buildbucket pubsub messages in total. %d of them have transient failure. %d of them have non-transient failure", 285 l.stats.totalProcessedCount, 286 l.stats.transientErrCount, 287 l.stats.permanentErrCount) 288 // TODO(yiwzhang): send tsmon metrics. Especially for non-transient count to 289 // to alert on. 290 } 291 292 // zlibDecompress decompresses data using zlib. 293 func zlibDecompress(compressed []byte) ([]byte, error) { 294 r, err := zlib.NewReader(bytes.NewReader(compressed)) 295 if err != nil { 296 return nil, err 297 } 298 defer func() { _ = r.Close() }() 299 return io.ReadAll(r) 300 } 301 302 // extractTopicProject extracts the subscribed project name from the given subscription. 303 func extractTopicProject(topic string) (string, error) { 304 matches := topicNameRegexp.FindStringSubmatch(topic) 305 if len(matches) != 3 { 306 return "", errors.Reason("topic %s doesn't match %q", topic, topicNameRegexp.String()).Err() 307 } 308 return matches[1], nil 309 }