go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/buildbucket/listener/listener.go (about)

     1  // Copyright 2021 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package bblistener
    16  
    17  import (
    18  	"bytes"
    19  	"compress/zlib"
    20  	"context"
    21  	"encoding/json"
    22  	"fmt"
    23  	"io"
    24  	"regexp"
    25  	"sync/atomic"
    26  	"time"
    27  
    28  	"cloud.google.com/go/pubsub"
    29  	"google.golang.org/protobuf/encoding/protojson"
    30  	"google.golang.org/protobuf/proto"
    31  	"google.golang.org/protobuf/types/known/durationpb"
    32  
    33  	buildbucketpb "go.chromium.org/luci/buildbucket/proto"
    34  	"go.chromium.org/luci/common/clock"
    35  	"go.chromium.org/luci/common/errors"
    36  	"go.chromium.org/luci/common/logging"
    37  	"go.chromium.org/luci/common/retry/transient"
    38  	"go.chromium.org/luci/common/sync/parallel"
    39  	"go.chromium.org/luci/server/tq"
    40  
    41  	"go.chromium.org/luci/cv/internal/buildbucket"
    42  	"go.chromium.org/luci/cv/internal/common"
    43  	"go.chromium.org/luci/cv/internal/tryjob"
    44  )
    45  
const (
	// NumConcurrentListeners defines the number of Buildbucket Pub/Sub
	// listeners that run concurrently.
	//
	// Increase this value if the notification processing speed can't keep up
	// with the incoming speed.
	NumConcurrentListeners = 5
	// SubscriptionID is the default subscription ID for listening to
	// Buildbucket build updates.
	SubscriptionID = "buildbucket-builds"
	// ListenDuration is how long each listener runs for when the task payload
	// does not specify a duration.
	//
	// This should be in sync with the interval of the cron job that kicks the
	// listener to ensure continuous processing of Buildbucket Pub/Sub events.
	ListenDuration = 5 * time.Minute
)
    62  
// topicNameRegexp matches a fully-qualified Cloud Pub/Sub topic name of the
// form `projects/<project>/topics/<topic>`.
//
// Capture group 1 is the Cloud project and capture group 2 is the topic ID.
var topicNameRegexp = regexp.MustCompile(`^projects/(.*)/topics/(.*)$`)
    65  
// tryjobNotifier encapsulates the communication with the tryjob component.
type tryjobNotifier interface {
	// ScheduleUpdate is called by the listener for messages that are not
	// marked as "v2", passing the resolved Tryjob ID together with its
	// Buildbucket external ID.
	ScheduleUpdate(context.Context, common.TryjobID, tryjob.ExternalID) error
}
    70  
// tryjobUpdater updates Tryjob entities directly from the data carried in a
// notification. The listener uses it for `builds_v2` messages.
type tryjobUpdater interface {
	// Update updates the Tryjob entity associated with the given `eid`.
	//
	// `data` should contain the latest information of the Tryjob from the
	// Tryjob backend system (e.g. Build proto from Buildbucket pubsub).
	//
	// No-op if the Tryjob data stored in CV appears to be newer than the provided
	// data (e.g. has newer Tryjob.Result.UpdateTime)
	Update(ctx context.Context, eid tryjob.ExternalID, data any) error
}
    81  
    82  // Register registers tasks for listener and returns a function to kick off
    83  // `NumConcurrentListeners` listeners.
    84  func Register(tqd *tq.Dispatcher, projectID string, tjNotifier tryjobNotifier, tjUpdater tryjobUpdater) func(context.Context) error {
    85  	_ = tqd.RegisterTaskClass(tq.TaskClass{
    86  		ID:           "listen-bb-pubsub",
    87  		Prototype:    &ListenBBPubsubTask{},
    88  		Queue:        "listen-bb-pubsub",
    89  		Kind:         tq.NonTransactional,
    90  		Quiet:        true,
    91  		QuietOnError: true,
    92  		Handler: func(ctx context.Context, payload proto.Message) error {
    93  			client, err := pubsub.NewClient(ctx, projectID)
    94  			if err != nil {
    95  				return err
    96  			}
    97  			defer func() {
    98  				if err := client.Close(); err != nil {
    99  					logging.Errorf(ctx, "failed to close PubSub client: %s", err)
   100  				}
   101  			}()
   102  
   103  			sub := client.Subscription(SubscriptionID)
   104  			subConfig, err := sub.Config(ctx)
   105  			if err != nil {
   106  				return errors.Annotate(err, "failed to get configuration for the subscription %s", sub.String()).Err()
   107  			}
   108  			subscribedProj, err := extractTopicProject(subConfig.Topic.String())
   109  			if err != nil {
   110  				return errors.Annotate(err, "for subscription %s", sub.String()).Err()
   111  			}
   112  			l := &listener{
   113  				bbHost:       fmt.Sprintf("%s.appspot.com", subscribedProj),
   114  				subscription: sub,
   115  				tjNotifier:   tjNotifier,
   116  				tjUpdater:    tjUpdater,
   117  			}
   118  			defer l.reportStats(ctx)
   119  			duration := payload.(*ListenBBPubsubTask).GetDuration().AsDuration()
   120  			if duration == 0 {
   121  				duration = ListenDuration
   122  			}
   123  			cctx, cancel := clock.WithTimeout(ctx, duration)
   124  			defer cancel()
   125  			if err := l.start(cctx); err != nil {
   126  				// Never retry the tasks because the listener will be started
   127  				// periodically by the Cron.
   128  				return common.TQIfy{NeverRetry: true}.Error(ctx, err)
   129  			}
   130  			return nil
   131  		},
   132  	})
   133  	return func(ctx context.Context) error {
   134  		return parallel.FanOutIn(func(workCh chan<- func() error) {
   135  			for i := 0; i < NumConcurrentListeners; i++ {
   136  				i := i
   137  				workCh <- func() error {
   138  					return tqd.AddTask(ctx, &tq.Task{
   139  						Title: fmt.Sprintf("listener-%d", i),
   140  						Payload: &ListenBBPubsubTask{
   141  							Duration: durationpb.New(ListenDuration),
   142  						},
   143  					})
   144  				}
   145  			}
   146  		})
   147  	}
   148  }
   149  
   150  // StartListenerForTest starts a buildbucket listener for testing purpose.
   151  //
   152  // Returns a callback function to stop the listener.
   153  func StartListenerForTest(ctx context.Context, sub *pubsub.Subscription, tjNotifier tryjobNotifier) func() {
   154  	cctx, cancel := context.WithCancel(ctx)
   155  	go func() {
   156  		l := &listener{
   157  			subscription: sub,
   158  			tjNotifier:   tjNotifier,
   159  		}
   160  		if err := l.start(cctx); err != nil {
   161  			logging.Errorf(ctx, "encounter error in buildbucket listener: %s", err)
   162  		}
   163  	}()
   164  	return cancel
   165  }
   166  
// listener receives Buildbucket Pub/Sub messages from a subscription and
// forwards updates for builds resolved to a Tryjob to the tryjob component.
type listener struct {
	bbHost       string // Buildbucket host that the subscription subscribes to. e.g. cr-buildbucket.appspot.com
	subscription *pubsub.Subscription
	tjNotifier   tryjobNotifier // used for messages that are not marked "v2"
	tjUpdater    tryjobUpdater  // used for "v2" messages

	// stats holds counters updated with sync/atomic while messages are being
	// handled; they are logged by reportStats.
	stats listenerStats
	// processedCh, when non-nil, receives the ID of every message once it has
	// been handled.
	processedCh chan string // for testing only
}
   176  
// listenerStats holds per-listener message counters.
//
// reportStats logs them as, in order: the total number of messages processed,
// the number with a transient failure and the number with a non-transient
// failure.
type listenerStats struct {
	totalProcessedCount, transientErrCount, permanentErrCount int64
}
   180  
   181  func (l *listener) start(ctx context.Context) error {
   182  	return l.subscription.Receive(ctx, func(ctx context.Context, msg *pubsub.Message) {
   183  		switch err := l.processMsg(ctx, msg); {
   184  		case err == nil:
   185  			msg.Ack()
   186  		case transient.Tag.In(err):
   187  			logging.Warningf(ctx, "encounter transient error when processing buildbucket pubsub message %q; Reason: %s", string(msg.Data), err)
   188  			msg.Nack()
   189  			atomic.AddInt64(&l.stats.totalProcessedCount, 1)
   190  		default:
   191  			logging.Errorf(ctx, "encounter non-transient error when processing  buildbucket pubsub message: %q; Reason: %s", string(msg.Data), err)
   192  			// Dismiss non-transient failure.
   193  			msg.Ack()
   194  			atomic.AddInt64(&l.stats.permanentErrCount, 1)
   195  		}
   196  		atomic.AddInt64(&l.stats.transientErrCount, 1)
   197  		if l.processedCh != nil {
   198  			l.processedCh <- msg.ID
   199  		}
   200  	})
   201  }
   202  
   203  func (l *listener) processMsg(ctx context.Context, msg *pubsub.Message) error {
   204  	var isV2Msg bool
   205  	var buildID int64
   206  	var hostname string
   207  	var build *buildbucketpb.Build
   208  	var err error
   209  	if v, ok := msg.Attributes["version"]; ok && v == "v2" {
   210  		isV2Msg = true
   211  		if build, err = parseV2Data(msg); err != nil {
   212  			return err
   213  		}
   214  		hostname, buildID = build.GetInfra().GetBuildbucket().GetHostname(), build.GetId()
   215  	} else {
   216  		// TODO(crbug.com/1406393): delete it once the migration is done. And the
   217  		// above pre-declared variables can also be deleted to make code more clean.
   218  		parsedMsg, err := parseV1Data(ctx, msg.Data)
   219  		if err != nil {
   220  			return err
   221  		}
   222  		hostname, buildID = parsedMsg.Hostname, parsedMsg.Build.ID
   223  	}
   224  
   225  	if hostname == "" {
   226  		logging.Warningf(ctx, "received pubsub message with empty hostname for build %d, using the computed one %s", build.GetId(), l.bbHost)
   227  		hostname = l.bbHost
   228  	}
   229  	eid, err := tryjob.BuildbucketID(hostname, buildID)
   230  	if err != nil {
   231  		return err
   232  	}
   233  	switch ids, err := tryjob.Resolve(ctx, eid); {
   234  	case err != nil:
   235  		return err
   236  	case len(ids) != 1:
   237  		panic(fmt.Errorf("impossible; requested to resolve 1 external ID %s, got %d", eid, len(ids)))
   238  	case ids[0] != 0:
   239  		// Build that is tracked by LUCI CV.
   240  		if !isV2Msg {
   241  			return l.tjNotifier.ScheduleUpdate(ctx, ids[0], eid)
   242  		}
   243  		// TODO(crbug.com/1406393): remove the debugging once the migration is done.
   244  		logging.Debugf(ctx, "builds_v2 pubsub listener: updating tryjob %s", eid)
   245  		return l.tjUpdater.Update(ctx, eid, build)
   246  	}
   247  	return nil
   248  }
   249  
   250  // parseV2Data parses Buildbucket new `builds_v2` pubsub message data.
   251  func parseV2Data(msg *pubsub.Message) (*buildbucketpb.Build, error) {
   252  	buildsV2Msg := &buildbucketpb.BuildsV2PubSub{}
   253  	if err := (protojson.UnmarshalOptions{DiscardUnknown: true}).Unmarshal(msg.Data, buildsV2Msg); err != nil {
   254  		return nil, errors.Annotate(err, "failed to unmarshal pubsub message into BuildsV2PubSub proto").Err()
   255  	}
   256  	largeFieldsData, err := zlibDecompress(buildsV2Msg.BuildLargeFields)
   257  	if err != nil {
   258  		return nil, errors.Annotate(err, "failed to decompress build_large_fields for build %d", buildsV2Msg.Build.GetId()).Err()
   259  	}
   260  	largeFields := &buildbucketpb.Build{}
   261  	if err := (proto.UnmarshalOptions{DiscardUnknown: true}).Unmarshal(largeFieldsData, largeFields); err != nil {
   262  		return nil, errors.Annotate(err, "failed to unmarshal build_large_fields for build %d", buildsV2Msg.Build.GetId()).Err()
   263  	}
   264  	proto.Merge(buildsV2Msg.Build, largeFields)
   265  
   266  	return buildsV2Msg.Build, nil
   267  }
   268  
   269  // parseV1Data extracts the relevant information from Buildbucket old `builds`
   270  // topic Pub/Sub message data.
   271  func parseV1Data(ctx context.Context, data []byte) (buildbucket.PubsubMessage, error) {
   272  	message := buildbucket.PubsubMessage{}
   273  	// Extra fields that are not in the struct are ignored by json.Unmarshal.
   274  	if err := json.Unmarshal(data, &message); err != nil {
   275  		return buildbucket.PubsubMessage{}, errors.Annotate(err, "while unmarshalling build pubsub message").Err()
   276  	}
   277  	if message.Hostname == "" || message.Build.ID == 0 {
   278  		return buildbucket.PubsubMessage{}, errors.Reason("missing build details in pubsub message: %s", data).Err()
   279  	}
   280  	return message, nil
   281  }
   282  
   283  func (l *listener) reportStats(ctx context.Context) {
   284  	logging.Infof(ctx, "processed %d buildbucket pubsub messages in total. %d of them have transient failure. %d of them have non-transient failure",
   285  		l.stats.totalProcessedCount,
   286  		l.stats.transientErrCount,
   287  		l.stats.permanentErrCount)
   288  	// TODO(yiwzhang): send tsmon metrics. Especially for non-transient count to
   289  	// to alert on.
   290  }
   291  
   292  // zlibDecompress decompresses data using zlib.
   293  func zlibDecompress(compressed []byte) ([]byte, error) {
   294  	r, err := zlib.NewReader(bytes.NewReader(compressed))
   295  	if err != nil {
   296  		return nil, err
   297  	}
   298  	defer func() { _ = r.Close() }()
   299  	return io.ReadAll(r)
   300  }
   301  
   302  // extractTopicProject extracts the subscribed project name from the given subscription.
   303  func extractTopicProject(topic string) (string, error) {
   304  	matches := topicNameRegexp.FindStringSubmatch(topic)
   305  	if len(matches) != 3 {
   306  		return "", errors.Reason("topic %s doesn't match %q", topic, topicNameRegexp.String()).Err()
   307  	}
   308  	return matches[1], nil
   309  }