go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/luci_notify/notify/pubsub.go (about)

     1  // Copyright 2017 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package notify
    16  
    17  import (
    18  	"bytes"
    19  	"compress/zlib"
    20  	"context"
    21  	"encoding/json"
    22  	"fmt"
    23  	"io"
    24  	"math/rand"
    25  	"net/http"
    26  	"sort"
    27  	"strings"
    28  	"time"
    29  
    30  	"google.golang.org/genproto/protobuf/field_mask"
    31  	"google.golang.org/grpc/codes"
    32  	"google.golang.org/grpc/status"
    33  	"google.golang.org/protobuf/encoding/protojson"
    34  	"google.golang.org/protobuf/proto"
    35  
    36  	buildbucketpb "go.chromium.org/luci/buildbucket/proto"
    37  	bbv1 "go.chromium.org/luci/common/api/buildbucket/buildbucket/v1"
    38  	"go.chromium.org/luci/common/api/gitiles"
    39  	"go.chromium.org/luci/common/clock"
    40  	"go.chromium.org/luci/common/errors"
    41  	"go.chromium.org/luci/common/logging"
    42  	"go.chromium.org/luci/common/retry/transient"
    43  	"go.chromium.org/luci/common/sync/parallel"
    44  	"go.chromium.org/luci/gae/service/datastore"
    45  	"go.chromium.org/luci/grpc/grpcutil"
    46  	"go.chromium.org/luci/grpc/prpc"
    47  	"go.chromium.org/luci/server/auth"
    48  	"go.chromium.org/luci/server/router"
    49  
    50  	notifypb "go.chromium.org/luci/luci_notify/api/config"
    51  	"go.chromium.org/luci/luci_notify/config"
    52  )
    53  
    54  func getBuilderID(b *buildbucketpb.Build) string {
    55  	return fmt.Sprintf("%s/%s", b.Builder.Bucket, b.Builder.Builder)
    56  }
    57  
    58  // EmailNotify contains information for delivery and personalization of notification emails.
    59  type EmailNotify struct {
    60  	Email         string `json:"email"`
    61  	Template      string `json:"template"`
    62  	MatchingSteps []*buildbucketpb.Step
    63  }
    64  
    65  // sortEmailNotify sorts a list of EmailNotify by Email, then Template.
    66  func sortEmailNotify(en []EmailNotify) {
    67  	sort.Slice(en, func(i, j int) bool {
    68  		first := en[i]
    69  		second := en[j]
    70  		emailResult := strings.Compare(first.Email, second.Email)
    71  		if emailResult == 0 {
    72  			return strings.Compare(first.Template, second.Template) < 0
    73  		}
    74  		return emailResult < 0
    75  	})
    76  }
    77  
    78  // extractEmailNotifyValues extracts EmailNotify slice from the build.
    79  // TODO(nodir): remove parametersJSON once clients move to properties.
    80  func extractEmailNotifyValues(build *buildbucketpb.Build, parametersJSON string) ([]EmailNotify, error) {
    81  	const propertyName = "email_notify"
    82  	value := build.GetOutput().GetProperties().GetFields()[propertyName]
    83  	if value == nil {
    84  		value = build.GetInput().GetProperties().GetFields()[propertyName]
    85  	}
    86  	if value != nil {
    87  		notifiesPB := value.GetListValue().GetValues()
    88  		ret := make([]EmailNotify, len(notifiesPB))
    89  		for i, notifyPB := range notifiesPB {
    90  			notifyFields := notifyPB.GetStructValue().GetFields()
    91  			ret[i] = EmailNotify{
    92  				Email:    notifyFields["email"].GetStringValue(),
    93  				Template: notifyFields["template"].GetStringValue(),
    94  				// MatchingSteps is left blank, as it is only available for recipients
    95  				// derived from Notifications with step filters.
    96  			}
    97  		}
    98  		return ret, nil
    99  	}
   100  
   101  	if parametersJSON == "" {
   102  		return nil, nil
   103  	}
   104  	// json equivalent: {"email_notify": [{"email": "<address>"}, ...]}
   105  	var output struct {
   106  		EmailNotify []EmailNotify `json:"email_notify"`
   107  	}
   108  
   109  	if err := json.NewDecoder(strings.NewReader(parametersJSON)).Decode(&output); err != nil {
   110  		return nil, errors.Annotate(err, "invalid msg.ParametersJson").Err()
   111  	}
   112  	return output.EmailNotify, nil
   113  }
   114  
   115  func randInRange(upper, lower float64) float64 {
   116  	return lower + rand.Float64()*(upper-lower)
   117  }
   118  
   119  func putWithRetry(c context.Context, entity any) error {
   120  	// Retry with exponential backoff. The limit we care about here is the limit
   121  	// on writes to entities within an entity group, which is once per second.
   122  	//
   123  	// By waiting for 1 second we'll guarantee not to conflict with the write
   124  	// that caused contention the first time we failed. We double the delay each
   125  	// time, and randomly shift over the interval to avoid clustering.
   126  
   127  	delay := 1.0
   128  	var err error
   129  
   130  	// Retry up to 4 times. Past that point the delay is becoming too long to be
   131  	// reasonable.
   132  	for i := 0; i < 4; i++ {
   133  		err = datastore.Put(c, entity)
   134  		if err == nil {
   135  			return nil
   136  		} else if status.Code(err) != codes.Aborted {
   137  			// Datastore documentation says that Aborted is returned in the case
   138  			// of contention, which is the only case we want to retry on.
   139  			// TODO: Perhaps also retry on some others like Unavailable or
   140  			// DeadlineExceeded?
   141  			return err
   142  		}
   143  
   144  		clock.Sleep(c, time.Duration(float64(time.Second)*randInRange(delay, delay*2)))
   145  		delay *= 2
   146  	}
   147  
   148  	return err
   149  }
   150  
   151  // handleBuild processes a Build and sends appropriate notifications.
   152  //
   153  // This function should serve as documentation of the process of going from
   154  // a Build to sent notifications. It also should explicitly handle ACLs and
   155  // stop the process of handling notifications early to avoid wasting compute
   156  // time.
   157  //
   158  // getCheckout produces the associated source checkout for a build, if available.
   159  // It's passed in as a parameter in order to mock it for testing.
   160  //
   161  // history is a function that contacts gitiles to obtain the git history for
   162  // revision ordering purposes. It's passed in as a parameter in order to mock it
   163  // for testing.
   164  func handleBuild(c context.Context, build *Build, getCheckout CheckoutFunc, history HistoryFunc) error {
   165  	luciProject := build.Builder.Project
   166  	project := &config.Project{Name: luciProject}
   167  	switch ex, err := datastore.Exists(c, project); {
   168  	case err != nil:
   169  		return err
   170  	case !ex.All():
   171  		logging.Infof(c, "Build project not tracked by LUCI Notify, ignoring...")
   172  		return nil // This project is not tracked by luci-notify
   173  	}
   174  
   175  	// checkout is only used to compute the blamelist
   176  	// As blamelist is not a "critical" feature of luci-notify, if there is an
   177  	// error getting the checkout (mostly because there is no source manifest)
   178  	// we should not throw 500, but just log the error, and inform the builder
   179  	// owner that they are missing source manifest in their builds
   180  	logdogContext, _ := context.WithTimeout(c, LOGDOG_REQUEST_TIMEOUT)
   181  	checkout, err := getCheckout(logdogContext, build)
   182  	if err != nil {
   183  		// TODO (crbug.com/1058190): log the error and let the owner know
   184  		logging.Warningf(c, "Got error when getting source manifest for build %v", err)
   185  	}
   186  
   187  	// Get the Builder for the first time, and initialize if there's nothing there.
   188  	builderID := getBuilderID(&build.Build)
   189  	templateInput := &notifypb.TemplateInput{
   190  		BuildbucketHostname: build.BuildbucketHostname,
   191  		Build:               &build.Build,
   192  	}
   193  
   194  	// Set up the initial list of recipients, derived from the build.
   195  	recipients := make([]EmailNotify, len(build.EmailNotify))
   196  	copy(recipients, build.EmailNotify)
   197  
   198  	// Helper functions for notifying and updating tree closer status.
   199  	notifyNoBlame := func(c context.Context, b config.Builder, oldStatus buildbucketpb.Status) error {
   200  		notifications := Filter(c, &b.Notifications, oldStatus, &build.Build)
   201  		recipients = append(recipients, ComputeRecipients(c, notifications, nil, nil)...)
   202  		templateInput.OldStatus = oldStatus
   203  		return Notify(c, recipients, templateInput)
   204  	}
   205  	notifyAndUpdateTrees := func(c context.Context, b config.Builder, oldStatus buildbucketpb.Status) error {
   206  		return parallel.FanOutIn(func(ch chan<- func() error) {
   207  			ch <- func() error { return notifyNoBlame(c, b, oldStatus) }
   208  			ch <- func() error { return UpdateTreeClosers(c, build, oldStatus) }
   209  		})
   210  	}
   211  
   212  	gCommit := build.Input.GetGitilesCommit()
   213  	buildCreateTime := build.CreateTime.AsTime()
   214  
   215  	var keepGoing bool
   216  	var builder config.Builder
   217  	err = datastore.RunInTransaction(c, func(c context.Context) error {
   218  		// Reset these values everytime the transaction is retried, to avoid
   219  		// leaking state from a previous attempt to this attempt.
   220  		keepGoing = false
   221  		builder = config.Builder{
   222  			ProjectKey: datastore.KeyForObj(c, project),
   223  			ID:         builderID,
   224  		}
   225  
   226  		switch err := datastore.Get(c, &builder); {
   227  		case err == datastore.ErrNoSuchEntity:
   228  			// Even if the builder isn't found, we may still want to notify if the build
   229  			// specifies email addresses to notify.
   230  			logging.Infof(c, "No builder %q found for project %q", builderID, luciProject)
   231  			return Notify(c, recipients, templateInput)
   232  		case err != nil:
   233  			return errors.Annotate(err, "failed to get builder").Tag(transient.Tag).Err()
   234  		}
   235  
   236  		// Create a new builder as a copy of the old, updated with build information.
   237  		updatedBuilder := builder
   238  		updatedBuilder.Status = build.Status
   239  		updatedBuilder.BuildTime = buildCreateTime
   240  		if len(checkout) > 0 {
   241  			updatedBuilder.GitilesCommits = checkout.ToGitilesCommits()
   242  		}
   243  
   244  		if builder.Repository == "" {
   245  			// No repository specified, so we follow build time ordering.
   246  			if builder.BuildTime.Before(buildCreateTime) {
   247  				// The build is in-order with respect to build time, so notify normally.
   248  				if err := notifyAndUpdateTrees(c, builder, builder.Status); err != nil {
   249  					return err
   250  				}
   251  				return putWithRetry(c, &updatedBuilder)
   252  			}
   253  			logging.Infof(c, "Found build with old time")
   254  
   255  			// Don't update trees, since it's out of order.
   256  			return notifyNoBlame(c, builder, 0)
   257  		}
   258  		// Repository specified, so we follow commit ordering.
   259  
   260  		if gCommit == nil || (gCommit != nil && gCommit.Id == "") {
   261  			// If there's no revision information, the build must be treated as out-
   262  			// of-order. For such builds we only perform non-on_change notifications,
   263  			// and do not update trees.
   264  			logging.Infof(c, "No revision information found for this build, treating as out-of-order...")
   265  			return notifyNoBlame(c, builder, 0)
   266  		}
   267  
   268  		builderRepoHost, builderRepoProject, _ := gitiles.ParseRepoURL(builder.Repository)
   269  		if builderRepoHost != gCommit.Host || builderRepoProject != gCommit.Project {
   270  			logging.Infof(c, "Builder %s triggered by commit to https://%s/%s"+
   271  				"instead of known https://%s, treating as out-of-order...",
   272  				builderID, gCommit.Host, gCommit.Project, builder.Repository)
   273  			// Only perform non-on_change notifications.
   274  			return notifyNoBlame(c, builder, 0)
   275  		}
   276  
   277  		// Update the new builder with revision information as we know it's now available.
   278  		updatedBuilder.Revision = gCommit.Id
   279  
   280  		// If there's no revision information on the Builder, this means the Builder
   281  		// is uninitialized. Notify about the build as best as we can and then store
   282  		// the updated builder.
   283  		if builder.Revision == "" {
   284  			if err := notifyAndUpdateTrees(c, builder, 0); err != nil {
   285  				return err
   286  			}
   287  			return putWithRetry(c, &updatedBuilder)
   288  		}
   289  		keepGoing = true
   290  		return nil
   291  	}, nil)
   292  	if err != nil {
   293  		return err
   294  	}
   295  	if !keepGoing {
   296  		return nil
   297  	}
   298  
   299  	// We have a builder with a Repository set (i.e. following commit ordering),
   300  	// and a build with commit ID set. Continue processing notifications.
   301  
   302  	// Get the revision history for the build-related commit.
   303  	commits, err := history(c, luciProject, gCommit.Host, gCommit.Project, builder.Revision, gCommit.Id)
   304  	if err != nil {
   305  		return errors.Annotate(err, "failed to retrieve git history for input commit").Err()
   306  	}
   307  	if len(commits) == 0 {
   308  		logging.Debugf(c, "Found build with old commit, not updating tree closers")
   309  		return notifyNoBlame(c, builder, 0)
   310  	}
   311  
   312  	// Get the blamelist logs, if needed.
   313  	var aggregateLogs Logs
   314  	aggregateRepoAllowset := BlamelistRepoAllowset(builder.Notifications)
   315  	if len(aggregateRepoAllowset) > 0 && len(checkout) > 0 {
   316  		oldCheckout := NewCheckout(builder.GitilesCommits)
   317  		aggregateLogs, err = ComputeLogs(c, luciProject, oldCheckout, checkout.Filter(aggregateRepoAllowset), history)
   318  		if err != nil {
   319  			return errors.Annotate(err, "failed to compute logs").Err()
   320  		}
   321  	}
   322  
   323  	// Update `builder`, and check if we need to store a newer version, then store it.
   324  	oldRepository := builder.Repository
   325  	err = datastore.RunInTransaction(c, func(c context.Context) error {
   326  		switch err := datastore.Get(c, &builder); {
   327  		case err == datastore.ErrNoSuchEntity:
   328  			return errors.New("builder deleted between datastore.Get calls")
   329  		case err != nil:
   330  			return err
   331  		}
   332  
   333  		// If the builder's repository got updated in the meanwhile, we need to throw a
   334  		// transient error and retry this whole thing.
   335  		if builder.Repository != oldRepository {
   336  			return errors.Reason("failed to notify because builder repository updated").Tag(transient.Tag).Err()
   337  		}
   338  
   339  		// Create a new builder as a copy of the old, updated with build information.
   340  		updatedBuilder := builder
   341  		updatedBuilder.Status = build.Status
   342  		updatedBuilder.BuildTime = buildCreateTime
   343  		updatedBuilder.Revision = gCommit.Id
   344  		if len(checkout) > 0 {
   345  			updatedBuilder.GitilesCommits = checkout.ToGitilesCommits()
   346  		}
   347  
   348  		index := commitIndex(commits, builder.Revision)
   349  		outOfOrder := false
   350  		switch {
   351  		// If the revision is not found, we can conclude that the Builder has
   352  		// advanced beyond gCommit.Revision. This is because:
   353  		//   1) builder.Revision only ever moves forward.
   354  		//   2) commits contains the git history up to gCommit.Revision.
   355  		case index < 0:
   356  			logging.Debugf(c, "Found build with old commit during transaction.")
   357  			outOfOrder = true
   358  
   359  		// If the revision is current, check build creation time.
   360  		case index == 0 && builder.BuildTime.After(buildCreateTime):
   361  			logging.Debugf(c, "Found build with the same commit but an old time.")
   362  			outOfOrder = true
   363  		}
   364  
   365  		if outOfOrder {
   366  			// If the build is out-of-order, we want to ignore only on_change notifications,
   367  			// and not update trees.
   368  			return notifyNoBlame(c, builder, 0)
   369  		}
   370  
   371  		// Notify, and include the blamelist.
   372  		n := Filter(c, &builder.Notifications, builder.Status, &build.Build)
   373  		recipients = append(recipients, ComputeRecipients(c, n, commits[:index], aggregateLogs)...)
   374  		templateInput.OldStatus = builder.Status
   375  
   376  		return parallel.FanOutIn(func(ch chan<- func() error) {
   377  			ch <- func() error { return Notify(c, recipients, templateInput) }
   378  			ch <- func() error { return putWithRetry(c, &updatedBuilder) }
   379  			ch <- func() error { return UpdateTreeClosers(c, build, 0) }
   380  		})
   381  	}, nil)
   382  	return errors.Annotate(err, "failed to save builder").Tag(transient.Tag).Err()
   383  }
   384  
   385  func newBuildsClient(c context.Context, host, project string) (buildbucketpb.BuildsClient, error) {
   386  	t, err := auth.GetRPCTransport(c, auth.AsProject, auth.WithProject(project))
   387  	if err != nil {
   388  		return nil, err
   389  	}
   390  	return buildbucketpb.NewBuildsPRPCClient(&prpc.Client{
   391  		C:    &http.Client{Transport: t},
   392  		Host: host,
   393  	}), nil
   394  }
   395  
   396  // BuildbucketPubSubHandler is the main entrypoint for a new update from buildbucket's pubsub.
   397  //
   398  // This handler delegates the actual processing of the build to handleBuild.
   399  // Its primary purpose is to unwrap context boilerplate and deal with progress-stopping errors.
   400  func BuildbucketPubSubHandler(ctx *router.Context) error {
   401  	c := ctx.Request.Context()
   402  	build, err := extractBuild(c, ctx.Request)
   403  	switch {
   404  	case err != nil:
   405  		return errors.Annotate(err, "failed to extract build").Err()
   406  
   407  	case build == nil:
   408  		// Ignore.
   409  		return nil
   410  
   411  	default:
   412  		return handleBuild(c, build, srcmanCheckout, gitilesHistory)
   413  	}
   414  }
   415  
   416  // Build is buildbucketpb.Build along with the parsed 'email_notify' values.
   417  type Build struct {
   418  	BuildbucketHostname string
   419  	buildbucketpb.Build
   420  	EmailNotify []EmailNotify
   421  }
   422  
   423  // extractBuild constructs a Build from the PubSub HTTP request.
   424  func extractBuild(c context.Context, r *http.Request) (*Build, error) {
   425  	// sent by pubsub.
   426  	// This struct is just convenient for unwrapping the json message
   427  	// See https://cloud.google.com/pubsub/docs/push#receive_push
   428  	var msg struct {
   429  		Message struct {
   430  			Data       []byte
   431  			Attributes map[string]any
   432  		}
   433  	}
   434  	if err := json.NewDecoder(r.Body).Decode(&msg); err != nil {
   435  		return nil, errors.Annotate(err, "could not decode message").Err()
   436  	}
   437  
   438  	// Handle the message from `builds_v2` pubsub topic.
   439  	if v, ok := msg.Message.Attributes["version"].(string); ok && v == "v2" {
   440  		buildsV2Msg := &buildbucketpb.BuildsV2PubSub{}
   441  		opts := protojson.UnmarshalOptions{AllowPartial: true, DiscardUnknown: true}
   442  		if err := opts.Unmarshal(msg.Message.Data, buildsV2Msg); err != nil {
   443  			return nil, errors.Annotate(err, "failed to unmarshal pubsub message into BuildsV2PubSub proto").Err()
   444  		}
   445  		// TODO(crbug.com/1408909): remove the logging after migration is done.
   446  		logging.Debugf(c, "receiving builds_v2 pubsub msg for build %d, status %s", buildsV2Msg.Build.GetId(), buildsV2Msg.Build.GetStatus())
   447  		// Double check the received build is completed, although the new subscription has a filter for it.
   448  		if buildsV2Msg.Build.Status&buildbucketpb.Status_ENDED_MASK != buildbucketpb.Status_ENDED_MASK {
   449  			logging.Infof(c, "Received build %d that hasn't completed yet, ignoring...", buildsV2Msg.Build.GetId())
   450  			return nil, nil
   451  		}
   452  		largeFieldsData, err := zlibDecompress(buildsV2Msg.BuildLargeFields)
   453  		if err != nil {
   454  			return nil, errors.Annotate(err, "failed to decompress build_large_fields for build %d", buildsV2Msg.Build.GetId()).Err()
   455  		}
   456  		largeFields := &buildbucketpb.Build{}
   457  		if err := proto.Unmarshal(largeFieldsData, largeFields); err != nil {
   458  			return nil, errors.Annotate(err, "failed to unmarshal build_large_fields for build %d", buildsV2Msg.Build.GetId()).Err()
   459  		}
   460  		proto.Merge(buildsV2Msg.Build, largeFields)
   461  
   462  		emails, err := extractEmailNotifyValues(buildsV2Msg.Build, "")
   463  		if err != nil {
   464  			return nil, errors.Annotate(err, "could not decode email_notify in builds_v2 pubsub message for build %d", buildsV2Msg.Build.GetId()).Err()
   465  		}
   466  
   467  		return &Build{
   468  			BuildbucketHostname: buildsV2Msg.Build.GetInfra().GetBuildbucket().GetHostname(),
   469  			Build:               *buildsV2Msg.Build,
   470  			EmailNotify:         emails,
   471  		}, nil
   472  	}
   473  
   474  	// TODO(crbug.com/1408909): remove the handling for old messages after migration is done.
   475  	// Handle the message from `builds` pubsub topic.
   476  	var message struct {
   477  		Build    bbv1.LegacyApiCommonBuildMessage
   478  		Hostname string
   479  	}
   480  	switch err := json.Unmarshal(msg.Message.Data, &message); {
   481  	case err != nil:
   482  		return nil, errors.Annotate(err, "could not parse pubsub message data").Err()
   483  	case !strings.HasPrefix(message.Build.Bucket, "luci."):
   484  		logging.Infof(c, "Received build that isn't part of LUCI, ignoring...")
   485  		return nil, nil
   486  	case message.Build.Status != bbv1.StatusCompleted:
   487  		logging.Infof(c, "Received build that hasn't completed yet, ignoring...")
   488  		return nil, nil
   489  	}
   490  
   491  	buildsClient, err := newBuildsClient(c, message.Hostname, message.Build.Project)
   492  	if err != nil {
   493  		return nil, err
   494  	}
   495  
   496  	logging.Infof(c, "fetching build %d", message.Build.Id)
   497  	res, err := buildsClient.GetBuild(c, &buildbucketpb.GetBuildRequest{
   498  		Id: message.Build.Id,
   499  		Fields: &field_mask.FieldMask{
   500  			Paths: []string{"*"},
   501  		},
   502  	})
   503  	switch {
   504  	case status.Code(err) == codes.NotFound:
   505  		logging.Warningf(c, "no access to build %d", message.Build.Id)
   506  		return nil, nil
   507  	case err != nil:
   508  		err = grpcutil.WrapIfTransient(err)
   509  		err = errors.Annotate(err, "could not fetch buildbucket build %d", message.Build.Id).Err()
   510  		return nil, err
   511  	}
   512  
   513  	emails, err := extractEmailNotifyValues(res, message.Build.ParametersJson)
   514  	if err != nil {
   515  		return nil, errors.Annotate(err, "could not decode email_notify").Err()
   516  	}
   517  
   518  	return &Build{
   519  		BuildbucketHostname: message.Hostname,
   520  		Build:               *res,
   521  		EmailNotify:         emails,
   522  	}, nil
   523  }
   524  
   525  // zlibDecompress decompresses data using zlib.
   526  func zlibDecompress(compressed []byte) ([]byte, error) {
   527  	r, err := zlib.NewReader(bytes.NewReader(compressed))
   528  	if err != nil {
   529  		return nil, err
   530  	}
   531  	originalData, err := io.ReadAll(r)
   532  	if err != nil {
   533  		return nil, err
   534  	}
   535  	if err := r.Close(); err != nil {
   536  		return nil, err
   537  	}
   538  	return originalData, nil
   539  }