go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/milo/internal/buildsource/buildbucket/build_sync.go (about)

     1  // Copyright 2017 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package buildbucket
    16  
    17  import (
    18  	"context"
    19  	"encoding/json"
    20  	"fmt"
    21  	"net/http"
    22  	"time"
    23  
    24  	"github.com/gomodule/redigo/redis"
    25  	"google.golang.org/genproto/protobuf/field_mask"
    26  	"google.golang.org/grpc/codes"
    27  	"google.golang.org/grpc/status"
    28  	"google.golang.org/protobuf/encoding/protojson"
    29  
    30  	buildbucketpb "go.chromium.org/luci/buildbucket/proto"
    31  	bbv1 "go.chromium.org/luci/common/api/buildbucket/buildbucket/v1"
    32  	"go.chromium.org/luci/common/clock"
    33  	"go.chromium.org/luci/common/errors"
    34  	"go.chromium.org/luci/common/logging"
    35  	"go.chromium.org/luci/common/retry/transient"
    36  	"go.chromium.org/luci/common/sync/parallel"
    37  	"go.chromium.org/luci/common/tsmon/field"
    38  	"go.chromium.org/luci/common/tsmon/metric"
    39  	"go.chromium.org/luci/gae/service/datastore"
    40  	"go.chromium.org/luci/server/auth"
    41  	"go.chromium.org/luci/server/redisconn"
    42  	"go.chromium.org/luci/server/router"
    43  
    44  	"go.chromium.org/luci/milo/internal/model"
    45  	"go.chromium.org/luci/milo/internal/model/milostatus"
    46  	"go.chromium.org/luci/milo/internal/utils"
    47  )
    48  
    49  // BuildSummaryStorageDuration is the maximum lifetime of a BuildSummary.
    50  //
    51  // Lifetime is the time elapsed since the Build creation time.
    52  // Cron runs periodically to scan and remove all the Builds of which lifetime
    53  // exceeded this duration.
    54  //
    55  // BuildSummaries are kept alive longer than builds in buildbuckets. So we can
    56  // compute blamelist for builds that are at the end of their lifetime.
    57  //
    58  // TODO(weiweilin): expose BuildStorageDuration from buildbucket and compute
    59  // BuildSummaryStorageDuration base on that (e.g. add two months). So we can
    60  // ensure BuildSummaries are kept alive longer than builds.
    61  const BuildSummaryStorageDuration = time.Hour * 24 * 30 * 20 // ~20 months
    62  
    63  // BuildSummarySyncThreshold is the maximum duration before Milo attempts to
    64  // sync non-terminal builds with buildbucket.
    65  //
    66  // Cron runs periodically to scan and sync all the non-terminal Builds of
    67  // that was updated more than `BuildSummarySyncThreshold` ago.
    68  const BuildSummarySyncThreshold = time.Hour * 24 * 2 // 2 days
    69  
    70  var (
    71  	deletedBuildsCounter = metric.NewCounter(
    72  		"luci/milo/cron/delete-builds/delete-count",
    73  		"The number of BuildSummaries deleted by Milo delete-builds cron job",
    74  		nil,
    75  	)
    76  
    77  	updatedBuildsCounter = metric.NewCounter(
    78  		"luci/milo/cron/sync-builds/updated-count",
    79  		"The number of BuildSummaries that are out of sync and updated by Milo sync-builds cron job",
    80  		nil,
    81  		field.String("project"),
    82  		field.String("bucket"),
    83  		field.String("builder"),
    84  		field.Int("number"),
    85  		field.Int("ID"),
    86  	)
    87  )
    88  
    89  // DeleteOldBuilds is a cron job that deletes BuildSummaries that are older than
    90  // BuildSummaryStorageDuration.
    91  func DeleteOldBuilds(c context.Context) error {
    92  	const (
    93  		batchSize = 1024
    94  		nWorkers  = 8
    95  	)
    96  
    97  	buildPurgeCutoffTime := clock.Now(c).Add(-BuildSummaryStorageDuration)
    98  	q := datastore.NewQuery("BuildSummary").
    99  		Lt("Created", buildPurgeCutoffTime).
   100  		Order("Created").
   101  		// Apply a limit so the call won't timeout.
   102  		Limit(batchSize * nWorkers * 32).
   103  		KeysOnly(true)
   104  
   105  	return parallel.FanOutIn(func(taskC chan<- func() error) {
   106  		buildsC := make(chan []*datastore.Key, nWorkers)
   107  		statsC := make(chan int, nWorkers)
   108  
   109  		// Collect stats.
   110  		taskC <- func() error {
   111  			for deletedCount := range statsC {
   112  				deletedBuildsCounter.Add(c, int64(deletedCount))
   113  			}
   114  
   115  			return nil
   116  		}
   117  
   118  		// Find builds to delete.
   119  		taskC <- func() error {
   120  			defer close(buildsC)
   121  
   122  			bsKeys := make([]*datastore.Key, 0, batchSize)
   123  			err := datastore.RunBatch(c, batchSize, q, func(key *datastore.Key) error {
   124  				bsKeys = append(bsKeys, key)
   125  				if len(bsKeys) == batchSize {
   126  					buildsC <- bsKeys
   127  					bsKeys = make([]*datastore.Key, 0, batchSize)
   128  				}
   129  				return nil
   130  			})
   131  			if err != nil {
   132  				return err
   133  			}
   134  
   135  			if len(bsKeys) > 0 {
   136  				buildsC <- bsKeys
   137  			}
   138  			return nil
   139  		}
   140  
   141  		// Spawn workers to delete builds.
   142  		taskC <- func() error {
   143  			defer close(statsC)
   144  
   145  			return parallel.WorkPool(nWorkers, func(workC chan<- func() error) {
   146  				for bks := range buildsC {
   147  					// Bind to a local variable so each worker can have their own copy.
   148  					bks := bks
   149  					workC <- func() error {
   150  						// Flatten first w/o filtering to calculate how many builds were
   151  						// actually removed.
   152  						err := errors.Flatten(datastore.Delete(c, bks))
   153  
   154  						allErrs := 0
   155  						if err != nil {
   156  							allErrs = len(err.(errors.MultiError))
   157  						}
   158  
   159  						statsC <- len(bks) - allErrs
   160  
   161  						return err
   162  					}
   163  				}
   164  			})
   165  		}
   166  	})
   167  }
   168  
   169  var summaryBuildMask = &field_mask.FieldMask{
   170  	Paths: []string{
   171  		"id",
   172  		"builder",
   173  		"number",
   174  		"create_time",
   175  		"start_time",
   176  		"end_time",
   177  		"update_time",
   178  		"status",
   179  		"summary_markdown",
   180  		"tags",
   181  		"infra.buildbucket.hostname",
   182  		"infra.swarming",
   183  		"input.experimental",
   184  		"input.gitiles_commit",
   185  		"output.properties",
   186  		"critical",
   187  	},
   188  }
   189  
   190  // V2PubSubHandler is a webhook that stores the builds coming in from pubsub.
   191  func V2PubSubHandler(ctx *router.Context) {
   192  	err := v2PubSubHandlerImpl(ctx.Request.Context(), ctx.Request)
   193  	if err != nil {
   194  		logging.Errorf(ctx.Request.Context(), "error while handling pubsub event")
   195  		errors.Log(ctx.Request.Context(), err)
   196  	}
   197  	if transient.Tag.In(err) {
   198  		// Transient errors are 4xx so that PubSub retries them.
   199  		// TODO(crbug.com/1099036): Address High traffic builders causing errors.
   200  		ctx.Writer.WriteHeader(http.StatusTooEarly)
   201  		return
   202  	}
   203  	// No errors or non-transient errors are 200s so that PubSub does not retry
   204  	// them.
   205  	ctx.Writer.WriteHeader(http.StatusOK)
   206  }
   207  
   208  // v2PubSubHandlerImpl takes the http.Request, expects to find
   209  // a common.PubSubSubscription JSON object in the Body, containing a
   210  // BuildsV2PubSub, and handles the contents with generateSummary.
   211  func v2PubSubHandlerImpl(c context.Context, r *http.Request) error {
   212  	msg := utils.PubSubSubscription{}
   213  	if err := json.NewDecoder(r.Body).Decode(&msg); err != nil {
   214  		// This might be a transient error, e.g. when the json format changes
   215  		// and Milo isn't updated yet.
   216  		return errors.Annotate(err, "could not decode message").Tag(transient.Tag).Err()
   217  	}
   218  	bData, err := msg.GetData()
   219  	if err != nil {
   220  		return errors.Annotate(err, "could not parse pubsub message string").Err()
   221  	}
   222  	buildsV2Msg := &buildbucketpb.BuildsV2PubSub{}
   223  	opts := protojson.UnmarshalOptions{AllowPartial: true, DiscardUnknown: true}
   224  	if err := opts.Unmarshal(bData, buildsV2Msg); err != nil {
   225  		return err
   226  	}
   227  
   228  	return processBuild(c, buildsV2Msg.Build)
   229  }
   230  
   231  // PubSubHandler is a webhook that stores the builds coming in from pubsub.
   232  func PubSubHandler(ctx *router.Context) {
   233  	err := pubSubHandlerImpl(ctx.Request.Context(), ctx.Request)
   234  	if err != nil {
   235  		logging.Errorf(ctx.Request.Context(), "error while handling pubsub event")
   236  		errors.Log(ctx.Request.Context(), err)
   237  	}
   238  	if transient.Tag.In(err) {
   239  		// Transient errors are 4xx so that PubSub retries them.
   240  		// TODO(crbug.com/1099036): Address High traffic builders causing errors.
   241  		ctx.Writer.WriteHeader(http.StatusTooEarly)
   242  		return
   243  	}
   244  	// No errors or non-transient errors are 200s so that PubSub does not retry
   245  	// them.
   246  	ctx.Writer.WriteHeader(http.StatusOK)
   247  }
   248  
   249  // pubSubHandlerImpl takes the http.Request, expects to find
   250  // a common.PubSubSubscription JSON object in the Body, containing a bbPSEvent,
   251  // and handles the contents with generateSummary.
   252  func pubSubHandlerImpl(c context.Context, r *http.Request) error {
   253  	msg := utils.PubSubSubscription{}
   254  	if err := json.NewDecoder(r.Body).Decode(&msg); err != nil {
   255  		// This might be a transient error, e.g. when the json format changes
   256  		// and Milo isn't updated yet.
   257  		return errors.Annotate(err, "could not decode message").Tag(transient.Tag).Err()
   258  	}
   259  	if v, ok := msg.Message.Attributes["version"].(string); ok && v != "v1" {
   260  		// TODO(nodir): switch to v2, crbug.com/826006
   261  		logging.Debugf(c, "unsupported pubsub message version %q. Ignoring", v)
   262  		return nil
   263  	}
   264  	bData, err := msg.GetData()
   265  	if err != nil {
   266  		return errors.Annotate(err, "could not parse pubsub message string").Err()
   267  	}
   268  
   269  	event := struct {
   270  		Build    bbv1.LegacyApiCommonBuildMessage `json:"build"`
   271  		Hostname string                           `json:"hostname"`
   272  	}{}
   273  	if err := json.Unmarshal(bData, &event); err != nil {
   274  		return errors.Annotate(err, "could not parse pubsub message data").Err()
   275  	}
   276  
   277  	client, err := BuildsClient(c, event.Hostname, auth.AsSelf)
   278  	if err != nil {
   279  		return err
   280  	}
   281  	build, err := client.GetBuild(c, &buildbucketpb.GetBuildRequest{
   282  		Id:     event.Build.Id,
   283  		Fields: summaryBuildMask,
   284  	})
   285  	if err != nil {
   286  		return err
   287  	}
   288  	return processBuild(c, build)
   289  }
   290  
   291  func processBuild(c context.Context, build *buildbucketpb.Build) error {
   292  	// TODO(iannucci,nodir): get the bot context too
   293  	// TODO(iannucci,nodir): support manifests/got_revision
   294  	bs, err := model.BuildSummaryFromBuild(c, build.Infra.Buildbucket.Hostname, build)
   295  	if err != nil {
   296  		return err
   297  	}
   298  	if err := bs.AddManifestKeysFromBuildSets(c); err != nil {
   299  		return err
   300  	}
   301  
   302  	logging.Debugf(c, "Received from %s: build %s (%s)\n%v",
   303  		build.Infra.Buildbucket.Hostname, bs.ProjectID, bs.BuildID, bs.Summary.Status, bs)
   304  
   305  	updateBuilderSummary, err := shouldUpdateBuilderSummary(c, bs)
   306  	if err != nil {
   307  		updateBuilderSummary = true
   308  		logging.WithError(err).Warningf(c, "failed to determine whether the builder summary from %s should be updated. Fallback to always update.", bs.BuilderID)
   309  	}
   310  	err = transient.Tag.Apply(datastore.RunInTransaction(c, func(c context.Context) error {
   311  		curBS := &model.BuildSummary{BuildKey: bs.BuildKey}
   312  		if err := datastore.Get(c, curBS); err != nil && err != datastore.ErrNoSuchEntity {
   313  			return errors.Annotate(err, "reading current BuildSummary").Err()
   314  		}
   315  
   316  		if bs.Version <= curBS.Version {
   317  			logging.Warningf(c, "current BuildSummary is newer: %d <= %d",
   318  				bs.Version, curBS.Version)
   319  			return nil
   320  		}
   321  
   322  		if err := datastore.Put(c, bs); err != nil {
   323  			return err
   324  		}
   325  
   326  		if !updateBuilderSummary {
   327  			return nil
   328  		}
   329  
   330  		return model.UpdateBuilderForBuild(c, bs)
   331  	}, nil))
   332  
   333  	return err
   334  }
   335  
   336  var (
   337  	// Sustained datastore updates should not be higher than once per second per
   338  	// entity.
   339  	// See https://cloud.google.com/datastore/docs/concepts/limits.
   340  	entityUpdateIntervalInS int64 = 2
   341  
   342  	// A Redis LUA script that update the key with the new integer value if and
   343  	// only if the provided value is greater than the recorded value (0 if none
   344  	// were recorded). It returns 1 if the value is updated and 0 otherwise.
   345  	updateIfLargerScript = redis.NewScript(1, `
   346  local newValueStr = ARGV[1]
   347  local newValue = tonumber(newValueStr)
   348  local existingValueStr = redis.call('GET', KEYS[1])
   349  local existingValue = tonumber(existingValueStr) or 0
   350  if newValue < existingValue then
   351  	return 0
   352  elseif newValue == existingValue then
   353  	-- large u64/i64 (>2^53) may lose precision after being converted to f64.
   354  	-- Compare the last 5 digits if the f64 presentation of the integers are the
   355  	-- same.
   356  	local newValue = tonumber(string.sub(newValueStr, -5)) or 0
   357  	local existingValue = tonumber(string.sub(existingValueStr, -5)) or 0
   358  	if newValue <= existingValue then
   359  		return 0
   360  	end
   361  end
   362  
   363  redis.call('SET', KEYS[1], newValueStr, 'EX', ARGV[2])
   364  return 1
   365  `)
   366  )
   367  
   368  // shouldUpdateBuilderSummary determines whether the builder summary should be
   369  // updated with the provided build summary.
   370  //
   371  // If the function is called with builds from the same builder multiple times
   372  // within a time bucket, it will block the function until the start of the next
   373  // time bucket and only return true for the call with the lastest created build
   374  // with a terminal status, and return false for other calls occurred within the
   375  // same time bucket.
   376  func shouldUpdateBuilderSummary(c context.Context, buildSummary *model.BuildSummary) (bool, error) {
   377  	if !buildSummary.Summary.Status.Terminal() {
   378  		return false, nil
   379  	}
   380  
   381  	conn, err := redisconn.Get(c)
   382  	if err != nil {
   383  		return true, err
   384  	}
   385  	defer conn.Close()
   386  
   387  	createdAt := buildSummary.Created.UnixNano()
   388  
   389  	now := clock.Now(c).Unix()
   390  	thisTimeBucket := time.Unix(now-now%entityUpdateIntervalInS, 0)
   391  	thisTimeBucketKey := fmt.Sprintf("%s:%v", buildSummary.BuilderID, thisTimeBucket.Unix())
   392  
   393  	// Check if there's a builder summary update occurred in this time bucket.
   394  	_, err = redis.String(conn.Do("SET", thisTimeBucketKey, createdAt, "NX", "EX", entityUpdateIntervalInS+1))
   395  	switch err {
   396  	case redis.ErrNil:
   397  		// continue
   398  	case nil:
   399  		// There's no builder summary update occurred in this time bucket yet, we
   400  		// should run the update.
   401  		return true, nil
   402  	default:
   403  		return true, err
   404  	}
   405  
   406  	// There's already a builder summary update occurred in this time bucket. Try
   407  	// scheduling the update for the next time bucket instead.
   408  	nextTimeBucket := thisTimeBucket.Add(time.Duration(entityUpdateIntervalInS * int64(time.Second)))
   409  	nextTimeBucketKey := fmt.Sprintf("%s:%v", buildSummary.BuilderID, nextTimeBucket.Unix())
   410  	replaced, err := redis.Int(updateIfLargerScript.Do(conn, nextTimeBucketKey, createdAt, entityUpdateIntervalInS+1))
   411  	if err != nil {
   412  		return true, err
   413  	}
   414  
   415  	if replaced == 0 {
   416  		// There's already an update with a newer build scheduled for the next time
   417  		// bucket, skip the update.
   418  		return false, nil
   419  	}
   420  
   421  	// Wait until the start of the next time bucket.
   422  	clock.Sleep(c, clock.Until(c, nextTimeBucket))
   423  	newCreatedAt, err := redis.Int64(conn.Do("GET", nextTimeBucketKey))
   424  	if err != nil {
   425  		return true, err
   426  	}
   427  
   428  	// If the update event was not replaced by event by a newer build, we should
   429  	// run the update with this build.
   430  	if newCreatedAt == createdAt {
   431  		return true, nil
   432  	}
   433  
   434  	// Otherwise, skip the update.
   435  	logging.Infof(c, "skipping BuilderSummary update from builder %s because a newer build was found", buildSummary.BuilderID)
   436  	return false, nil
   437  }
   438  
   439  // SyncBuilds is a cron job that sync BuildSummaries that are not in terminal
   440  // state and haven't been updated recently.
   441  func SyncBuilds(c context.Context) error {
   442  	c = WithBuildsClientFactory(c, ProdBuildsClientFactory)
   443  	return syncBuildsImpl(c)
   444  }
   445  
   446  func syncBuildsImpl(c context.Context) error {
   447  	const (
   448  		batchSize = 16
   449  		nWorkers  = 64
   450  	)
   451  
   452  	buildSyncCutoffTime := clock.Now(c).Add(-BuildSummarySyncThreshold)
   453  
   454  	return parallel.FanOutIn(func(taskC chan<- func() error) {
   455  		buildsC := make(chan []*model.BuildSummary, nWorkers)
   456  		updatedBuildC := make(chan *buildbucketpb.Build, nWorkers)
   457  
   458  		// Collect stats.
   459  		taskC <- func() error {
   460  			for b := range updatedBuildC {
   461  				updatedBuildsCounter.Add(c, 1, b.Builder.Project, b.Builder.Bucket, b.Builder.Builder, b.Number, b.Id)
   462  			}
   463  			return nil
   464  		}
   465  
   466  		// Find builds to update.
   467  		taskC <- func() error {
   468  			defer close(buildsC)
   469  
   470  			batch := make([]*model.BuildSummary, 0, batchSize)
   471  			findBuilds := func(q *datastore.Query) error {
   472  				return datastore.RunBatch(c, batchSize, q, func(bs *model.BuildSummary) error {
   473  					batch = append(batch, bs)
   474  					if len(batch) == batchSize {
   475  						buildsC <- batch
   476  						batch = make([]*model.BuildSummary, 0, batchSize)
   477  					}
   478  					return nil
   479  				})
   480  			}
   481  
   482  			pendingBuildsQuery := datastore.NewQuery("BuildSummary").
   483  				Eq("Summary.Status", milostatus.NotRun).
   484  				Lt("Version", buildSyncCutoffTime.UnixNano()).
   485  				// Apply a limit so the call won't timeout.
   486  				Limit(batchSize * nWorkers * 16)
   487  			err := findBuilds(pendingBuildsQuery)
   488  			if err != nil {
   489  				return err
   490  			}
   491  
   492  			runningBuildsQuery := datastore.NewQuery("BuildSummary").
   493  				Eq("Summary.Status", milostatus.Running).
   494  				Lt("Version", buildSyncCutoffTime.UnixNano()).
   495  				// Apply a limit so the call won't timeout.
   496  				Limit(batchSize * nWorkers * 16)
   497  			err = findBuilds(runningBuildsQuery)
   498  			if err != nil {
   499  				return err
   500  			}
   501  
   502  			if len(batch) > 0 {
   503  				buildsC <- batch
   504  			}
   505  			return nil
   506  		}
   507  
   508  		// Spawn workers to update builds.
   509  		taskC <- func() error {
   510  			defer close(updatedBuildC)
   511  
   512  			return parallel.WorkPool(nWorkers, func(workC chan<- func() error) {
   513  				for builds := range buildsC {
   514  					// Bind to a local variable so each worker can have their own copy.
   515  					builds := builds
   516  					workC <- func() error {
   517  						for _, bs := range builds {
   518  							host, err := bs.GetHost()
   519  							if err != nil {
   520  								logging.WithError(err).Errorf(c, "failed to get host for build summary %v, skipping", bs.BuildKey)
   521  								continue
   522  							}
   523  
   524  							client, err := BuildsClient(c, host, auth.AsSelf)
   525  							if err != nil {
   526  								return err
   527  							}
   528  
   529  							var buildID int64 = 0
   530  							builder, buildNum, err := utils.ParseLegacyBuildbucketBuildID(bs.BuildID)
   531  							if err != nil {
   532  								// If the BuildID is not the legacy build ID, trying parsing it as
   533  								// the new build ID.
   534  								buildID, err = utils.ParseBuildbucketBuildID(bs.BuildID)
   535  								if err != nil {
   536  									return err
   537  								}
   538  							}
   539  
   540  							req := &buildbucketpb.GetBuildRequest{
   541  								Id:          buildID,
   542  								BuildNumber: buildNum,
   543  								Builder:     builder,
   544  								Fields:      summaryBuildMask,
   545  							}
   546  							newBuild, err := client.GetBuild(c, req)
   547  							if status.Code(err) == codes.NotFound {
   548  								logging.Warningf(c, "build %v not found on buildbucket. deleting it.\nrequest: %v", bs.BuildKey, req)
   549  								err := datastore.Delete(c, bs)
   550  								if err != nil {
   551  									err = errors.Annotate(err, "failed to delete build %v", bs.BuildKey).Err()
   552  									return err
   553  								}
   554  								continue
   555  							} else if err != nil {
   556  								err = errors.Annotate(err, "could not fetch build %v from buildbucket.\nrequest: %v", bs.BuildKey, req).Err()
   557  								return err
   558  							}
   559  
   560  							newBS, err := model.BuildSummaryFromBuild(c, host, newBuild)
   561  							if err != nil {
   562  								return err
   563  							}
   564  							if err := newBS.AddManifestKeysFromBuildSets(c); err != nil {
   565  								return err
   566  							}
   567  
   568  							// Ensure the new BuildSummary has the same key as the old
   569  							// BuildSummary, so we won't ends up having two copies of the same
   570  							// BuildSummary in the datastore.
   571  							newBS.BuildKey = bs.BuildKey
   572  							newBS.BuildID = bs.BuildID
   573  
   574  							if !newBS.Summary.Status.Terminal() {
   575  								continue
   576  							}
   577  
   578  							if err := datastore.Put(c, newBS); err != nil {
   579  								return err
   580  							}
   581  
   582  							updatedBuildC <- newBuild
   583  						}
   584  
   585  						return nil
   586  					}
   587  				}
   588  			})
   589  		}
   590  	})
   591  }