go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/tasks/swarming.go

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tasks
    16  
    17  import (
    18  	"context"
    19  	"encoding/base64"
    20  	"encoding/json"
    21  	"fmt"
    22  	"io"
    23  	"path/filepath"
    24  	"sort"
    25  	"strconv"
    26  	"strings"
    27  	"time"
    28  
    29  	"github.com/google/uuid"
    30  	"google.golang.org/api/googleapi"
    31  	"google.golang.org/protobuf/proto"
    32  
    33  	"go.chromium.org/luci/common/clock"
    34  	"go.chromium.org/luci/common/errors"
    35  	"go.chromium.org/luci/common/logging"
    36  	"go.chromium.org/luci/common/retry/transient"
    37  	"go.chromium.org/luci/gae/service/datastore"
    38  	"go.chromium.org/luci/gae/service/info"
    39  	"go.chromium.org/luci/server/auth/realms"
    40  	"go.chromium.org/luci/server/caching"
    41  	"go.chromium.org/luci/server/tq"
    42  	apipb "go.chromium.org/luci/swarming/proto/api_v2"
    43  
    44  	"go.chromium.org/luci/buildbucket"
    45  	"go.chromium.org/luci/buildbucket/appengine/internal/buildstatus"
    46  	"go.chromium.org/luci/buildbucket/appengine/internal/buildtoken"
    47  	"go.chromium.org/luci/buildbucket/appengine/internal/clients"
    48  	"go.chromium.org/luci/buildbucket/appengine/internal/metrics"
    49  	"go.chromium.org/luci/buildbucket/appengine/model"
    50  	taskdefs "go.chromium.org/luci/buildbucket/appengine/tasks/defs"
    51  	"go.chromium.org/luci/buildbucket/cmd/bbagent/bbinput"
    52  	pb "go.chromium.org/luci/buildbucket/proto"
    53  	"go.chromium.org/luci/buildbucket/protoutil"
    54  )
    55  
    56  const (
    57  	// cacheDir is the path, relative to the swarming run dir, to the directory that
    58  	// contains the mounted swarming named caches. It will be prepended to paths of
    59  	// caches defined in global or builder configs.
    60  	cacheDir = "cache"
    61  
    62  	// pubsubTopicTemplate is the topic template where Swarming publishes
    63  	// notifications on task updates.
    64  	pubsubTopicTemplate = "projects/%s/topics/swarming-go"
    65  
    66  	// swarmingCreateTaskGiveUpTimeout indicates how long to retry
    67  	// the createSwarmingTask before giving up with INFRA_FAILURE.
    68  	swarmingCreateTaskGiveUpTimeout = 10 * 60 * time.Second
    69  
    70  	// swarmingTimeFormat is time format used by swarming.
    71  	swarmingTimeFormat = "2006-01-02T15:04:05.999999999"
    72  )
    73  
    74  // userdata is round-tripped between Buildbucket and Swarming via the task's PubSub userdata field.
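        //
        // An illustrative JSON payload (hypothetical values), with created_ts in
        // microseconds since the Unix epoch:
        //
        //	{"build_id": 87654321, "created_ts": 1709960000000000, "swarming_hostname": "swarming.example.com"}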
    75  type userdata struct {
    76  	BuildID          int64  `json:"build_id"`
    77  	CreatedTS        int64  `json:"created_ts"`
    78  	SwarmingHostname string `json:"swarming_hostname"`
    79  }
    80  
    81  // notification captures all fields that Buildbucket needs from a message on the Swarming notification subscription.
    82  type notification struct {
    83  	messageID string
    84  	taskID    string
    85  	*userdata
    86  }
    87  
    88  // SyncBuild synchronizes the build with Swarming.
    89  // If the swarming task does not exist yet, creates it.
    90  // Otherwise, updates the build state to match swarming task state.
    91  // Enqueues a new sync push task if the build has not ended.
    92  //
    93  // The Cloud Tasks handler will retry the task on any returned error, unless
    94  // it's tagged with tq.Fatal.
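        //
        // The task re-enqueues itself (with generation+1) every 5 minutes until the
        // build ends or exceeds model.BuildMaxCompletionTime.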
    95  func SyncBuild(ctx context.Context, buildID int64, generation int64) error {
    96  	bld := &model.Build{ID: buildID}
    97  	infra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, bld)}
    98  	switch err := datastore.Get(ctx, bld, infra); {
    99  	case errors.Contains(err, datastore.ErrNoSuchEntity):
   100  		return tq.Fatal.Apply(errors.Annotate(err, "build %d or buildInfra not found", buildID).Err())
   101  	case err != nil:
   102  		return transient.Tag.Apply(errors.Annotate(err, "failed to fetch build %d or buildInfra", buildID).Err())
   103  	}
   104  	if protoutil.IsEnded(bld.Status) {
   105  		logging.Infof(ctx, "build %d is ended", buildID)
   106  		return nil
   107  	}
   108  	if clock.Now(ctx).Sub(bld.CreateTime) > model.BuildMaxCompletionTime {
   109  		logging.Infof(ctx, "build %d (create_time:%s) has passed the sync deadline: %s", buildID, bld.CreateTime, model.BuildMaxCompletionTime)
   110  		return nil
   111  	}
   112  
   113  	bld.Proto.Infra = infra.Proto
   114  	swarm, err := clients.NewSwarmingClient(ctx, infra.Proto.Swarming.Hostname, bld.Project)
   115  	if err != nil {
   116  		logging.Errorf(ctx, "failed to create a swarming client for build %d (%s), in %s: %s", buildID, bld.Project, infra.Proto.Swarming.Hostname, err)
   117  		return failBuild(ctx, buildID, fmt.Sprintf("failed to create a swarming client:%s", err))
   118  	}
   119  	if bld.Proto.Infra.Swarming.TaskId == "" {
   120  		if err := createSwarmingTask(ctx, bld, swarm); err != nil {
   121  			// Mark the build as INFRA_FAILURE for fatal and non-retryable errors.
   122  			if tq.Fatal.In(err) {
   123  				return failBuild(ctx, bld.ID, err.Error())
   124  			}
   125  			return err
   126  		}
   127  	} else {
   128  		if err := syncBuildWithTaskResult(ctx, bld.ID, bld.Proto.Infra.Swarming.TaskId, swarm); err != nil {
   129  			// TQ should retry non-fatal errors.
   130  			if !tq.Fatal.In(err) {
   131  				return err
   132  			}
   133  			// For fatal errors, we just log it and continue to the part of enqueueing
   134  			// the next generation sync task.
   135  			logging.Errorf(ctx, "Dropping the sync task due to the fatal error: %s", err)
   136  		}
   137  	}
   138  
   139  	// Enqueue a continuation sync task in 5m.
   140  	if clock.Now(ctx).Sub(bld.CreateTime) < model.BuildMaxCompletionTime {
   141  		if err := SyncSwarmingBuildTask(ctx, &taskdefs.SyncSwarmingBuildTask{BuildId: buildID, Generation: generation + 1}, 5*time.Minute); err != nil {
   142  			return transient.Tag.Apply(errors.Annotate(err, "failed to enqueue the continuation sync task for build %d", buildID).Err())
   143  		}
   144  	}
   145  	return nil
   146  }
   147  
   148  // SubNotify handles swarming-go PubSub push messages produced by Swarming.
   149  // Retryable errors are tagged with transient.Tag.
   150  func SubNotify(ctx context.Context, body io.Reader) error {
   151  	nt, err := unpackMsg(ctx, body)
   152  	if err != nil {
   153  		return err
   154  	}
   155  	// TODO(crbug/1328646): delete the log once the new Go flow becomes stable.
   156  	logging.Infof(ctx, "Received message - messageID:%s, taskID:%s, userdata:%+v", nt.messageID, nt.taskID, nt.userdata)
   157  
   158  	// Try not to process the same message more than once.
   159  	cache := caching.GlobalCache(ctx, "swarming-pubsub-msg-id")
   160  	if cache == nil {
   161  		return errors.Reason("global cache is not found").Tag(transient.Tag).Err()
   162  	}
   163  	msgCached, err := cache.Get(ctx, nt.messageID)
   164  	switch {
   165  	case err == caching.ErrCacheMiss: // no-op, continue
   166  	case err != nil:
   167  		return errors.Annotate(err, "failed to read %s from the global cache", nt.messageID).Tag(transient.Tag).Err()
   168  	case msgCached != nil:
   169  		logging.Infof(ctx, "seen this message %s before, ignoring", nt.messageID)
   170  		return nil
   171  	}
   172  
   173  	taskURL := func(hostname, taskID string) string {
   174  		return fmt.Sprintf("https://%s/task?id=%s", hostname, taskID)
   175  	}
   176  	// load build and build infra.
   177  	logging.Infof(ctx, "received swarming notification for build %d", nt.BuildID)
   178  	bld := &model.Build{ID: nt.BuildID}
   179  	infra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, bld)}
   180  	switch err := datastore.Get(ctx, bld, infra); {
   181  	case errors.Contains(err, datastore.ErrNoSuchEntity):
   182  		if clock.Now(ctx).Sub(time.Unix(0, nt.CreatedTS*int64(time.Microsecond)).UTC()) < time.Minute {
   183  			return errors.Annotate(err, "Build %d or BuildInfra for task %s not found yet", nt.BuildID, taskURL(nt.SwarmingHostname, nt.taskID)).Tag(transient.Tag).Err()
   184  		}
   185  		return errors.Annotate(err, "Build %d or BuildInfra for task %s not found", nt.BuildID, taskURL(nt.SwarmingHostname, nt.taskID)).Err()
   186  	case err != nil:
   187  		return errors.Annotate(err, "failed to fetch build %d or buildInfra", nt.BuildID).Tag(transient.Tag).Err()
   188  	}
   189  	if protoutil.IsEnded(bld.Status) {
   190  		logging.Infof(ctx, "build(%d) is completed and immutable.", nt.BuildID)
   191  		return nil
   192  	}
   193  
   194  	// ensure the loaded build is associated with the task.
   195  	bld.Proto.Infra = infra.Proto
   196  	sw := bld.Proto.GetInfra().GetSwarming()
   197  	if nt.SwarmingHostname != sw.GetHostname() {
   198  		return errors.Reason("swarming_hostname %s of build %d does not match %s", sw.Hostname, nt.BuildID, nt.SwarmingHostname).Err()
   199  	}
   200  	if strings.TrimSpace(sw.GetTaskId()) == "" {
   201  		return errors.Reason("build %d is not associated with a task", nt.BuildID).Tag(transient.Tag).Err()
   202  	}
   203  	if nt.taskID != sw.GetTaskId() {
   204  		return errors.Reason("swarming_task_id %s of build %d does not match %s", sw.TaskId, nt.BuildID, nt.taskID).Err()
   205  	}
   206  
   207  	// update build.
   208  	swarm, err := clients.NewSwarmingClient(ctx, sw.Hostname, bld.Project)
   209  	if err != nil {
   210  		return errors.Annotate(err, "failed to create a swarming client for build %d (%s), in %s", nt.BuildID, bld.Project, sw.Hostname).Err()
   211  	}
   212  	if err := syncBuildWithTaskResult(ctx, nt.BuildID, sw.TaskId, swarm); err != nil {
   213  		return err
   214  	}
   215  
   216  	return cache.Set(ctx, nt.messageID, []byte{1}, 10*time.Minute)
   217  }
   218  
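        // HandleCancelSwarmingTask sends a cancellation request for the given
        // Swarming task, killing it if it is already running.
        //
        // Errors that should not be retried are tagged with tq.Fatal.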
   219  func HandleCancelSwarmingTask(ctx context.Context, hostname string, taskID string, realm string) error {
   220  	// Validate
   221  	switch err := realms.ValidateRealmName(realm, realms.GlobalScope); {
   222  	case err != nil:
   223  		return tq.Fatal.Apply(err)
   224  	case hostname == "":
   225  		return errors.Reason("hostname is empty").Tag(tq.Fatal).Err()
   226  	case taskID == "":
   227  		return errors.Reason("taskID is empty").Tag(tq.Fatal).Err()
   228  	}
   229  
   230  	// Send the cancellation request.
   231  	project, _ := realms.Split(realm)
   232  	swarm, err := clients.NewSwarmingClient(ctx, hostname, project)
   233  	if err != nil {
   234  		return errors.Annotate(err, "failed to create a swarming client for task %s in %s", taskID, hostname).Tag(tq.Fatal).Err()
   235  	}
   236  	res, err := swarm.CancelTask(ctx, &apipb.TaskCancelRequest{KillRunning: true, TaskId: taskID})
   237  	if err != nil {
   238  		if apiErr, ok := err.(*googleapi.Error); ok && apiErr.Code >= 500 {
   239  			return errors.Annotate(err, "transient error in cancelling the task %s", taskID).Tag(transient.Tag).Err()
   240  		}
   241  		return errors.Annotate(err, "fatal error in cancelling the task %s", taskID).Tag(tq.Fatal).Err()
   242  	}
   243  
   244  	// A non-canceled response indicates the task may have already ended, so just log it.
   245  	if !res.Canceled {
   246  		logging.Warningf(ctx, "Swarming response for cancelling task %s: %+v", taskID, res)
   247  	}
   248  	return nil
   249  }
   250  
   251  // unpackMsg unpacks a swarming-go pubsub message and extracts the message id,
   252  // swarming hostname, creation time, task id and build id.
   253  func unpackMsg(ctx context.Context, body io.Reader) (*notification, error) {
   254  	blob, err := io.ReadAll(body)
   255  	if err != nil {
   256  		return nil, errors.Annotate(err, "failed to read the request body").Tag(transient.Tag).Err()
   257  	}
   258  
   259  	// process pubsub message
   260  	// See https://cloud.google.com/pubsub/docs/push#receive_push
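        	//
        	// An abridged, illustrative push body (values are hypothetical):
        	//
        	//	{
        	//	  "message": {
        	//	    "attributes": {"key": "value"},
        	//	    "data": "<base64-encoded swarming payload>",
        	//	    "messageId": "1234567890"
        	//	  },
        	//	  "subscription": "projects/.../subscriptions/..."
        	//	}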
   261  	var msg struct {
   262  		Message struct {
   263  			Attributes map[string]string `json:"attributes,omitempty"`
   264  			Data       string            `json:"data,omitempty"`
   265  			MessageID  string            `json:"messageId,omitempty"`
   266  		} `json:"message"`
   267  	}
   268  	if err := json.Unmarshal(blob, &msg); err != nil {
   269  		return nil, errors.Annotate(err, "failed to unmarshal swarming PubSub message").Err()
   270  	}
   271  
   272  	// process swarming message data
   273  	swarmData, err := base64.StdEncoding.DecodeString(msg.Message.Data)
   274  	if err != nil {
   275  		return nil, errors.Annotate(err, "cannot decode message data as base64").Err()
   276  	}
   277  	var data struct {
   278  		TaskID   string `json:"task_id"`
   279  		Userdata string `json:"userdata"`
   280  	}
   281  	if err := json.Unmarshal(swarmData, &data); err != nil {
   282  		return nil, errors.Annotate(err, "failed to unmarshal the swarming pubsub data").Err()
   283  	}
   284  	ud := &userdata{}
   285  	if err := json.Unmarshal([]byte(data.Userdata), ud); err != nil {
   286  		return nil, errors.Annotate(err, "failed to unmarshal userdata").Err()
   287  	}
   288  
   289  	// validate swarming message data
   290  	switch {
   291  	case strings.TrimSpace(data.TaskID) == "":
   292  		return nil, errors.Reason("task_id not found in message data").Err()
   293  	case ud.BuildID <= 0:
   294  		return nil, errors.Reason("invalid build_id %d", ud.BuildID).Err()
   295  	case ud.CreatedTS <= 0:
   296  		return nil, errors.Reason("invalid created_ts %d", ud.CreatedTS).Err()
   297  	case strings.TrimSpace(ud.SwarmingHostname) == "":
   298  		return nil, errors.Reason("swarming hostname not found in userdata").Err()
   299  	case strings.Contains(ud.SwarmingHostname, "://"):
   300  		return nil, errors.Reason("swarming hostname %s must not contain '://'", ud.SwarmingHostname).Err()
   301  	}
   302  
   303  	return &notification{
   304  		messageID: msg.Message.MessageID,
   305  		taskID:    data.TaskID,
   306  		userdata:  ud,
   307  	}, nil
   308  }
   309  
   310  // syncBuildWithTaskResult syncs the Build entity in the datastore with the result of its swarming task.
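        //
        // It also reports the BuildStarted/BuildCompleted metrics when the build
        // status changes as a result of the sync.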
   311  func syncBuildWithTaskResult(ctx context.Context, buildID int64, taskID string, swarm clients.SwarmingClient) error {
   312  	taskResult, err := swarm.GetTaskResult(ctx, taskID)
   313  	if err != nil {
   314  		logging.Errorf(ctx, "failed to fetch swarming task %s for build %d: %s", taskID, buildID, err)
   315  		if apiErr, ok := err.(*googleapi.Error); ok && apiErr.Code >= 400 && apiErr.Code < 500 {
   316  			return failBuild(ctx, buildID, fmt.Sprintf("invalid swarming task %s", taskID))
   317  		}
   318  		return transient.Tag.Apply(err)
   319  	}
   320  	if taskResult == nil {
   321  		return failBuild(ctx, buildID, fmt.Sprintf("Swarming task %s unexpectedly disappeared", taskID))
   322  	}
   323  
   324  	var statusChanged bool
   325  	bld := &model.Build{
   326  		ID: buildID,
   327  	}
   328  	err = datastore.RunInTransaction(ctx, func(ctx context.Context) (err error) {
   329  		infra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, bld)}
   330  		if err := datastore.Get(ctx, bld, infra); err != nil {
   331  			return transient.Tag.Apply(errors.Annotate(err, "failed to fetch build or buildInfra: %d", bld.ID).Err())
   332  		}
   333  
   334  		if protoutil.IsEnded(bld.Status) {
   335  			return nil
   336  		}
   337  		if bld.Status == pb.Status_STARTED && taskResult.State == apipb.TaskState_PENDING {
   338  			// Most probably, race between PubSub push handler and Cron job.
   339  			// With swarming, a build cannot go from STARTED back to PENDING,
   340  			// so ignore this.
   341  			return nil
   342  		}
   343  
   344  		botDimsChanged := updateBotDimensions(infra, taskResult)
   345  
   346  		bs, steps, err := updateBuildStatusFromTaskResult(ctx, bld, taskResult)
   347  		if err != nil {
   348  			return tq.Fatal.Apply(err)
   349  		}
   350  
   351  		shouldUpdate := false
   352  		if bs != nil {
   353  			shouldUpdate = true
   354  			statusChanged = true
   355  		}
   356  		if bs == nil && botDimsChanged && bld.Proto.Status == pb.Status_STARTED {
   357  			shouldUpdate = true
   358  		}
   359  		if !shouldUpdate {
   360  			return nil
   361  		}
   362  
   363  		toPut := []any{bld, infra}
   364  		if bs != nil {
   365  			toPut = append(toPut, bs)
   366  		}
   367  		if steps != nil {
   368  			toPut = append(toPut, steps)
   369  		}
   370  		return transient.Tag.Apply(datastore.Put(ctx, toPut...))
   371  	}, nil)
   372  
   373  	switch {
   374  	case err != nil:
   375  	case !statusChanged:
   376  	case bld.Status == pb.Status_STARTED:
   377  		metrics.BuildStarted(ctx, bld)
   378  	case protoutil.IsEnded(bld.Status):
   379  		metrics.BuildCompleted(ctx, bld)
   380  	}
   381  	return err
   382  }
   383  
   384  // updateBotDimensions mutates the infra entity to update the bot dimensions
   385  // according to the given task result.
   386  // Note, it will not write the entities into Datastore.
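        // It returns true if the bot dimensions actually changed.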
   387  func updateBotDimensions(infra *model.BuildInfra, taskResult *apipb.TaskResultResponse) bool {
   388  	sw := infra.Proto.Swarming
   389  	botDimsChanged := false
   390  
   391  	// Update BotDimensions
   392  	oldBotDimsMap := protoutil.StringPairMap(sw.BotDimensions)
   393  	newBotDims := []*pb.StringPair{}
   394  	for _, dim := range taskResult.BotDimensions {
   395  		for _, v := range dim.Value {
   396  			if !botDimsChanged && !oldBotDimsMap.Contains(dim.Key, v) {
   397  				botDimsChanged = true
   398  			}
   399  			newBotDims = append(newBotDims, &pb.StringPair{Key: dim.Key, Value: v})
   400  		}
   401  	}
   402  	if len(newBotDims) != len(sw.BotDimensions) {
   403  		botDimsChanged = true
   404  	}
   405  	sw.BotDimensions = newBotDims
   406  
   407  	sort.Slice(sw.BotDimensions, func(i, j int) bool {
   408  		if sw.BotDimensions[i].Key == sw.BotDimensions[j].Key {
   409  			return sw.BotDimensions[i].Value < sw.BotDimensions[j].Value
   410  		}
   411  		return sw.BotDimensions[i].Key < sw.BotDimensions[j].Key
   412  	})
   413  	return botDimsChanged
   414  }
   415  
   416  // updateBuildStatusFromTaskResult mutates the build entity to update the top
   417  // level status, and also the update time of the build.
   418  // Note, it will not write the entities into Datastore.
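        //
        // Summary of the task state -> build status mapping implemented below:
        //
        //	PENDING                          -> SCHEDULED
        //	RUNNING                          -> STARTED
        //	CANCELED, KILLED                 -> CANCELED
        //	NO_RESOURCE, EXPIRED, TIMED_OUT,
        //	BOT_DIED, CLIENT_ERROR           -> INFRA_FAILURE
        //	COMPLETED                        -> SUCCESS, FAILURE, CANCELED or
        //	                                    INFRA_FAILURE, based on the build's
        //	                                    output status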
   419  func updateBuildStatusFromTaskResult(ctx context.Context, bld *model.Build, taskResult *apipb.TaskResultResponse) (bs *model.BuildStatus, steps *model.BuildSteps, err error) {
   420  	now := clock.Now(ctx)
   421  	oldStatus := bld.Status
   422  	// A helper function to correctly set the Build's ended status from taskResult. It
   423  	// backfills the build start_time only if start_time is empty and taskResult
   424  	// has started_ts populated.
   425  	setEndStatus := func(st pb.Status, details *pb.StatusDetails) {
   426  		if !protoutil.IsEnded(st) {
   427  			return
   428  		}
   429  		if bld.Proto.StartTime == nil && taskResult.StartedTs.AsTime().Unix() != 0 {
   430  			startTime := taskResult.StartedTs.AsTime()
   431  			// Backfill build start time.
   432  			protoutil.SetStatus(startTime, bld.Proto, pb.Status_STARTED)
   433  		}
   434  
   435  		endTime := now
   436  		if t := taskResult.CompletedTs; t != nil {
   437  			endTime = t.AsTime()
   438  		} else if t := taskResult.AbandonedTs; t != nil {
   439  			endTime = t.AsTime()
   440  		}
   441  		// It is possible that the swarming task was marked as NO_RESOURCE the moment
   442  		// it was created. Swarming VM time is not synchronized with buildbucket VM
   443  		// time, so adjust end_time if needed.
   444  		if endTime.Before(bld.Proto.CreateTime.AsTime()) {
   445  			endTime = bld.Proto.CreateTime.AsTime()
   446  		}
   447  
   448  		stWithDetails := &buildstatus.StatusWithDetails{Status: st, Details: details}
   449  		bs, steps, err = updateBuildStatusOnTaskStatusChange(ctx, bld, stWithDetails, stWithDetails, endTime)
   450  	}
   451  
   452  	// Update build status
   453  	switch taskResult.State {
   454  	case apipb.TaskState_PENDING:
   455  		if bld.Status == pb.Status_STATUS_UNSPECIFIED {
   456  			// Scheduled Build should have SCHEDULED status already, so in theory this
   457  			// should not happen.
   458  			// Log it so we can confirm this.
   459  			logging.Debugf(ctx, "build %d has unspecified status, setting it to SCHEDULED", bld.ID)
   460  			protoutil.SetStatus(now, bld.Proto, pb.Status_SCHEDULED)
   461  		} else {
   462  			// Most probably, race between PubSub push handler and Cron job.
   463  			// With swarming, a build cannot go from STARTED/ended back to PENDING,
   464  			// so ignore this.
   465  			return
   466  		}
   467  	case apipb.TaskState_RUNNING:
   468  		updateTime := now
   469  		if t := taskResult.StartedTs; t != nil {
   470  			updateTime = t.AsTime()
   471  		}
   472  		stWithDetails := &buildstatus.StatusWithDetails{Status: pb.Status_STARTED}
   473  		bs, steps, err = updateBuildStatusOnTaskStatusChange(ctx, bld, stWithDetails, stWithDetails, updateTime)
   474  	case apipb.TaskState_CANCELED, apipb.TaskState_KILLED:
   475  		setEndStatus(pb.Status_CANCELED, nil)
   476  	case apipb.TaskState_NO_RESOURCE:
   477  		setEndStatus(pb.Status_INFRA_FAILURE, &pb.StatusDetails{
   478  			ResourceExhaustion: &pb.StatusDetails_ResourceExhaustion{},
   479  		})
   480  	case apipb.TaskState_EXPIRED:
   481  		setEndStatus(pb.Status_INFRA_FAILURE, &pb.StatusDetails{
   482  			ResourceExhaustion: &pb.StatusDetails_ResourceExhaustion{},
   483  			Timeout:            &pb.StatusDetails_Timeout{},
   484  		})
   485  	case apipb.TaskState_TIMED_OUT:
   486  		setEndStatus(pb.Status_INFRA_FAILURE, &pb.StatusDetails{
   487  			Timeout: &pb.StatusDetails_Timeout{},
   488  		})
   489  	case apipb.TaskState_BOT_DIED, apipb.TaskState_CLIENT_ERROR:
   490  		// BB only supplies bbagent CIPD packages in a task, no other user packages.
   491  		// So the CLIENT_ERROR task state should be treated as build INFRA_FAILURE.
   492  		setEndStatus(pb.Status_INFRA_FAILURE, nil)
   493  	case apipb.TaskState_COMPLETED:
   494  		if taskResult.Failure {
   495  			switch bld.Proto.Output.GetStatus() {
   496  			case pb.Status_FAILURE:
   497  				setEndStatus(pb.Status_FAILURE, nil)
   498  			case pb.Status_CANCELED:
   499  				setEndStatus(pb.Status_CANCELED, nil)
   500  			default:
   501  				// If this truly was a non-infra failure, bbagent would catch that and
   502  				// mark the build as FAILURE.
   503  				// That did not happen, so this is an infra failure.
   504  				setEndStatus(pb.Status_INFRA_FAILURE, nil)
   505  			}
   506  		} else {
   507  			finalStatus := pb.Status_SUCCESS
   508  			if protoutil.IsEnded(bld.Proto.Output.GetStatus()) {
   509  				// Swarming task ends with COMPLETED(SUCCESS), use the build status
   510  				// as final status.
   511  				finalStatus = bld.Proto.Output.GetStatus()
   512  			}
   513  			setEndStatus(finalStatus, nil)
   514  		}
   515  	default:
   516  		err = errors.Reason("Unexpected task state: %s", taskResult.State).Err()
   517  		return
   518  	}
   519  
   520  	if bld.Proto.Status != oldStatus {
   521  		logging.Infof(ctx, "Build %d status: %s -> %s", bld.ID, oldStatus, bld.Proto.Status)
   522  		return
   523  	}
   524  	return
   525  }
   526  
   527  // createSwarmingTask creates a swarming task for the build.
   528  // Requires build.proto.infra to be populated.
   529  // If the returned error is fatal and non-retryable, the tq.Fatal tag will be added.
   530  func createSwarmingTask(ctx context.Context, build *model.Build, swarm clients.SwarmingClient) error {
   531  	taskReq, err := computeSwarmingNewTaskReq(ctx, build)
   532  	if err != nil {
   533  		return tq.Fatal.Apply(err)
   534  	}
   535  
   536  	// Insert secret bytes.
   537  	token, err := buildtoken.GenerateToken(ctx, build.ID, pb.TokenBody_BUILD)
   538  	if err != nil {
   539  		return tq.Fatal.Apply(err)
   540  	}
   541  	secrets := &pb.BuildSecrets{
   542  		StartBuildToken:               token,
   543  		BuildToken:                    token,
   544  		ResultdbInvocationUpdateToken: build.ResultDBUpdateToken,
   545  	}
   546  	secretsBytes, err := proto.Marshal(secrets)
   547  	if err != nil {
   548  		return tq.Fatal.Apply(err)
   549  	}
   550  	for _, t := range taskReq.TaskSlices {
   551  		t.Properties.SecretBytes = secretsBytes
   552  	}
   553  
   554  	// Create a swarming task
   555  	res, err := swarm.CreateTask(ctx, taskReq)
   556  	if err != nil {
   557  		// On HTTP 5xx or network errors, return a transient error so Cloud Tasks
   558  		// retries, until the give-up timeout passes. Otherwise the error is fatal.
   559  		if apiErr, _ := err.(*googleapi.Error); apiErr == nil || apiErr.Code >= 500 {
   560  			if clock.Now(ctx).Sub(build.CreateTime) < swarmingCreateTaskGiveUpTimeout {
   561  				return transient.Tag.Apply(errors.Annotate(err, "failed to create a swarming task").Err())
   562  			}
   563  			logging.Errorf(ctx, "Give up Swarming task creation retry after %s", swarmingCreateTaskGiveUpTimeout.String())
   564  		}
   565  		// Strip out secret bytes and dump the task definition to the log.
   566  		for _, t := range taskReq.TaskSlices {
   567  			t.Properties.SecretBytes = nil
   568  		}
   569  		logging.Errorf(ctx, "Swarming task creation failure:%s. CreateTask request: %+v\nResponse: %+v", err, taskReq, res)
   570  		return tq.Fatal.Apply(errors.Annotate(err, "failed to create a swarming task").Err())
   571  	}
   572  
   573  	// Update the build with the build token and new task id.
   574  	err = datastore.RunInTransaction(ctx, func(ctx context.Context) error {
   575  		bld := &model.Build{
   576  			ID: build.ID,
   577  		}
   578  		infra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, bld)}
   579  		if err := datastore.Get(ctx, bld, infra); err != nil {
   580  			return errors.Annotate(err, "failed to fetch build or buildInfra: %d", bld.ID).Err()
   581  		}
   582  
   583  		if infra.Proto.Swarming.TaskId != "" {
   584  			return errors.Reason("build already has a task %s", infra.Proto.Swarming.TaskId).Err()
   585  		}
   586  		infra.Proto.Swarming.TaskId = res.TaskId
   587  		bld.UpdateToken = token
   588  
   589  		return datastore.Put(ctx, bld, infra)
   590  	}, nil)
   591  	if err != nil {
   592  		// now that swarm.CreateTask is idempotent, we should reuse the task,
   593  		// instead of cancelling it.
   594  		logging.Errorf(ctx, "created a task %s, but failed to update datastore with the error:%s", res.TaskId, err)
   595  		return transient.Tag.Apply(errors.Annotate(err, "failed to update build %d", build.ID).Err())
   596  	}
   597  	return nil
   598  }
   599  
   600  func computeSwarmingNewTaskReq(ctx context.Context, build *model.Build) (*apipb.NewTaskRequest, error) {
   601  	sw := build.Proto.GetInfra().GetSwarming()
   602  	if sw == nil {
   603  		return nil, errors.New("build.Proto.Infra.Swarming isn't set")
   604  	}
   605  	taskReq := &apipb.NewTaskRequest{
   606  		// to prevent accidental multiple task creation
   607  		RequestUuid:      uuid.NewSHA1(uuid.Nil, []byte(strconv.FormatInt(build.ID, 10))).String(),
   608  		Name:             fmt.Sprintf("bb-%d-%s", build.ID, build.BuilderID),
   609  		Realm:            build.Realm(),
   610  		Tags:             computeTags(ctx, build),
   611  		Priority:         int32(sw.Priority),
   612  		PoolTaskTemplate: apipb.NewTaskRequest_SKIP,
   613  	}
   614  
   615  	if build.Proto.Number > 0 {
   616  		taskReq.Name = fmt.Sprintf("%s-%d", taskReq.Name, build.Proto.Number)
   617  	}
   618  
   619  	taskSlices, err := computeTaskSlice(build)
   620  	if err != nil {
   621  		return nil, errors.Annotate(err, "failed to compute task slices").Err()
   622  	}
   623  	taskReq.TaskSlices = taskSlices
   624  
   625  	// Only make swarming track the build's parent if Buildbucket doesn't
   626  	// track it.
   627  	// Buildbucket should track the parent/child build relationships for all
   628  	// Buildbucket builds,
   629  	// except for children of led builds, whose parents are still tracked by
   630  	// swarming using sw.parent_run_id.
   631  	// TODO(crbug.com/1031205): remove the check on
   632  	// luci.buildbucket.parent_tracking after this experiment is on globally and
   633  	// we're ready to remove it.
   634  	if sw.ParentRunId != "" && (len(build.Proto.AncestorIds) == 0 ||
   635  		!strings.Contains(build.ExperimentsString(), buildbucket.ExperimentParentTracking)) {
   636  		taskReq.ParentTaskId = sw.ParentRunId
   637  	}
   638  
   639  	if sw.TaskServiceAccount != "" {
   640  		taskReq.ServiceAccount = sw.TaskServiceAccount
   641  	}
   642  
   643  	taskReq.PubsubTopic = fmt.Sprintf(pubsubTopicTemplate, info.AppID(ctx))
   644  	ud := &userdata{
   645  		BuildID:          build.ID,
   646  		CreatedTS:        clock.Now(ctx).UnixNano() / int64(time.Microsecond),
   647  		SwarmingHostname: sw.Hostname,
   648  	}
   649  	udBytes, err := json.Marshal(ud)
   650  	if err != nil {
   651  		return nil, errors.Annotate(err, "failed to marshal pubsub userdata").Err()
   652  	}
   653  	taskReq.PubsubUserdata = string(udBytes)
   654  	return taskReq, nil
   655  }
   656  
   657  // computeTags computes the Swarming task request tags to use.
   658  // Note it doesn't compute kitchen related tags.
   659  func computeTags(ctx context.Context, build *model.Build) []string {
   660  	tags := []string{
   661  		"buildbucket_bucket:" + build.BucketID,
   662  		fmt.Sprintf("buildbucket_build_id:%d", build.ID),
   663  		fmt.Sprintf("buildbucket_hostname:%s", build.Proto.GetInfra().GetBuildbucket().GetHostname()),
   664  		"luci_project:" + build.Project,
   665  	}
   666  	if build.Canary {
   667  		tags = append(tags, "buildbucket_template_canary:1")
   668  	} else {
   669  		tags = append(tags, "buildbucket_template_canary:0")
   670  	}
   671  
   672  	tags = append(tags, build.Tags...)
   673  	sort.Strings(tags)
   674  	return tags
   675  }
   676  
   677  // computeTaskSlice computes swarming task slices.
   678  // build.Proto.Infra must be set.
   679  func computeTaskSlice(build *model.Build) ([]*apipb.TaskSlice, error) {
   680  	// expiration_secs -> []*StringPair
   681  	dims := map[int64][]*apipb.StringPair{}
   682  	for _, cache := range build.Proto.GetInfra().GetSwarming().GetCaches() {
   683  		expSecs := cache.WaitForWarmCache.GetSeconds()
   684  		if expSecs <= 0 {
   685  			continue
   686  		}
   687  		if _, ok := dims[expSecs]; !ok {
   688  			dims[expSecs] = []*apipb.StringPair{}
   689  		}
   690  		dims[expSecs] = append(dims[expSecs], &apipb.StringPair{
   691  			Key:   "caches",
   692  			Value: cache.Name,
   693  		})
   694  	}
   695  	for _, dim := range build.Proto.GetInfra().GetSwarming().GetTaskDimensions() {
   696  		expSecs := dim.Expiration.GetSeconds()
   697  		if _, ok := dims[expSecs]; !ok {
   698  			dims[expSecs] = []*apipb.StringPair{}
   699  		}
   700  		dims[expSecs] = append(dims[expSecs], &apipb.StringPair{
   701  			Key:   dim.Key,
   702  			Value: dim.Value,
   703  		})
   704  	}
   705  
   706  	// extract base dim and delete it from the map.
   707  	baseDim, ok := dims[0]
   708  	if !ok {
   709  		baseDim = []*apipb.StringPair{}
   710  	}
   711  	delete(dims, 0)
   712  	if len(dims) > 6 {
   713  		return nil, errors.New("at most 6 different expiration_secs are allowed in swarming")
   714  	}
   715  
   716  	baseSlice := &apipb.TaskSlice{
   717  		ExpirationSecs:  int32(build.Proto.GetSchedulingTimeout().GetSeconds()),
   718  		WaitForCapacity: build.Proto.GetWaitForCapacity(),
   719  		Properties: &apipb.TaskProperties{
   720  			CipdInput:            computeCipdInput(build),
   721  			ExecutionTimeoutSecs: int32(build.Proto.GetExecutionTimeout().GetSeconds()),
   722  			GracePeriodSecs:      int32(build.Proto.GetGracePeriod().GetSeconds() + bbagentReservedGracePeriod),
   723  			Caches:               computeTaskSliceCaches(build),
   724  			Dimensions:           baseDim,
   725  			EnvPrefixes:          computeEnvPrefixes(build),
   726  			Env: []*apipb.StringPair{
   727  				{Key: "BUILDBUCKET_EXPERIMENTAL", Value: strings.ToUpper(strconv.FormatBool(build.Experimental))},
   728  			},
   729  			Command: computeCommand(build),
   730  		},
   731  	}
   732  
   733  	// sort dims map by expiration_sec.
   734  	var expSecs []int
   735  	for expSec := range dims {
   736  		expSecs = append(expSecs, int(expSec))
   737  	}
   738  	sort.Ints(expSecs)
   739  
   740  	// TODO(vadimsh): Remove this when no longer needed, ETA Oct 2022. This is
   741  	// used to load test Swarming's slice expiration mechanism.
   742  	sliceWaitForCapacity := build.Proto.GetWaitForCapacity() &&
   743  		strings.Contains(build.ExperimentsString(), buildbucket.ExperimentWaitForCapacity)
   744  
   745  	// Create extra task slices by copying the base task slice and adding the
   746  	// corresponding expiration and desired dimensions.
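        	//
        	// Illustrative example (hypothetical values): with base dimensions D0,
        	// dims = {60: {d60}, 240: {d240}} and a 600s scheduling timeout, this
        	// produces three slices:
        	//
        	//	slice 0: expires after  60s, dimensions D0 + d60 + d240
        	//	slice 1: expires after 180s, dimensions D0 + d240
        	//	slice 2: expires after 360s, dimensions D0 (the base slice)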
   747  	lastExp := 0
   748  	taskSlices := make([]*apipb.TaskSlice, len(expSecs)+1)
   749  	for i, sec := range expSecs {
   750  		prop := &apipb.TaskProperties{}
   751  		if err := deepCopy(baseSlice.Properties, prop); err != nil {
   752  			return nil, err
   753  		}
   754  		taskSlices[i] = &apipb.TaskSlice{
   755  			WaitForCapacity: sliceWaitForCapacity,
   756  			ExpirationSecs:  int32(sec - lastExp),
   757  			Properties:      prop,
   758  		}
   759  		// dims[sec] should be added into all previous non-expired task slices.
   760  		for j := 0; j <= i; j++ {
   761  			taskSlices[j].Properties.Dimensions = append(taskSlices[j].Properties.Dimensions, dims[int64(sec)]...)
   762  		}
   763  		lastExp = sec
   764  	}
   765  
   766  	// Tweak expiration on the baseSlice, which is the last slice.
   767  	exp := max(int(baseSlice.ExpirationSecs)-lastExp, 60)
   768  	baseSlice.ExpirationSecs = int32(exp)
   769  	taskSlices[len(taskSlices)-1] = baseSlice
   770  
   771  	sortDim := func(strPairs []*apipb.StringPair) {
   772  		sort.Slice(strPairs, func(i, j int) bool {
   773  			if strPairs[i].Key == strPairs[j].Key {
   774  				return strPairs[i].Value < strPairs[j].Value
   775  			}
   776  			return strPairs[i].Key < strPairs[j].Key
   777  		})
   778  	}
   779  	// sort dimensions in each task slice.
   780  	for _, t := range taskSlices {
   781  		sortDim(t.Properties.Dimensions)
   782  	}
   783  	return taskSlices, nil
   784  }
   785  
   786  // computeTaskSliceCaches computes the task slice caches.
   787  func computeTaskSliceCaches(build *model.Build) []*apipb.CacheEntry {
   788  	infra := build.Proto.Infra
   789  	caches := make([]*apipb.CacheEntry, 0, len(infra.Swarming.GetCaches())+2)
   790  	for _, c := range build.Proto.Infra.Swarming.GetCaches() {
   791  		caches = append(caches, &apipb.CacheEntry{
   792  			Name: c.Name,
   793  			Path: filepath.Join(cacheDir, c.Path),
   794  		})
   795  	}
   796  	if cipdClientCache := infra.Buildbucket.GetAgent().GetCipdClientCache(); cipdClientCache != nil {
   797  		caches = append(caches, &apipb.CacheEntry{
   798  			Name: cipdClientCache.Name,
   799  			Path: filepath.Join(cacheDir, cipdClientCache.Path),
   800  		})
   801  	}
   802  	if cipdPackagesCache := infra.Buildbucket.GetAgent().GetCipdPackagesCache(); cipdPackagesCache != nil {
   803  		caches = append(caches, &apipb.CacheEntry{
   804  			Name: cipdPackagesCache.Name,
   805  			Path: filepath.Join(cacheDir, cipdPackagesCache.Path),
   806  		})
   807  	}
   808  
   809  	return caches
   810  }
   811  
   812  // computeCipdInput returns swarming task CIPD input.
   813  // Note: this function only considers v2 bbagent builds.
   814  // The build.Proto.Infra.Buildbucket.Agent.Source must be set.
   815  func computeCipdInput(build *model.Build) *apipb.CipdInput {
   816  	return &apipb.CipdInput{
   817  		Packages: []*apipb.CipdPackage{{
   818  			PackageName: build.Proto.GetInfra().GetBuildbucket().GetAgent().GetSource().GetCipd().GetPackage(),
   819  			Version:     build.Proto.GetInfra().GetBuildbucket().GetAgent().GetSource().GetCipd().GetVersion(),
   820  			Path:        ".",
   821  		}},
   822  	}
   823  }
   824  
   825  // computeEnvPrefixes returns the env_prefixes for the swarming task properties.
   826  // Note: this function only considers v2 bbagent builds.
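        //
        // For example (hypothetical cache config): a named cache with env_var
        // "FOO_CACHE" and path "foo" contributes {Key: "FOO_CACHE",
        // Value: ["cache/foo"]} to the result.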
   827  func computeEnvPrefixes(build *model.Build) []*apipb.StringListPair {
   828  	prefixesMap := map[string][]string{}
   829  	for _, c := range build.Proto.GetInfra().GetSwarming().GetCaches() {
   830  		if c.EnvVar != "" {
   831  			if _, ok := prefixesMap[c.EnvVar]; !ok {
   832  				prefixesMap[c.EnvVar] = []string{}
   833  			}
   834  			prefixesMap[c.EnvVar] = append(prefixesMap[c.EnvVar], filepath.Join(cacheDir, c.Path))
   835  		}
   836  	}
   837  	var keys []string
   838  	for key := range prefixesMap {
   839  		keys = append(keys, key)
   840  	}
   841  	sort.Strings(keys)
   842  	prefixes := make([]*apipb.StringListPair, len(keys))
   843  	for i, key := range keys {
   844  		prefixes[i] = &apipb.StringListPair{
   845  			Key:   key,
   846  			Value: prefixesMap[key],
   847  		}
   848  	}
   849  	return prefixes
   850  }
   851  
   852  // computeCommand computes the command for bbagent.
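        //
        // If the ExperimentBBAgentGetBuild experiment is enabled, bbagent is pointed
        // at the Buildbucket host and build ID; otherwise the full BBAgentArgs proto
        // is encoded on the command line via bbinput.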
   853  func computeCommand(build *model.Build) []string {
   854  	if strings.Contains(build.ExperimentsString(), buildbucket.ExperimentBBAgentGetBuild) {
   855  		return []string{
   856  			"bbagent${EXECUTABLE_SUFFIX}",
   857  			"-host",
   858  			build.Proto.GetInfra().GetBuildbucket().GetHostname(),
   859  			"-build-id",
   860  			strconv.FormatInt(build.ID, 10),
   861  		}
   862  	}
   863  
   864  	return []string{
   865  		"bbagent${EXECUTABLE_SUFFIX}",
   866  		bbinput.Encode(&pb.BBAgentArgs{
   867  			Build:                  build.Proto,
   868  			CacheDir:               build.Proto.GetInfra().GetBbagent().GetCacheDir(),
   869  			KnownPublicGerritHosts: build.Proto.GetInfra().GetBuildbucket().GetKnownPublicGerritHosts(),
   870  			PayloadPath:            build.Proto.GetInfra().GetBbagent().GetPayloadPath(),
   871  		}),
   872  	}
   873  }
   874  
   875  func max(a, b int) int {
   876  	if a > b {
   877  		return a
   878  	}
   879  	return b
   880  }
   881  
   882  // deepCopy deep copies src to dst using json marshaling for non-proto messages.
   883  func deepCopy(src, dst any) error {
   884  	srcBytes, err := json.Marshal(src)
   885  	if err != nil {
   886  		return err
   887  	}
   888  	return json.Unmarshal(srcBytes, dst)
   889  }