go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/tasks/backend.go (about)

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tasks
    16  
    17  import (
    18  	"context"
    19  	"encoding/base64"
    20  	"encoding/json"
    21  	"fmt"
    22  	"strconv"
    23  	"strings"
    24  	"time"
    25  
    26  	"google.golang.org/api/googleapi"
    27  	codepb "google.golang.org/genproto/googleapis/rpc/code"
    28  	"google.golang.org/protobuf/proto"
    29  	"google.golang.org/protobuf/types/known/durationpb"
    30  	"google.golang.org/protobuf/types/known/structpb"
    31  	"google.golang.org/protobuf/types/known/timestamppb"
    32  
    33  	cipdpb "go.chromium.org/luci/cipd/api/cipd/v1"
    34  	"go.chromium.org/luci/common/clock"
    35  	"go.chromium.org/luci/common/errors"
    36  	"go.chromium.org/luci/common/logging"
    37  	"go.chromium.org/luci/common/retry/transient"
    38  	"go.chromium.org/luci/gae/service/datastore"
    39  	"go.chromium.org/luci/gae/service/info"
    40  	"go.chromium.org/luci/grpc/prpc"
    41  	"go.chromium.org/luci/server/caching/layered"
    42  	"go.chromium.org/luci/server/tq"
    43  
    44  	"go.chromium.org/luci/buildbucket/appengine/common"
    45  	"go.chromium.org/luci/buildbucket/appengine/internal/buildtoken"
    46  	"go.chromium.org/luci/buildbucket/appengine/internal/clients"
    47  	"go.chromium.org/luci/buildbucket/appengine/internal/config"
    48  	"go.chromium.org/luci/buildbucket/appengine/model"
    49  	pb "go.chromium.org/luci/buildbucket/proto"
    50  )
    51  
    52  const (
    53  	// bbagentReservedGracePeriod is the time reserved by bbagent in order to have
    54  	// time to have a couple retry rounds for UpdateBuild RPCs
    55  	// TODO(crbug.com/1328646): may need to adjust the grace_period based on
    56  	// UpdateBuild's new performance in Buildbucket Go.
    57  	bbagentReservedGracePeriod = 180
    58  
    59  	// runTaskGiveUpTimeoutDefault is the default value for how long to retry
    60  	// the CreateBackendTask before giving up with INFRA_FAILURE.
    61  	runTaskGiveUpTimeoutDefault = 10 * 60 * time.Second
    62  
    63  	cipdCacheTTL = 10 * time.Minute
    64  )
    65  
    66  type cipdPackageDetails struct {
    67  	Size int64  `json:"size,omitempty"`
    68  	Hash string `json:"hash,omitempty"`
    69  }
    70  
    71  type cipdPackageDetailsMap map[string]*cipdPackageDetails
    72  
    73  var cipdDescribeBootstrapBundleCache = layered.RegisterCache(layered.Parameters[cipdPackageDetailsMap]{
    74  	ProcessCacheCapacity: 1000,
    75  	GlobalNamespace:      "cipd-describeBootstrapBundle-v1",
    76  	Marshal: func(item cipdPackageDetailsMap) ([]byte, error) {
    77  		return json.Marshal(item)
    78  	},
    79  	Unmarshal: func(blob []byte) (cipdPackageDetailsMap, error) {
    80  		res := cipdPackageDetailsMap{}
    81  		err := json.Unmarshal(blob, &res)
    82  		return res, err
    83  	},
    84  })
    85  
// MockCipdClientKey is the context key under which a *prpc.Client can be
// stored (e.g. by tests); NewCipdClient returns that client instead of
// creating a real one.
type MockCipdClientKey struct{}
    87  
    88  func NewCipdClient(ctx context.Context, host string, project string) (client *prpc.Client, err error) {
    89  	if mockClient, ok := ctx.Value(MockCipdClientKey{}).(*prpc.Client); ok {
    90  		return mockClient, nil
    91  	}
    92  	client, err = clients.CreateRawPrpcClient(ctx, host, project)
    93  	return
    94  }
    95  
    96  // computeTaskCaches computes the task caches.
    97  func computeTaskCaches(infra *model.BuildInfra) []*pb.CacheEntry {
    98  	caches := make([]*pb.CacheEntry, 0, len(infra.Proto.Backend.GetCaches())+2)
    99  	if len(infra.Proto.Backend.GetCaches()) > 0 {
   100  		caches = append(caches, infra.Proto.Backend.Caches...)
   101  	}
   102  	if infra.Proto.Buildbucket.GetAgent().GetCipdClientCache() != nil {
   103  		caches = append(caches, infra.Proto.Buildbucket.Agent.CipdClientCache)
   104  	}
   105  	if infra.Proto.Buildbucket.GetAgent().GetCipdPackagesCache() != nil {
   106  		caches = append(caches, infra.Proto.Buildbucket.Agent.CipdPackagesCache)
   107  	}
   108  	return caches
   109  }
   110  
   111  func computeAgentArgs(build *pb.Build, infra *pb.BuildInfra) (args []string) {
   112  	args = []string{}
   113  	// build-id arg
   114  	args = append(args, "-build-id")
   115  	args = append(args, strconv.FormatInt(build.GetId(), 10))
   116  	// host arg
   117  	args = append(args, "-host")
   118  	args = append(args, infra.Buildbucket.GetHostname())
   119  	// cache-base arg
   120  	args = append(args, "-cache-base")
   121  	args = append(args, infra.Bbagent.GetCacheDir())
   122  
   123  	// context-file arg
   124  	args = append(args, "-context-file")
   125  	args = append(args, "${BUILDBUCKET_AGENT_CONTEXT_FILE}")
   126  	return
   127  }
   128  
   129  // computeBackendPubsubTopic computes the pubsub topic that should be included
   130  // in RunTaskRequest. Return an empty string if the backend is in lite mode.
   131  func computeBackendPubsubTopic(ctx context.Context, target string, globalCfg *pb.SettingsCfg) (string, error) {
   132  	if globalCfg == nil {
   133  		return "", errors.Reason("error fetching service config").Err()
   134  	}
   135  	for _, backend := range globalCfg.Backends {
   136  		if backend.Target == target {
   137  			switch backend.Mode.(type) {
   138  			case *pb.BackendSetting_LiteMode_:
   139  				return "", nil
   140  			case *pb.BackendSetting_FullMode_:
   141  				return fmt.Sprintf("projects/%s/topics/%s", info.AppID(ctx), backend.GetFullMode().GetPubsubId()), nil
   142  			default:
   143  				return "", errors.Reason("getting pubsub_id from backend %s is not supported", target).Err()
   144  			}
   145  		}
   146  	}
   147  	return "", errors.Reason("backend %s not found in global settings", target).Err()
   148  }
   149  
   150  func computeBackendNewTaskReq(ctx context.Context, build *model.Build, infra *model.BuildInfra, requestID string, globalCfg *pb.SettingsCfg) (*pb.RunTaskRequest, error) {
   151  	// Create StartBuildToken and secrets.
   152  	startBuildToken, err := buildtoken.GenerateToken(ctx, build.ID, pb.TokenBody_START_BUILD)
   153  	if err != nil {
   154  		return nil, err
   155  	}
   156  	secrets := &pb.BuildSecrets{
   157  		StartBuildToken:               startBuildToken,
   158  		ResultdbInvocationUpdateToken: build.ResultDBUpdateToken,
   159  	}
   160  	backend := infra.Proto.GetBackend()
   161  	if backend == nil {
   162  		return nil, errors.New("infra.Proto.Backend isn't set")
   163  	}
   164  	caches := computeTaskCaches(infra)
   165  	gracePeriod := &durationpb.Duration{
   166  		Seconds: build.Proto.GetGracePeriod().GetSeconds() + bbagentReservedGracePeriod,
   167  	}
   168  
   169  	startDeadline := &timestamppb.Timestamp{
   170  		Seconds: build.Proto.GetCreateTime().GetSeconds() + build.Proto.GetSchedulingTimeout().GetSeconds(),
   171  	}
   172  
   173  	pubsubTopic, err := computeBackendPubsubTopic(ctx, backend.Task.Id.Target, globalCfg)
   174  	if err != nil {
   175  		return nil, err
   176  	}
   177  
   178  	// Add task name into backend config.
   179  	taskName := fmt.Sprintf("bb-%d-%s", build.ID, build.BuilderID)
   180  	if build.Proto.Number > 0 {
   181  		taskName = fmt.Sprintf("%s-%d", taskName, build.Proto.Number)
   182  	}
   183  	backend.Config.Fields["task_name"] = structpb.NewStringValue(taskName)
   184  
   185  	taskReq := &pb.RunTaskRequest{
   186  		BuildbucketHost:  infra.Proto.Buildbucket.Hostname,
   187  		Secrets:          secrets,
   188  		Target:           backend.Task.Id.Target,
   189  		RequestId:        requestID,
   190  		BuildId:          strconv.FormatInt(build.Proto.Id, 10),
   191  		Realm:            build.Realm(),
   192  		BackendConfig:    backend.Config,
   193  		ExecutionTimeout: build.Proto.GetExecutionTimeout(),
   194  		GracePeriod:      gracePeriod,
   195  		Caches:           caches,
   196  		AgentArgs:        computeAgentArgs(build.Proto, infra.Proto),
   197  		Dimensions:       infra.Proto.Backend.GetTaskDimensions(),
   198  		StartDeadline:    startDeadline,
   199  		Experiments:      build.Proto.Input.GetExperiments(),
   200  		PubsubTopic:      pubsubTopic,
   201  	}
   202  
   203  	project := build.Proto.Builder.Project
   204  	taskReq.Agent = &pb.RunTaskRequest_AgentExecutable{}
   205  	taskReq.Agent.Source, err = extractCipdDetails(ctx, project, infra.Proto)
   206  	if err != nil {
   207  		return nil, err
   208  	}
   209  
   210  	build.Proto.Infra = infra.Proto
   211  	tags := computeTags(ctx, build)
   212  	tagsAny := make([]any, len(tags))
   213  	for i, t := range tags {
   214  		tagsAny[i] = t
   215  	}
   216  	tagsList, err := structpb.NewList(tagsAny)
   217  	if err != nil {
   218  		return nil, err
   219  	}
   220  	if taskReq.BackendConfig == nil {
   221  		taskReq.BackendConfig = &structpb.Struct{}
   222  	}
   223  	taskReq.BackendConfig.Fields["tags"] = structpb.NewListValue(tagsList)
   224  	return taskReq, nil
   225  }
   226  
   227  func createCipdDescribeBootstrapBundleRequest(infra *pb.BuildInfra) *cipdpb.DescribeBootstrapBundleRequest {
   228  	prefix := infra.Buildbucket.Agent.Source.GetCipd().GetPackage()
   229  	prefix = strings.TrimSuffix(prefix, "/${platform}")
   230  	return &cipdpb.DescribeBootstrapBundleRequest{
   231  		Prefix:  prefix,
   232  		Version: infra.Buildbucket.Agent.Source.GetCipd().GetVersion(),
   233  	}
   234  }
   235  
   236  func computeCipdURL(source *pb.BuildInfra_Buildbucket_Agent_Source, pkg string, details *cipdPackageDetails) (url string) {
   237  	server := source.GetCipd().GetServer()
   238  	version := source.GetCipd().GetVersion()
   239  	return server + "/bootstrap/" + pkg + "/+/" + version
   240  }
   241  
   242  // extractCipdDetails returns a map that maps package (Prefix + variant for each variant)
   243  // to a cipdPackageDetails object, which is just the hash and size.
   244  //
   245  // A Cipd client is created and calls DescribeBootstrapBundle to retrieve the data.
   246  func extractCipdDetails(ctx context.Context, project string, infra *pb.BuildInfra) (details map[string]*pb.RunTaskRequest_AgentExecutable_AgentSource, err error) {
   247  	cipdServer := infra.Buildbucket.Agent.Source.GetCipd().GetServer()
   248  	cipdClient, err := NewCipdClient(ctx, cipdServer, project)
   249  	if err != nil {
   250  		return nil, err
   251  	}
   252  	req := createCipdDescribeBootstrapBundleRequest(infra)
   253  	bytes, err := proto.Marshal(req)
   254  	if err != nil {
   255  		return nil, err
   256  	}
   257  	cachePrefix := base64.StdEncoding.EncodeToString(bytes)
   258  	cipdDetails, err := cipdDescribeBootstrapBundleCache.GetOrCreate(ctx, cachePrefix, func() (cipdPackageDetailsMap, time.Duration, error) {
   259  		out := &cipdpb.DescribeBootstrapBundleResponse{}
   260  		err := cipdClient.Call(ctx, "cipd.Repository", "DescribeBootstrapBundle", req, out)
   261  		if err != nil {
   262  			return nil, 0, err
   263  		}
   264  		resp := make(cipdPackageDetailsMap, len(out.Files))
   265  		hasErrFile := false
   266  		for _, file := range out.Files {
   267  			if s := file.Status; s != nil && s.Code != int32(codepb.Code_OK) {
   268  				hasErrFile = true
   269  				logging.Warningf(ctx, "cannot resolve the package %q: error code - %d, message - %s", file.Package, s.Code, s.Message)
   270  				continue
   271  			}
   272  			resp[file.Package] = &cipdPackageDetails{
   273  				Hash: file.Instance.HexDigest,
   274  				Size: file.Size,
   275  			}
   276  		}
   277  
   278  		ttl := cipdCacheTTL
   279  		if hasErrFile {
   280  			// Sometimes, the cipd package may not exist on one of platforms or its
   281  			// tag hasn't been populated yet, etc. Choose a shorter cache time in
   282  			// these situations.
   283  			ttl = 1 * time.Minute
   284  		}
   285  		return resp, ttl, nil
   286  	})
   287  	if err != nil {
   288  		return nil, errors.Annotate(err, "cache error for cipd request").Err()
   289  	}
   290  	details = map[string]*pb.RunTaskRequest_AgentExecutable_AgentSource{}
   291  	for k, v := range cipdDetails {
   292  		val := &pb.RunTaskRequest_AgentExecutable_AgentSource{
   293  			Sha256:    v.Hash,
   294  			SizeBytes: v.Size,
   295  			Url:       computeCipdURL(infra.Buildbucket.Agent.Source, k, v),
   296  		}
   297  		details[k] = val
   298  	}
   299  	return
   300  }
   301  
   302  // CreateBackendTask creates a backend task for the build.
   303  func CreateBackendTask(ctx context.Context, buildID int64, requestID string) error {
   304  	entities, err := common.GetBuildEntities(ctx, buildID, model.BuildKind, model.BuildInfraKind)
   305  	if err != nil {
   306  		return errors.Annotate(err, "failed to get build %d", buildID).Err()
   307  	}
   308  	bld := entities[0].(*model.Build)
   309  	infra := entities[1].(*model.BuildInfra)
   310  
   311  	if infra.Proto.GetBackend().GetTask().GetId().GetId() != "" {
   312  		// This task is likely a retry.
   313  		// It could happen if the previous RunTask attempt(s) failed, but a backend
   314  		// task was actually created and associated with the build in the backup
   315  		// flow.
   316  		// Bail out.
   317  		logging.Infof(ctx, "build %d has associated with task %q", buildID, infra.Proto.Backend.Task.Id)
   318  		return nil
   319  	}
   320  
   321  	globalCfg, err := config.GetSettingsCfg(ctx)
   322  	if err != nil {
   323  		return errors.Annotate(err, "could not get global settings config").Err()
   324  	}
   325  
   326  	var backendCfg *pb.BackendSetting
   327  	for _, backend := range globalCfg.GetBackends() {
   328  		if backend.Target == infra.Proto.Backend.Task.Id.Target {
   329  			backendCfg = backend
   330  		}
   331  	}
   332  	if backendCfg == nil {
   333  		return tq.Fatal.Apply(errors.Reason("failed to get backend config from global settings").Err())
   334  	}
   335  
   336  	var runTaskGiveUpTimeout time.Duration
   337  	if backendCfg.TaskCreatingTimeout.GetSeconds() == 0 {
   338  		runTaskGiveUpTimeout = runTaskGiveUpTimeoutDefault
   339  	} else {
   340  		runTaskGiveUpTimeout = backendCfg.TaskCreatingTimeout.AsDuration()
   341  	}
   342  
   343  	// If task creation has already expired, fail the build immediately.
   344  	if clock.Now(ctx).Sub(bld.CreateTime) >= runTaskGiveUpTimeout {
   345  		dsPutErr := failBuild(ctx, buildID, "Backend task creation failure.")
   346  		if dsPutErr != nil {
   347  			return dsPutErr
   348  		}
   349  		return tq.Fatal.Apply(errors.Reason("creating backend task for build %d with requestID %s has expired after %s", buildID, requestID, runTaskGiveUpTimeout.String()).Err())
   350  	}
   351  
   352  	// Initialize a TaskCreator for creating the backend task.
   353  	_, isLite := backendCfg.Mode.(*pb.BackendSetting_LiteMode_)
   354  	backend, err := clients.NewTaskCreator(ctx, bld.Proto.Builder.Project, infra.Proto.Backend.Task.Id.Target, globalCfg, isLite)
   355  	if err != nil {
   356  		return tq.Fatal.Apply(errors.Annotate(err, "failed to connect to backend service").Err())
   357  	}
   358  
   359  	taskReq, err := computeBackendNewTaskReq(ctx, bld, infra, requestID, globalCfg)
   360  	if err != nil {
   361  		return tq.Fatal.Apply(err)
   362  	}
   363  
   364  	// Create a backend task via RunTask
   365  	taskResp, err := backend.RunTask(ctx, taskReq)
   366  
   367  	// TODO(b/288158829): remove it once the root cause for the Skia failure is found.
   368  	if bld.Proto.Builder.Project == "skia" {
   369  		logging.Debugf(ctx, "RunTaskResponse from skia: %v", taskResp)
   370  	}
   371  
   372  	now := clock.Now(ctx)
   373  	if err != nil {
   374  		// Give up if HTTP 500s are happening continuously. Otherwise re-throw the
   375  		// error so Cloud Tasks retries the task.
   376  		if apiErr, _ := err.(*googleapi.Error); apiErr == nil || apiErr.Code >= 500 {
   377  			if now.Sub(bld.CreateTime) < runTaskGiveUpTimeout {
   378  				return transient.Tag.Apply(errors.Annotate(err, "failed to create a backend task").Err())
   379  			}
   380  			logging.Errorf(ctx, "Give up backend task creation retry after %s", runTaskGiveUpTimeout.String())
   381  		}
   382  		logging.Errorf(ctx, "Backend task creation failure:%s. RunTask request: %+v", err, taskReq)
   383  		dsPutErr := failBuild(ctx, bld.ID, "Backend task creation failure.")
   384  		if dsPutErr != nil {
   385  			return dsPutErr
   386  		}
   387  		return tq.Fatal.Apply(errors.Annotate(err, "failed to create a backend task").Err())
   388  	}
   389  	if taskResp.Task.GetUpdateId() == 0 {
   390  		return tq.Fatal.Apply(errors.Reason("task returned with an updateID of 0").Err())
   391  	}
   392  
   393  	checkLiveness, heartbeatTimeout, err := shouldCheckLiveness(ctx, bld, backendCfg)
   394  	if err != nil {
   395  		return transient.Tag.Apply(err)
   396  	}
   397  
   398  	txErr := datastore.RunInTransaction(ctx, func(ctx context.Context) error {
   399  		entities, err := common.GetBuildEntities(ctx, buildID, model.BuildKind, model.BuildInfraKind)
   400  		if err != nil {
   401  			return errors.Annotate(err, "failed to get build %d", buildID).Err()
   402  		}
   403  		bld = entities[0].(*model.Build)
   404  		infra = entities[1].(*model.BuildInfra)
   405  
   406  		infra.Proto.Backend.Task = taskResp.Task
   407  
   408  		// Update Build entity.
   409  		bld.Proto.UpdateTime = timestamppb.New(now)
   410  		target := taskResp.Task.Id.Target
   411  		for _, backendSetting := range globalCfg.Backends {
   412  			if backendSetting.Target == target {
   413  				if backendSetting.GetFullMode().GetBuildSyncSetting() != nil {
   414  					bld.BackendTarget = target
   415  					interval := backendSetting.GetFullMode().GetBuildSyncSetting().GetSyncIntervalSeconds()
   416  					if interval > 0 {
   417  						bld.BackendSyncInterval = time.Duration(interval) * time.Second
   418  					}
   419  					bld.GenerateNextBackendSyncTime(ctx, backendSetting.GetFullMode().GetBuildSyncSetting().GetShards())
   420  				}
   421  				break
   422  			}
   423  		}
   424  
   425  		if checkLiveness {
   426  			// SchedulingTimeout is always set in schedule_build flow.
   427  			delay := bld.Proto.SchedulingTimeout.Seconds
   428  			if heartbeatTimeout != 0 && int64(heartbeatTimeout) < delay {
   429  				// Better to choose a shorter delay as a first CheckBuildLiveness task.
   430  				delay = int64(heartbeatTimeout)
   431  			}
   432  			if err = CheckBuildLiveness(ctx, bld.ID, heartbeatTimeout, time.Duration(delay)*time.Second); err != nil {
   433  				return errors.Annotate(err, "failed to enqueue CheckBuildLiveness task").Err()
   434  			}
   435  		}
   436  		return errors.Annotate(datastore.Put(ctx, bld, infra), "failed to save Build and BuildInfra").Err()
   437  	}, nil)
   438  	if txErr != nil {
   439  		logging.Errorf(ctx, "Task failed to save: %s", taskResp.String())
   440  		return transient.Tag.Apply(err)
   441  	}
   442  	return nil
   443  }
   444  
   445  // shouldCheckLiveness checks if Buildbucket should enqueue a task to
   446  // periodically check the build liveness.
   447  func shouldCheckLiveness(ctx context.Context, bld *model.Build, backendCfg *pb.BackendSetting) (bool, uint32, error) {
   448  	if _, ok := backendCfg.Mode.(*pb.BackendSetting_LiteMode_); ok {
   449  		bkt := &model.Bucket{
   450  			ID:     bld.Proto.Builder.Bucket,
   451  			Parent: model.ProjectKey(ctx, bld.Proto.Builder.Project),
   452  		}
   453  		bldr := &model.Builder{
   454  			ID:     bld.Proto.Builder.Builder,
   455  			Parent: datastore.KeyForObj(ctx, bkt),
   456  		}
   457  		if err := datastore.Get(ctx, bldr, bkt); err != nil {
   458  			switch merr, ok := err.(errors.MultiError); {
   459  			case ok && errors.Contains(merr[0], datastore.ErrNoSuchEntity) && bkt.Proto.GetDynamicBuilderTemplate() != nil:
   460  				// It's a dynamic builder.
   461  				return true, bkt.Proto.DynamicBuilderTemplate.Template.GetHeartbeatTimeoutSecs(), nil
   462  			default:
   463  				return false, 0, errors.Annotate(err, "failed to fetch builder %s", bld.BuilderID).Err()
   464  			}
   465  		}
   466  		// No matter whether the hearbeat_timeout_secs is set or not, Buildbucket
   467  		// should always monitor the liveness for the build on TaskBackendLite.
   468  		return true, bldr.Config.GetHeartbeatTimeoutSecs(), nil
   469  	}
   470  	return false, 0, nil
   471  }