go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/tasks/backend.go (about) 1 // Copyright 2022 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tasks 16 17 import ( 18 "context" 19 "encoding/base64" 20 "encoding/json" 21 "fmt" 22 "strconv" 23 "strings" 24 "time" 25 26 "google.golang.org/api/googleapi" 27 codepb "google.golang.org/genproto/googleapis/rpc/code" 28 "google.golang.org/protobuf/proto" 29 "google.golang.org/protobuf/types/known/durationpb" 30 "google.golang.org/protobuf/types/known/structpb" 31 "google.golang.org/protobuf/types/known/timestamppb" 32 33 cipdpb "go.chromium.org/luci/cipd/api/cipd/v1" 34 "go.chromium.org/luci/common/clock" 35 "go.chromium.org/luci/common/errors" 36 "go.chromium.org/luci/common/logging" 37 "go.chromium.org/luci/common/retry/transient" 38 "go.chromium.org/luci/gae/service/datastore" 39 "go.chromium.org/luci/gae/service/info" 40 "go.chromium.org/luci/grpc/prpc" 41 "go.chromium.org/luci/server/caching/layered" 42 "go.chromium.org/luci/server/tq" 43 44 "go.chromium.org/luci/buildbucket/appengine/common" 45 "go.chromium.org/luci/buildbucket/appengine/internal/buildtoken" 46 "go.chromium.org/luci/buildbucket/appengine/internal/clients" 47 "go.chromium.org/luci/buildbucket/appengine/internal/config" 48 "go.chromium.org/luci/buildbucket/appengine/model" 49 pb "go.chromium.org/luci/buildbucket/proto" 50 ) 51 52 const ( 53 // bbagentReservedGracePeriod is the time reserved by bbagent in order to have 54 // time to have a couple retry rounds for UpdateBuild RPCs 55 // TODO(crbug.com/1328646): may need to adjust the grace_period based on 56 // UpdateBuild's new performance in Buildbucket Go. 57 bbagentReservedGracePeriod = 180 58 59 // runTaskGiveUpTimeoutDefault is the default value for how long to retry 60 // the CreateBackendTask before giving up with INFRA_FAILURE. 61 runTaskGiveUpTimeoutDefault = 10 * 60 * time.Second 62 63 cipdCacheTTL = 10 * time.Minute 64 ) 65 66 type cipdPackageDetails struct { 67 Size int64 `json:"size,omitempty"` 68 Hash string `json:"hash,omitempty"` 69 } 70 71 type cipdPackageDetailsMap map[string]*cipdPackageDetails 72 73 var cipdDescribeBootstrapBundleCache = layered.RegisterCache(layered.Parameters[cipdPackageDetailsMap]{ 74 ProcessCacheCapacity: 1000, 75 GlobalNamespace: "cipd-describeBootstrapBundle-v1", 76 Marshal: func(item cipdPackageDetailsMap) ([]byte, error) { 77 return json.Marshal(item) 78 }, 79 Unmarshal: func(blob []byte) (cipdPackageDetailsMap, error) { 80 res := cipdPackageDetailsMap{} 81 err := json.Unmarshal(blob, &res) 82 return res, err 83 }, 84 }) 85 86 type MockCipdClientKey struct{} 87 88 func NewCipdClient(ctx context.Context, host string, project string) (client *prpc.Client, err error) { 89 if mockClient, ok := ctx.Value(MockCipdClientKey{}).(*prpc.Client); ok { 90 return mockClient, nil 91 } 92 client, err = clients.CreateRawPrpcClient(ctx, host, project) 93 return 94 } 95 96 // computeTaskCaches computes the task caches. 97 func computeTaskCaches(infra *model.BuildInfra) []*pb.CacheEntry { 98 caches := make([]*pb.CacheEntry, 0, len(infra.Proto.Backend.GetCaches())+2) 99 if len(infra.Proto.Backend.GetCaches()) > 0 { 100 caches = append(caches, infra.Proto.Backend.Caches...) 101 } 102 if infra.Proto.Buildbucket.GetAgent().GetCipdClientCache() != nil { 103 caches = append(caches, infra.Proto.Buildbucket.Agent.CipdClientCache) 104 } 105 if infra.Proto.Buildbucket.GetAgent().GetCipdPackagesCache() != nil { 106 caches = append(caches, infra.Proto.Buildbucket.Agent.CipdPackagesCache) 107 } 108 return caches 109 } 110 111 func computeAgentArgs(build *pb.Build, infra *pb.BuildInfra) (args []string) { 112 args = []string{} 113 // build-id arg 114 args = append(args, "-build-id") 115 args = append(args, strconv.FormatInt(build.GetId(), 10)) 116 // host arg 117 args = append(args, "-host") 118 args = append(args, infra.Buildbucket.GetHostname()) 119 // cache-base arg 120 args = append(args, "-cache-base") 121 args = append(args, infra.Bbagent.GetCacheDir()) 122 123 // context-file arg 124 args = append(args, "-context-file") 125 args = append(args, "${BUILDBUCKET_AGENT_CONTEXT_FILE}") 126 return 127 } 128 129 // computeBackendPubsubTopic computes the pubsub topic that should be included 130 // in RunTaskRequest. Return an empty string if the backend is in lite mode. 131 func computeBackendPubsubTopic(ctx context.Context, target string, globalCfg *pb.SettingsCfg) (string, error) { 132 if globalCfg == nil { 133 return "", errors.Reason("error fetching service config").Err() 134 } 135 for _, backend := range globalCfg.Backends { 136 if backend.Target == target { 137 switch backend.Mode.(type) { 138 case *pb.BackendSetting_LiteMode_: 139 return "", nil 140 case *pb.BackendSetting_FullMode_: 141 return fmt.Sprintf("projects/%s/topics/%s", info.AppID(ctx), backend.GetFullMode().GetPubsubId()), nil 142 default: 143 return "", errors.Reason("getting pubsub_id from backend %s is not supported", target).Err() 144 } 145 } 146 } 147 return "", errors.Reason("backend %s not found in global settings", target).Err() 148 } 149 150 func computeBackendNewTaskReq(ctx context.Context, build *model.Build, infra *model.BuildInfra, requestID string, globalCfg *pb.SettingsCfg) (*pb.RunTaskRequest, error) { 151 // Create StartBuildToken and secrets. 152 startBuildToken, err := buildtoken.GenerateToken(ctx, build.ID, pb.TokenBody_START_BUILD) 153 if err != nil { 154 return nil, err 155 } 156 secrets := &pb.BuildSecrets{ 157 StartBuildToken: startBuildToken, 158 ResultdbInvocationUpdateToken: build.ResultDBUpdateToken, 159 } 160 backend := infra.Proto.GetBackend() 161 if backend == nil { 162 return nil, errors.New("infra.Proto.Backend isn't set") 163 } 164 caches := computeTaskCaches(infra) 165 gracePeriod := &durationpb.Duration{ 166 Seconds: build.Proto.GetGracePeriod().GetSeconds() + bbagentReservedGracePeriod, 167 } 168 169 startDeadline := ×tamppb.Timestamp{ 170 Seconds: build.Proto.GetCreateTime().GetSeconds() + build.Proto.GetSchedulingTimeout().GetSeconds(), 171 } 172 173 pubsubTopic, err := computeBackendPubsubTopic(ctx, backend.Task.Id.Target, globalCfg) 174 if err != nil { 175 return nil, err 176 } 177 178 // Add task name into backend config. 179 taskName := fmt.Sprintf("bb-%d-%s", build.ID, build.BuilderID) 180 if build.Proto.Number > 0 { 181 taskName = fmt.Sprintf("%s-%d", taskName, build.Proto.Number) 182 } 183 backend.Config.Fields["task_name"] = structpb.NewStringValue(taskName) 184 185 taskReq := &pb.RunTaskRequest{ 186 BuildbucketHost: infra.Proto.Buildbucket.Hostname, 187 Secrets: secrets, 188 Target: backend.Task.Id.Target, 189 RequestId: requestID, 190 BuildId: strconv.FormatInt(build.Proto.Id, 10), 191 Realm: build.Realm(), 192 BackendConfig: backend.Config, 193 ExecutionTimeout: build.Proto.GetExecutionTimeout(), 194 GracePeriod: gracePeriod, 195 Caches: caches, 196 AgentArgs: computeAgentArgs(build.Proto, infra.Proto), 197 Dimensions: infra.Proto.Backend.GetTaskDimensions(), 198 StartDeadline: startDeadline, 199 Experiments: build.Proto.Input.GetExperiments(), 200 PubsubTopic: pubsubTopic, 201 } 202 203 project := build.Proto.Builder.Project 204 taskReq.Agent = &pb.RunTaskRequest_AgentExecutable{} 205 taskReq.Agent.Source, err = extractCipdDetails(ctx, project, infra.Proto) 206 if err != nil { 207 return nil, err 208 } 209 210 build.Proto.Infra = infra.Proto 211 tags := computeTags(ctx, build) 212 tagsAny := make([]any, len(tags)) 213 for i, t := range tags { 214 tagsAny[i] = t 215 } 216 tagsList, err := structpb.NewList(tagsAny) 217 if err != nil { 218 return nil, err 219 } 220 if taskReq.BackendConfig == nil { 221 taskReq.BackendConfig = &structpb.Struct{} 222 } 223 taskReq.BackendConfig.Fields["tags"] = structpb.NewListValue(tagsList) 224 return taskReq, nil 225 } 226 227 func createCipdDescribeBootstrapBundleRequest(infra *pb.BuildInfra) *cipdpb.DescribeBootstrapBundleRequest { 228 prefix := infra.Buildbucket.Agent.Source.GetCipd().GetPackage() 229 prefix = strings.TrimSuffix(prefix, "/${platform}") 230 return &cipdpb.DescribeBootstrapBundleRequest{ 231 Prefix: prefix, 232 Version: infra.Buildbucket.Agent.Source.GetCipd().GetVersion(), 233 } 234 } 235 236 func computeCipdURL(source *pb.BuildInfra_Buildbucket_Agent_Source, pkg string, details *cipdPackageDetails) (url string) { 237 server := source.GetCipd().GetServer() 238 version := source.GetCipd().GetVersion() 239 return server + "/bootstrap/" + pkg + "/+/" + version 240 } 241 242 // extractCipdDetails returns a map that maps package (Prefix + variant for each variant) 243 // to a cipdPackageDetails object, which is just the hash and size. 244 // 245 // A Cipd client is created and calls DescribeBootstrapBundle to retrieve the data. 246 func extractCipdDetails(ctx context.Context, project string, infra *pb.BuildInfra) (details map[string]*pb.RunTaskRequest_AgentExecutable_AgentSource, err error) { 247 cipdServer := infra.Buildbucket.Agent.Source.GetCipd().GetServer() 248 cipdClient, err := NewCipdClient(ctx, cipdServer, project) 249 if err != nil { 250 return nil, err 251 } 252 req := createCipdDescribeBootstrapBundleRequest(infra) 253 bytes, err := proto.Marshal(req) 254 if err != nil { 255 return nil, err 256 } 257 cachePrefix := base64.StdEncoding.EncodeToString(bytes) 258 cipdDetails, err := cipdDescribeBootstrapBundleCache.GetOrCreate(ctx, cachePrefix, func() (cipdPackageDetailsMap, time.Duration, error) { 259 out := &cipdpb.DescribeBootstrapBundleResponse{} 260 err := cipdClient.Call(ctx, "cipd.Repository", "DescribeBootstrapBundle", req, out) 261 if err != nil { 262 return nil, 0, err 263 } 264 resp := make(cipdPackageDetailsMap, len(out.Files)) 265 hasErrFile := false 266 for _, file := range out.Files { 267 if s := file.Status; s != nil && s.Code != int32(codepb.Code_OK) { 268 hasErrFile = true 269 logging.Warningf(ctx, "cannot resolve the package %q: error code - %d, message - %s", file.Package, s.Code, s.Message) 270 continue 271 } 272 resp[file.Package] = &cipdPackageDetails{ 273 Hash: file.Instance.HexDigest, 274 Size: file.Size, 275 } 276 } 277 278 ttl := cipdCacheTTL 279 if hasErrFile { 280 // Sometimes, the cipd package may not exist on one of platforms or its 281 // tag hasn't been populated yet, etc. Choose a shorter cache time in 282 // these situations. 283 ttl = 1 * time.Minute 284 } 285 return resp, ttl, nil 286 }) 287 if err != nil { 288 return nil, errors.Annotate(err, "cache error for cipd request").Err() 289 } 290 details = map[string]*pb.RunTaskRequest_AgentExecutable_AgentSource{} 291 for k, v := range cipdDetails { 292 val := &pb.RunTaskRequest_AgentExecutable_AgentSource{ 293 Sha256: v.Hash, 294 SizeBytes: v.Size, 295 Url: computeCipdURL(infra.Buildbucket.Agent.Source, k, v), 296 } 297 details[k] = val 298 } 299 return 300 } 301 302 // CreateBackendTask creates a backend task for the build. 303 func CreateBackendTask(ctx context.Context, buildID int64, requestID string) error { 304 entities, err := common.GetBuildEntities(ctx, buildID, model.BuildKind, model.BuildInfraKind) 305 if err != nil { 306 return errors.Annotate(err, "failed to get build %d", buildID).Err() 307 } 308 bld := entities[0].(*model.Build) 309 infra := entities[1].(*model.BuildInfra) 310 311 if infra.Proto.GetBackend().GetTask().GetId().GetId() != "" { 312 // This task is likely a retry. 313 // It could happen if the previous RunTask attempt(s) failed, but a backend 314 // task was actually created and associated with the build in the backup 315 // flow. 316 // Bail out. 317 logging.Infof(ctx, "build %d has associated with task %q", buildID, infra.Proto.Backend.Task.Id) 318 return nil 319 } 320 321 globalCfg, err := config.GetSettingsCfg(ctx) 322 if err != nil { 323 return errors.Annotate(err, "could not get global settings config").Err() 324 } 325 326 var backendCfg *pb.BackendSetting 327 for _, backend := range globalCfg.GetBackends() { 328 if backend.Target == infra.Proto.Backend.Task.Id.Target { 329 backendCfg = backend 330 } 331 } 332 if backendCfg == nil { 333 return tq.Fatal.Apply(errors.Reason("failed to get backend config from global settings").Err()) 334 } 335 336 var runTaskGiveUpTimeout time.Duration 337 if backendCfg.TaskCreatingTimeout.GetSeconds() == 0 { 338 runTaskGiveUpTimeout = runTaskGiveUpTimeoutDefault 339 } else { 340 runTaskGiveUpTimeout = backendCfg.TaskCreatingTimeout.AsDuration() 341 } 342 343 // If task creation has already expired, fail the build immediately. 344 if clock.Now(ctx).Sub(bld.CreateTime) >= runTaskGiveUpTimeout { 345 dsPutErr := failBuild(ctx, buildID, "Backend task creation failure.") 346 if dsPutErr != nil { 347 return dsPutErr 348 } 349 return tq.Fatal.Apply(errors.Reason("creating backend task for build %d with requestID %s has expired after %s", buildID, requestID, runTaskGiveUpTimeout.String()).Err()) 350 } 351 352 // Initialize a TaskCreator for creating the backend task. 353 _, isLite := backendCfg.Mode.(*pb.BackendSetting_LiteMode_) 354 backend, err := clients.NewTaskCreator(ctx, bld.Proto.Builder.Project, infra.Proto.Backend.Task.Id.Target, globalCfg, isLite) 355 if err != nil { 356 return tq.Fatal.Apply(errors.Annotate(err, "failed to connect to backend service").Err()) 357 } 358 359 taskReq, err := computeBackendNewTaskReq(ctx, bld, infra, requestID, globalCfg) 360 if err != nil { 361 return tq.Fatal.Apply(err) 362 } 363 364 // Create a backend task via RunTask 365 taskResp, err := backend.RunTask(ctx, taskReq) 366 367 // TODO(b/288158829): remove it once the root cause for the Skia failure is found. 368 if bld.Proto.Builder.Project == "skia" { 369 logging.Debugf(ctx, "RunTaskResponse from skia: %v", taskResp) 370 } 371 372 now := clock.Now(ctx) 373 if err != nil { 374 // Give up if HTTP 500s are happening continuously. Otherwise re-throw the 375 // error so Cloud Tasks retries the task. 376 if apiErr, _ := err.(*googleapi.Error); apiErr == nil || apiErr.Code >= 500 { 377 if now.Sub(bld.CreateTime) < runTaskGiveUpTimeout { 378 return transient.Tag.Apply(errors.Annotate(err, "failed to create a backend task").Err()) 379 } 380 logging.Errorf(ctx, "Give up backend task creation retry after %s", runTaskGiveUpTimeout.String()) 381 } 382 logging.Errorf(ctx, "Backend task creation failure:%s. RunTask request: %+v", err, taskReq) 383 dsPutErr := failBuild(ctx, bld.ID, "Backend task creation failure.") 384 if dsPutErr != nil { 385 return dsPutErr 386 } 387 return tq.Fatal.Apply(errors.Annotate(err, "failed to create a backend task").Err()) 388 } 389 if taskResp.Task.GetUpdateId() == 0 { 390 return tq.Fatal.Apply(errors.Reason("task returned with an updateID of 0").Err()) 391 } 392 393 checkLiveness, heartbeatTimeout, err := shouldCheckLiveness(ctx, bld, backendCfg) 394 if err != nil { 395 return transient.Tag.Apply(err) 396 } 397 398 txErr := datastore.RunInTransaction(ctx, func(ctx context.Context) error { 399 entities, err := common.GetBuildEntities(ctx, buildID, model.BuildKind, model.BuildInfraKind) 400 if err != nil { 401 return errors.Annotate(err, "failed to get build %d", buildID).Err() 402 } 403 bld = entities[0].(*model.Build) 404 infra = entities[1].(*model.BuildInfra) 405 406 infra.Proto.Backend.Task = taskResp.Task 407 408 // Update Build entity. 409 bld.Proto.UpdateTime = timestamppb.New(now) 410 target := taskResp.Task.Id.Target 411 for _, backendSetting := range globalCfg.Backends { 412 if backendSetting.Target == target { 413 if backendSetting.GetFullMode().GetBuildSyncSetting() != nil { 414 bld.BackendTarget = target 415 interval := backendSetting.GetFullMode().GetBuildSyncSetting().GetSyncIntervalSeconds() 416 if interval > 0 { 417 bld.BackendSyncInterval = time.Duration(interval) * time.Second 418 } 419 bld.GenerateNextBackendSyncTime(ctx, backendSetting.GetFullMode().GetBuildSyncSetting().GetShards()) 420 } 421 break 422 } 423 } 424 425 if checkLiveness { 426 // SchedulingTimeout is always set in schedule_build flow. 427 delay := bld.Proto.SchedulingTimeout.Seconds 428 if heartbeatTimeout != 0 && int64(heartbeatTimeout) < delay { 429 // Better to choose a shorter delay as a first CheckBuildLiveness task. 430 delay = int64(heartbeatTimeout) 431 } 432 if err = CheckBuildLiveness(ctx, bld.ID, heartbeatTimeout, time.Duration(delay)*time.Second); err != nil { 433 return errors.Annotate(err, "failed to enqueue CheckBuildLiveness task").Err() 434 } 435 } 436 return errors.Annotate(datastore.Put(ctx, bld, infra), "failed to save Build and BuildInfra").Err() 437 }, nil) 438 if txErr != nil { 439 logging.Errorf(ctx, "Task failed to save: %s", taskResp.String()) 440 return transient.Tag.Apply(err) 441 } 442 return nil 443 } 444 445 // shouldCheckLiveness checks if Buildbucket should enqueue a task to 446 // periodically check the build liveness. 447 func shouldCheckLiveness(ctx context.Context, bld *model.Build, backendCfg *pb.BackendSetting) (bool, uint32, error) { 448 if _, ok := backendCfg.Mode.(*pb.BackendSetting_LiteMode_); ok { 449 bkt := &model.Bucket{ 450 ID: bld.Proto.Builder.Bucket, 451 Parent: model.ProjectKey(ctx, bld.Proto.Builder.Project), 452 } 453 bldr := &model.Builder{ 454 ID: bld.Proto.Builder.Builder, 455 Parent: datastore.KeyForObj(ctx, bkt), 456 } 457 if err := datastore.Get(ctx, bldr, bkt); err != nil { 458 switch merr, ok := err.(errors.MultiError); { 459 case ok && errors.Contains(merr[0], datastore.ErrNoSuchEntity) && bkt.Proto.GetDynamicBuilderTemplate() != nil: 460 // It's a dynamic builder. 461 return true, bkt.Proto.DynamicBuilderTemplate.Template.GetHeartbeatTimeoutSecs(), nil 462 default: 463 return false, 0, errors.Annotate(err, "failed to fetch builder %s", bld.BuilderID).Err() 464 } 465 } 466 // No matter whether the hearbeat_timeout_secs is set or not, Buildbucket 467 // should always monitor the liveness for the build on TaskBackendLite. 468 return true, bldr.Config.GetHeartbeatTimeoutSecs(), nil 469 } 470 return false, 0, nil 471 }