go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/tasks/swarming.go

// Copyright 2022 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tasks

import (
	"context"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/google/uuid"
	"google.golang.org/api/googleapi"
	"google.golang.org/protobuf/proto"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/gae/service/datastore"
	"go.chromium.org/luci/gae/service/info"
	"go.chromium.org/luci/server/auth/realms"
	"go.chromium.org/luci/server/caching"
	"go.chromium.org/luci/server/tq"
	apipb "go.chromium.org/luci/swarming/proto/api_v2"

	"go.chromium.org/luci/buildbucket"
	"go.chromium.org/luci/buildbucket/appengine/internal/buildstatus"
	"go.chromium.org/luci/buildbucket/appengine/internal/buildtoken"
	"go.chromium.org/luci/buildbucket/appengine/internal/clients"
	"go.chromium.org/luci/buildbucket/appengine/internal/metrics"
	"go.chromium.org/luci/buildbucket/appengine/model"
	taskdefs "go.chromium.org/luci/buildbucket/appengine/tasks/defs"
	"go.chromium.org/luci/buildbucket/cmd/bbagent/bbinput"
	pb "go.chromium.org/luci/buildbucket/proto"
	"go.chromium.org/luci/buildbucket/protoutil"
)

const (
	// cacheDir is the path, relative to the swarming run dir, to the directory that
	// contains the mounted swarming named caches. It will be prepended to paths of
	// caches defined in global or builder configs.
	cacheDir = "cache"

	// pubsubTopicTemplate is the topic template where Swarming publishes
	// notifications on the task update.
	pubsubTopicTemplate = "projects/%s/topics/swarming-go"

	// swarmingCreateTaskGiveUpTimeout indicates how long to retry
	// the createSwarmingTask before giving up with INFRA_FAILURE.
	swarmingCreateTaskGiveUpTimeout = 10 * 60 * time.Second

	// swarmingTimeFormat is time format used by swarming.
	swarmingTimeFormat = "2006-01-02T15:04:05.999999999"
)

// userdata will be sent back and forth between Swarming and Buildbucket.
type userdata struct {
	BuildID          int64  `json:"build_id"`
	CreatedTS        int64  `json:"created_ts"`
	SwarmingHostname string `json:"swarming_hostname"`
}

// notification captures all fields that Buildbucket needs from the message of Swarming notification subscription.
type notification struct {
	messageID string
	taskID    string
	*userdata
}

// SyncBuild synchronizes the build with Swarming.
// If the swarming task does not exist yet, creates it.
// Otherwise, updates the build state to match swarming task state.
// Enqueues a new sync push task if the build did not end.
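// The sync push task is re-enqueued (with generation+1) every 5 minutes until
// the build ends or exceeds model.BuildMaxCompletionTime.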
//
// Cloud tasks handler will retry the task if any error is thrown, unless it's
// tagged with tq.Fatal.
func SyncBuild(ctx context.Context, buildID int64, generation int64) error {
	bld := &model.Build{ID: buildID}
	infra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, bld)}
	switch err := datastore.Get(ctx, bld, infra); {
	case errors.Contains(err, datastore.ErrNoSuchEntity):
		return tq.Fatal.Apply(errors.Annotate(err, "build %d or buildInfra not found", buildID).Err())
	case err != nil:
		return transient.Tag.Apply(errors.Annotate(err, "failed to fetch build %d or buildInfra", buildID).Err())
	}
	if protoutil.IsEnded(bld.Status) {
		logging.Infof(ctx, "build %d is ended", buildID)
		return nil
	}
	if clock.Now(ctx).Sub(bld.CreateTime) > model.BuildMaxCompletionTime {
		logging.Infof(ctx, "build %d (create_time:%s) has passed the sync deadline: %s", buildID, bld.CreateTime, model.BuildMaxCompletionTime)
		return nil
	}

	bld.Proto.Infra = infra.Proto
	swarm, err := clients.NewSwarmingClient(ctx, infra.Proto.Swarming.Hostname, bld.Project)
	if err != nil {
		logging.Errorf(ctx, "failed to create a swarming client for build %d (%s), in %s: %s", buildID, bld.Project, infra.Proto.Swarming.Hostname, err)
		return failBuild(ctx, buildID, fmt.Sprintf("failed to create a swarming client:%s", err))
	}
	if bld.Proto.Infra.Swarming.TaskId == "" {
		if err := createSwarmingTask(ctx, bld, swarm); err != nil {
			// Mark build as Infra_failure for fatal and non-retryable errors.
			if tq.Fatal.In(err) {
				return failBuild(ctx, bld.ID, err.Error())
			}
			return err
		}
	} else {
		if err := syncBuildWithTaskResult(ctx, bld.ID, bld.Proto.Infra.Swarming.TaskId, swarm); err != nil {
			// Tq should retry non-fatal errors.
			if !tq.Fatal.In(err) {
				return err
			}
			// For fatal errors, we just log it and continue to the part of enqueueing
			// the next generation sync task.
			logging.Errorf(ctx, "Dropping the sync task due to the fatal error: %s", err)
		}
	}

	// Enqueue a continuation sync task in 5m.
	if clock.Now(ctx).Sub(bld.CreateTime) < model.BuildMaxCompletionTime {
		if err := SyncSwarmingBuildTask(ctx, &taskdefs.SyncSwarmingBuildTask{BuildId: buildID, Generation: generation + 1}, 5*time.Minute); err != nil {
			return transient.Tag.Apply(errors.Annotate(err, "failed to enqueue the continuation sync task for build %d", buildID).Err())
		}
	}
	return nil
}

// SubNotify handles swarming-go PubSub push messages produced by Swarming.
// For a retryable error, it will be tagged with transient.Tag.
func SubNotify(ctx context.Context, body io.Reader) error {
	nt, err := unpackMsg(ctx, body)
	if err != nil {
		return err
	}
	// TODO(crbug/1328646): delete the log once the new Go flow becomes stable.
	logging.Infof(ctx, "Received message - messageID:%s, taskID:%s, userdata:%+v", nt.messageID, nt.taskID, nt.userdata)

	// Try not to process same message more than once.
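	// Processed message IDs are recorded in the global cache for 10 minutes
	// (see the cache.Set call at the end of this function).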
	cache := caching.GlobalCache(ctx, "swarming-pubsub-msg-id")
	if cache == nil {
		return errors.Reason("global cache is not found").Tag(transient.Tag).Err()
	}
	msgCached, err := cache.Get(ctx, nt.messageID)
	switch {
	case err == caching.ErrCacheMiss: // no-op, continue
	case err != nil:
		return errors.Annotate(err, "failed to read %s from the global cache", nt.messageID).Tag(transient.Tag).Err()
	case msgCached != nil:
		logging.Infof(ctx, "seen this message %s before, ignoring", nt.messageID)
		return nil
	}

	taskURL := func(hostname, taskID string) string {
		return fmt.Sprintf("https://%s/task?id=%s", hostname, taskID)
	}
	// load build and build infra.
	logging.Infof(ctx, "received swarming notification for build %d", nt.BuildID)
	bld := &model.Build{ID: nt.BuildID}
	infra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, bld)}
	switch err := datastore.Get(ctx, bld, infra); {
	case errors.Contains(err, datastore.ErrNoSuchEntity):
		if clock.Now(ctx).Sub(time.Unix(0, nt.CreatedTS*int64(time.Microsecond)).UTC()) < time.Minute {
			return errors.Annotate(err, "Build %d or BuildInfra for task %s not found yet", nt.BuildID, taskURL(nt.SwarmingHostname, nt.taskID)).Tag(transient.Tag).Err()
		}
		return errors.Annotate(err, "Build %d or BuildInfra for task %s not found", nt.BuildID, taskURL(nt.SwarmingHostname, nt.taskID)).Err()
	case err != nil:
		return errors.Annotate(err, "failed to fetch build %d or buildInfra", nt.BuildID).Tag(transient.Tag).Err()
	}
	if protoutil.IsEnded(bld.Status) {
		logging.Infof(ctx, "build(%d) is completed and immutable.", nt.BuildID)
		return nil
	}

	// ensure the loaded build is associated with the task.
	bld.Proto.Infra = infra.Proto
	sw := bld.Proto.GetInfra().GetSwarming()
	if nt.SwarmingHostname != sw.GetHostname() {
		return errors.Reason("swarming_hostname %s of build %d does not match %s", sw.Hostname, nt.BuildID, nt.SwarmingHostname).Err()
	}
	if strings.TrimSpace(sw.GetTaskId()) == "" {
		return errors.Reason("build %d is not associated with a task", nt.BuildID).Tag(transient.Tag).Err()
	}
	if nt.taskID != sw.GetTaskId() {
		return errors.Reason("swarming_task_id %s of build %d does not match %s", sw.TaskId, nt.BuildID, nt.taskID).Err()
	}

	// update build.
	swarm, err := clients.NewSwarmingClient(ctx, sw.Hostname, bld.Project)
	if err != nil {
		return errors.Annotate(err, "failed to create a swarming client for build %d (%s), in %s", nt.BuildID, bld.Project, sw.Hostname).Err()
	}
	if err := syncBuildWithTaskResult(ctx, nt.BuildID, sw.TaskId, swarm); err != nil {
		return err
	}

	return cache.Set(ctx, nt.messageID, []byte{1}, 10*time.Minute)
}

// HandleCancelSwarmingTask sends a cancellation request for the given Swarming
// task. Non-retryable errors are tagged with tq.Fatal; transient Swarming
// errors are tagged with transient.Tag so the task can be retried.
func HandleCancelSwarmingTask(ctx context.Context, hostname string, taskID string, realm string) error {
	// Validate
	switch err := realms.ValidateRealmName(realm, realms.GlobalScope); {
	case err != nil:
		return tq.Fatal.Apply(err)
	case hostname == "":
		return errors.Reason("hostname is empty").Tag(tq.Fatal).Err()
	case taskID == "":
		return errors.Reason("taskID is empty").Tag(tq.Fatal).Err()
	}

	// Send the cancellation request.
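	// The Swarming client is created for the LUCI project derived from the realm.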
	project, _ := realms.Split(realm)
	swarm, err := clients.NewSwarmingClient(ctx, hostname, project)
	if err != nil {
		return errors.Annotate(err, "failed to create a swarming client for task %s in %s", taskID, hostname).Tag(tq.Fatal).Err()
	}
	res, err := swarm.CancelTask(ctx, &apipb.TaskCancelRequest{KillRunning: true, TaskId: taskID})
	if err != nil {
		if apiErr, ok := err.(*googleapi.Error); ok && apiErr.Code >= 500 {
			return errors.Annotate(err, "transient error in cancelling the task %s", taskID).Tag(transient.Tag).Err()
		}
		return errors.Annotate(err, "fatal error in cancelling the task %s", taskID).Tag(tq.Fatal).Err()
	}

	// Non-Canceled in the body indicates the task may have already ended. Hence, just logging it.
	if !res.Canceled {
		logging.Warningf(ctx, "Swarming response for cancelling task %s: %+v", taskID, res)
	}
	return nil
}

// unpackMsg unpacks swarming-go pubsub message and extracts message id,
// swarming hostname, creation time, task id and build id.
func unpackMsg(ctx context.Context, body io.Reader) (*notification, error) {
	blob, err := io.ReadAll(body)
	if err != nil {
		return nil, errors.Annotate(err, "failed to read the request body").Tag(transient.Tag).Err()
	}

	// process pubsub message
	// See https://cloud.google.com/pubsub/docs/push#receive_push
	var msg struct {
		Message struct {
			Attributes map[string]string `json:"attributes,omitempty"`
			Data       string            `json:"data,omitempty"`
			MessageID  string            `json:"messageId,omitempty"`
		} `json:"message"`
	}
	if err := json.Unmarshal(blob, &msg); err != nil {
		return nil, errors.Annotate(err, "failed to unmarshal swarming PubSub message").Err()
	}

	// process swarming message data
	swarmData, err := base64.StdEncoding.DecodeString(msg.Message.Data)
	if err != nil {
		return nil, errors.Annotate(err, "cannot decode message data as base64").Err()
	}
	var data struct {
		TaskID   string `json:"task_id"`
		Userdata string `json:"userdata"`
	}
	if err := json.Unmarshal(swarmData, &data); err != nil {
		return nil, errors.Annotate(err, "failed to unmarshal the swarming pubsub data").Err()
	}
	ud := &userdata{}
	if err := json.Unmarshal([]byte(data.Userdata), ud); err != nil {
		return nil, errors.Annotate(err, "failed to unmarshal userdata").Err()
	}

	// validate swarming message data
	switch {
	case strings.TrimSpace(data.TaskID) == "":
		return nil, errors.Reason("task_id not found in message data").Err()
	case ud.BuildID <= 0:
		return nil, errors.Reason("invalid build_id %d", ud.BuildID).Err()
	case ud.CreatedTS <= 0:
		return nil, errors.Reason("invalid created_ts %d", ud.CreatedTS).Err()
	case strings.TrimSpace(ud.SwarmingHostname) == "":
		return nil, errors.Reason("swarming hostname not found in userdata").Err()
	case strings.Contains(ud.SwarmingHostname, "://"):
		return nil, errors.Reason("swarming hostname %s must not contain '://'", ud.SwarmingHostname).Err()
	}

	return &notification{
		messageID: msg.Message.MessageID,
		taskID:    data.TaskID,
		userdata:  ud,
	}, nil
}

// syncBuildWithTaskResult syncs Build entity in the datastore with a result of the swarming task.
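// Retryable failures are tagged with transient.Tag; an invalid or vanished
// swarming task fails the build instead.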
func syncBuildWithTaskResult(ctx context.Context, buildID int64, taskID string, swarm clients.SwarmingClient) error {
	taskResult, err := swarm.GetTaskResult(ctx, taskID)
	if err != nil {
		logging.Errorf(ctx, "failed to fetch swarming task %s for build %d: %s", taskID, buildID, err)
		if apiErr, ok := err.(*googleapi.Error); ok && apiErr.Code >= 400 && apiErr.Code < 500 {
			return failBuild(ctx, buildID, fmt.Sprintf("invalid swarming task %s", taskID))
		}
		return transient.Tag.Apply(err)
	}
	if taskResult == nil {
		return failBuild(ctx, buildID, fmt.Sprintf("Swarming task %s unexpectedly disappeared", taskID))
	}

	var statusChanged bool
	bld := &model.Build{
		ID: buildID,
	}
	err = datastore.RunInTransaction(ctx, func(ctx context.Context) (err error) {
		infra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, bld)}
		if err := datastore.Get(ctx, bld, infra); err != nil {
			return transient.Tag.Apply(errors.Annotate(err, "failed to fetch build or buildInfra: %d", bld.ID).Err())
		}

		if protoutil.IsEnded(bld.Status) {
			return nil
		}
		if bld.Status == pb.Status_STARTED && taskResult.State == apipb.TaskState_PENDING {
			// Most probably, race between PubSub push handler and Cron job.
			// With swarming, a build cannot go from STARTED back to PENDING,
			// so ignore this.
			return nil
		}

		botDimsChanged := updateBotDimensions(infra, taskResult)

		bs, steps, err := updateBuildStatusFromTaskResult(ctx, bld, taskResult)
		if err != nil {
			return tq.Fatal.Apply(err)
		}

		shouldUpdate := false
		if bs != nil {
			shouldUpdate = true
			statusChanged = true
		}
		if bs == nil && botDimsChanged && bld.Proto.Status == pb.Status_STARTED {
			shouldUpdate = true
		}
		if !shouldUpdate {
			return nil
		}

		toPut := []any{bld, infra}
		if bs != nil {
			toPut = append(toPut, bs)
		}
		if steps != nil {
			toPut = append(toPut, steps)
		}
		return transient.Tag.Apply(datastore.Put(ctx, toPut...))
	}, nil)

	switch {
	case err != nil:
	case !statusChanged:
	case bld.Status == pb.Status_STARTED:
		metrics.BuildStarted(ctx, bld)
	case protoutil.IsEnded(bld.Status):
		metrics.BuildCompleted(ctx, bld)
	}
	return err
}

// updateBotDimensions mutates the infra entity to update the bot dimensions
// according to the given task result.
// Note, it will not write the entities into Datastore.
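// It returns true if the bot dimensions changed.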
func updateBotDimensions(infra *model.BuildInfra, taskResult *apipb.TaskResultResponse) bool {
	sw := infra.Proto.Swarming
	botDimsChanged := false

	// Update BotDimensions
	oldBotDimsMap := protoutil.StringPairMap(sw.BotDimensions)
	newBotDims := []*pb.StringPair{}
	for _, dim := range taskResult.BotDimensions {
		for _, v := range dim.Value {
			if !botDimsChanged && !oldBotDimsMap.Contains(dim.Key, v) {
				botDimsChanged = true
			}
			newBotDims = append(newBotDims, &pb.StringPair{Key: dim.Key, Value: v})
		}
	}
	if len(newBotDims) != len(sw.BotDimensions) {
		botDimsChanged = true
	}
	sw.BotDimensions = newBotDims

	sort.Slice(sw.BotDimensions, func(i, j int) bool {
		if sw.BotDimensions[i].Key == sw.BotDimensions[j].Key {
			return sw.BotDimensions[i].Value < sw.BotDimensions[j].Value
		}
		return sw.BotDimensions[i].Key < sw.BotDimensions[j].Key
	})
	return botDimsChanged
}

// updateBuildStatusFromTaskResult mutates the build entity to update the top
// level status, and also the update time of the build.
// Note, it will not write the entities into Datastore.
func updateBuildStatusFromTaskResult(ctx context.Context, bld *model.Build, taskResult *apipb.TaskResultResponse) (bs *model.BuildStatus, steps *model.BuildSteps, err error) {
	now := clock.Now(ctx)
	oldStatus := bld.Status
	// A helper function to correctly set Build ended status from taskResult. It
	// corrects the build start_time only if start_time is empty and taskResult
	// has start_ts populated.
	setEndStatus := func(st pb.Status, details *pb.StatusDetails) {
		if !protoutil.IsEnded(st) {
			return
		}
		if bld.Proto.StartTime == nil && taskResult.StartedTs.AsTime().Unix() != 0 {
			startTime := taskResult.StartedTs.AsTime()
			// Backfill build start time.
			protoutil.SetStatus(startTime, bld.Proto, pb.Status_STARTED)
		}

		endTime := now
		if t := taskResult.CompletedTs; t != nil {
			endTime = t.AsTime()
		} else if t := taskResult.AbandonedTs; t != nil {
			endTime = t.AsTime()
		}
		// It is possible that swarming task was marked as NO_RESOURCE the moment
		// it was created. Swarming VM time is not synchronized with buildbucket VM
		// time, so adjust end_time if needed.
		if endTime.Before(bld.Proto.CreateTime.AsTime()) {
			endTime = bld.Proto.CreateTime.AsTime()
		}

		stWithDetails := &buildstatus.StatusWithDetails{Status: st, Details: details}
		bs, steps, err = updateBuildStatusOnTaskStatusChange(ctx, bld, stWithDetails, stWithDetails, endTime)
	}

	// Update build status
	switch taskResult.State {
	case apipb.TaskState_PENDING:
		if bld.Status == pb.Status_STATUS_UNSPECIFIED {
			// Scheduled Build should have SCHEDULED status already, so in theory this
			// should not happen.
			// Adding a log to confirm this.
			logging.Debugf(ctx, "build %d has unspecified status, setting it to pending", bld.ID)
			protoutil.SetStatus(now, bld.Proto, pb.Status_SCHEDULED)
		} else {
			// Most probably, race between PubSub push handler and Cron job.
			// With swarming, a build cannot go from STARTED/ended back to PENDING,
			// so ignore this.
			return
		}
	case apipb.TaskState_RUNNING:
		updateTime := now
		if t := taskResult.StartedTs; t != nil {
			updateTime = t.AsTime()
		}
		stWithDetails := &buildstatus.StatusWithDetails{Status: pb.Status_STARTED}
		bs, steps, err = updateBuildStatusOnTaskStatusChange(ctx, bld, stWithDetails, stWithDetails, updateTime)
	case apipb.TaskState_CANCELED, apipb.TaskState_KILLED:
		setEndStatus(pb.Status_CANCELED, nil)
	case apipb.TaskState_NO_RESOURCE:
		setEndStatus(pb.Status_INFRA_FAILURE, &pb.StatusDetails{
			ResourceExhaustion: &pb.StatusDetails_ResourceExhaustion{},
		})
	case apipb.TaskState_EXPIRED:
		setEndStatus(pb.Status_INFRA_FAILURE, &pb.StatusDetails{
			ResourceExhaustion: &pb.StatusDetails_ResourceExhaustion{},
			Timeout:            &pb.StatusDetails_Timeout{},
		})
	case apipb.TaskState_TIMED_OUT:
		setEndStatus(pb.Status_INFRA_FAILURE, &pb.StatusDetails{
			Timeout: &pb.StatusDetails_Timeout{},
		})
	case apipb.TaskState_BOT_DIED, apipb.TaskState_CLIENT_ERROR:
		// BB only supplies bbagent CIPD packages in a task, no other user packages.
		// So the CLIENT_ERROR task state should be treated as build INFRA_FAILURE.
		setEndStatus(pb.Status_INFRA_FAILURE, nil)
	case apipb.TaskState_COMPLETED:
		if taskResult.Failure {
			switch bld.Proto.Output.GetStatus() {
			case pb.Status_FAILURE:
				setEndStatus(pb.Status_FAILURE, nil)
			case pb.Status_CANCELED:
				setEndStatus(pb.Status_CANCELED, nil)
			default:
				// If this truly was a non-infra failure, bbagent would catch that and
				// mark the build as FAILURE.
				// That did not happen, so this is an infra failure.
				setEndStatus(pb.Status_INFRA_FAILURE, nil)
			}
		} else {
			finalStatus := pb.Status_SUCCESS
			if protoutil.IsEnded(bld.Proto.Output.GetStatus()) {
				// Swarming task ends with COMPLETED(SUCCESS), use the build status
				// as final status.
				finalStatus = bld.Proto.Output.GetStatus()
			}
			setEndStatus(finalStatus, nil)
		}
	default:
		err = errors.Reason("Unexpected task state: %s", taskResult.State).Err()
		return
	}

	if bld.Proto.Status != oldStatus {
		logging.Infof(ctx, "Build %d status: %s -> %s", bld.ID, oldStatus, bld.Proto.Status)
		return
	}
	return
}

// createSwarmingTask creates a swarming task for the build.
// Requires build.proto.infra to be populated.
// If the returned error is fatal and non-retryable, the tq.Fatal tag will be added.
func createSwarmingTask(ctx context.Context, build *model.Build, swarm clients.SwarmingClient) error {
	taskReq, err := computeSwarmingNewTaskReq(ctx, build)
	if err != nil {
		return tq.Fatal.Apply(err)
	}

	// Insert secret bytes.
	token, err := buildtoken.GenerateToken(ctx, build.ID, pb.TokenBody_BUILD)
	if err != nil {
		return tq.Fatal.Apply(err)
	}
	secrets := &pb.BuildSecrets{
		StartBuildToken:               token,
		BuildToken:                    token,
		ResultdbInvocationUpdateToken: build.ResultDBUpdateToken,
	}
	secretsBytes, err := proto.Marshal(secrets)
	if err != nil {
		return tq.Fatal.Apply(err)
	}
	for _, t := range taskReq.TaskSlices {
		t.Properties.SecretBytes = secretsBytes
	}

	// Create a swarming task
	res, err := swarm.CreateTask(ctx, taskReq)
	if err != nil {
		// Give up if HTTP 500s are happening continuously. Otherwise re-throw the
		// error so Cloud Tasks retries the task.
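		// Note: an error that is not a *googleapi.Error (e.g. a transport error)
		// is treated like a 5xx and retried until swarmingCreateTaskGiveUpTimeout.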
		if apiErr, _ := err.(*googleapi.Error); apiErr == nil || apiErr.Code >= 500 {
			if clock.Now(ctx).Sub(build.CreateTime) < swarmingCreateTaskGiveUpTimeout {
				return transient.Tag.Apply(errors.Annotate(err, "failed to create a swarming task").Err())
			}
			logging.Errorf(ctx, "Give up Swarming task creation retry after %s", swarmingCreateTaskGiveUpTimeout.String())
		}
		// Strip out secret bytes and dump the task definition to the log.
		for _, t := range taskReq.TaskSlices {
			t.Properties.SecretBytes = nil
		}
		logging.Errorf(ctx, "Swarming task creation failure:%s. CreateTask request: %+v\nResponse: %+v", err, taskReq, res)
		return tq.Fatal.Apply(errors.Annotate(err, "failed to create a swarming task").Err())
	}

	// Update the build with the build token and new task id.
	err = datastore.RunInTransaction(ctx, func(ctx context.Context) error {
		bld := &model.Build{
			ID: build.ID,
		}
		infra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, bld)}
		if err := datastore.Get(ctx, bld, infra); err != nil {
			return errors.Annotate(err, "failed to fetch build or buildInfra: %d", bld.ID).Err()
		}

		if infra.Proto.Swarming.TaskId != "" {
			return errors.Reason("build already has a task %s", infra.Proto.Swarming.TaskId).Err()
		}
		infra.Proto.Swarming.TaskId = res.TaskId
		bld.UpdateToken = token

		return datastore.Put(ctx, bld, infra)
	}, nil)
	if err != nil {
		// Now that swarm.CreateTask is idempotent, we should reuse the task
		// instead of cancelling it.
		logging.Errorf(ctx, "created a task %s, but failed to update datastore with the error:%s", res.TaskId, err)
		return transient.Tag.Apply(errors.Annotate(err, "failed to update build %d", build.ID).Err())
	}
	return nil
}

func computeSwarmingNewTaskReq(ctx context.Context, build *model.Build) (*apipb.NewTaskRequest, error) {
	sw := build.Proto.GetInfra().GetSwarming()
	if sw == nil {
		return nil, errors.New("build.Proto.Infra.Swarming isn't set")
	}
	taskReq := &apipb.NewTaskRequest{
		// to prevent accidental multiple task creation
		RequestUuid:      uuid.NewSHA1(uuid.Nil, []byte(strconv.FormatInt(build.ID, 10))).String(),
		Name:             fmt.Sprintf("bb-%d-%s", build.ID, build.BuilderID),
		Realm:            build.Realm(),
		Tags:             computeTags(ctx, build),
		Priority:         int32(sw.Priority),
		PoolTaskTemplate: apipb.NewTaskRequest_SKIP,
	}

	if build.Proto.Number > 0 {
		taskReq.Name = fmt.Sprintf("%s-%d", taskReq.Name, build.Proto.Number)
	}

	taskSlices, err := computeTaskSlice(build)
	if err != nil {
		return nil, errors.Annotate(err, "failed to compute task slices").Err()
	}
	taskReq.TaskSlices = taskSlices

	// Only make swarming track the build's parent if Buildbucket doesn't
	// track it.
	// Buildbucket should track the parent/child build relationships for all
	// Buildbucket Builds.
	// Except for children of led builds, whose parents are still tracked by
	// swarming using sw.parent_run_id.
	// TODO(crbug.com/1031205): remove the check on
	// luci.buildbucket.parent_tracking after this experiment is on globally and
	// we're ready to remove it.
	if sw.ParentRunId != "" && (len(build.Proto.AncestorIds) == 0 ||
		!strings.Contains(build.ExperimentsString(), buildbucket.ExperimentParentTracking)) {
		taskReq.ParentTaskId = sw.ParentRunId
	}

	if sw.TaskServiceAccount != "" {
		taskReq.ServiceAccount = sw.TaskServiceAccount
	}

	taskReq.PubsubTopic = fmt.Sprintf(pubsubTopicTemplate, info.AppID(ctx))
	ud := &userdata{
		BuildID:          build.ID,
		CreatedTS:        clock.Now(ctx).UnixNano() / int64(time.Microsecond),
		SwarmingHostname: sw.Hostname,
	}
	udBytes, err := json.Marshal(ud)
	if err != nil {
		return nil, errors.Annotate(err, "failed to marshal pubsub userdata").Err()
	}
	taskReq.PubsubUserdata = string(udBytes)
	return taskReq, err
}

// computeTags computes the Swarming task request tags to use.
// Note it doesn't compute kitchen related tags.
func computeTags(ctx context.Context, build *model.Build) []string {
	tags := []string{
		"buildbucket_bucket:" + build.BucketID,
		fmt.Sprintf("buildbucket_build_id:%d", build.ID),
		fmt.Sprintf("buildbucket_hostname:%s", build.Proto.GetInfra().GetBuildbucket().GetHostname()),
		"luci_project:" + build.Project,
	}
	if build.Canary {
		tags = append(tags, "buildbucket_template_canary:1")
	} else {
		tags = append(tags, "buildbucket_template_canary:0")
	}

	tags = append(tags, build.Tags...)
	sort.Strings(tags)
	return tags
}

// computeTaskSlice computes swarming task slices.
// build.Proto.Infra must be set.
func computeTaskSlice(build *model.Build) ([]*apipb.TaskSlice, error) {
	// expiration_secs -> []*StringPair
	dims := map[int64][]*apipb.StringPair{}
	for _, cache := range build.Proto.GetInfra().GetSwarming().GetCaches() {
		expSecs := cache.WaitForWarmCache.GetSeconds()
		if expSecs <= 0 {
			continue
		}
		if _, ok := dims[expSecs]; !ok {
			dims[expSecs] = []*apipb.StringPair{}
		}
		dims[expSecs] = append(dims[expSecs], &apipb.StringPair{
			Key:   "caches",
			Value: cache.Name,
		})
	}
	for _, dim := range build.Proto.GetInfra().GetSwarming().GetTaskDimensions() {
		expSecs := dim.Expiration.GetSeconds()
		if _, ok := dims[expSecs]; !ok {
			dims[expSecs] = []*apipb.StringPair{}
		}
		dims[expSecs] = append(dims[expSecs], &apipb.StringPair{
			Key:   dim.Key,
			Value: dim.Value,
		})
	}

	// extract base dim and delete it from the map.
	baseDim, ok := dims[0]
	if !ok {
		baseDim = []*apipb.StringPair{}
	}
	delete(dims, 0)
	if len(dims) > 6 {
		return nil, errors.New("At most 6 different expiration_secs to be allowed in swarming")
	}

	baseSlice := &apipb.TaskSlice{
		ExpirationSecs:  int32(build.Proto.GetSchedulingTimeout().GetSeconds()),
		WaitForCapacity: build.Proto.GetWaitForCapacity(),
		Properties: &apipb.TaskProperties{
			CipdInput:            computeCipdInput(build),
			ExecutionTimeoutSecs: int32(build.Proto.GetExecutionTimeout().GetSeconds()),
			GracePeriodSecs:      int32(build.Proto.GetGracePeriod().GetSeconds() + bbagentReservedGracePeriod),
			Caches:               computeTaskSliceCaches(build),
			Dimensions:           baseDim,
			EnvPrefixes:          computeEnvPrefixes(build),
			Env: []*apipb.StringPair{
				{Key: "BUILDBUCKET_EXPERIMENTAL", Value: strings.ToUpper(strconv.FormatBool(build.Experimental))},
			},
			Command: computeCommand(build),
		},
	}

	// sort dims map by expiration_sec.
	var expSecs []int
	for expSec := range dims {
		expSecs = append(expSecs, int(expSec))
	}
	sort.Ints(expSecs)

	// TODO(vadimsh): Remove this when no longer needed, ETA Oct 2022. This is
	// used to load test Swarming's slice expiration mechanism.
	sliceWaitForCapacity := build.Proto.GetWaitForCapacity() &&
		strings.Contains(build.ExperimentsString(), buildbucket.ExperimentWaitForCapacity)

	// Create extra task slices by copying the base task slice, adding the
	// corresponding expiration and desired dimensions to each.
	lastExp := 0
	taskSlices := make([]*apipb.TaskSlice, len(expSecs)+1)
	for i, sec := range expSecs {
		prop := &apipb.TaskProperties{}
		if err := deepCopy(baseSlice.Properties, prop); err != nil {
			return nil, err
		}
		taskSlices[i] = &apipb.TaskSlice{
			WaitForCapacity: sliceWaitForCapacity,
			ExpirationSecs:  int32(sec - lastExp),
			Properties:      prop,
		}
		// dims[i] should be added into all previous non-expired task slices.
		for j := 0; j <= i; j++ {
			taskSlices[j].Properties.Dimensions = append(taskSlices[j].Properties.Dimensions, dims[int64(sec)]...)
		}
		lastExp = sec
	}

	// Tweak expiration on the baseSlice, which is the last slice.
	exp := max(int(baseSlice.ExpirationSecs)-lastExp, 60)
	baseSlice.ExpirationSecs = int32(exp)
	taskSlices[len(taskSlices)-1] = baseSlice

	sortDim := func(strPairs []*apipb.StringPair) {
		sort.Slice(strPairs, func(i, j int) bool {
			if strPairs[i].Key == strPairs[j].Key {
				return strPairs[i].Value < strPairs[j].Value
			}
			return strPairs[i].Key < strPairs[j].Key
		})
	}
	// sort dimensions in each task slice.
	for _, t := range taskSlices {
		sortDim(t.Properties.Dimensions)
	}
	return taskSlices, nil
}

// computeTaskSliceCaches computes the task slice caches.
func computeTaskSliceCaches(build *model.Build) []*apipb.CacheEntry {
	infra := build.Proto.Infra
	caches := make([]*apipb.CacheEntry, 0, len(infra.Swarming.GetCaches())+2)
	for _, c := range build.Proto.Infra.Swarming.GetCaches() {
		caches = append(caches, &apipb.CacheEntry{
			Name: c.Name,
			Path: filepath.Join(cacheDir, c.Path),
		})
	}
	if cipdClientCache := infra.Buildbucket.GetAgent().GetCipdClientCache(); cipdClientCache != nil {
		caches = append(caches, &apipb.CacheEntry{
			Name: cipdClientCache.Name,
			Path: filepath.Join(cacheDir, cipdClientCache.Path),
		})
	}
	if cipdPackagesCache := infra.Buildbucket.GetAgent().GetCipdPackagesCache(); cipdPackagesCache != nil {
		caches = append(caches, &apipb.CacheEntry{
			Name: cipdPackagesCache.Name,
			Path: filepath.Join(cacheDir, cipdPackagesCache.Path),
		})
	}

	return caches
}

// computeCipdInput returns swarming task CIPD input.
// Note: this function only considers v2 bbagent builds.
// The build.Proto.Infra.Buildbucket.Agent.Source must be set.
func computeCipdInput(build *model.Build) *apipb.CipdInput {
	return &apipb.CipdInput{
		Packages: []*apipb.CipdPackage{{
			PackageName: build.Proto.GetInfra().GetBuildbucket().GetAgent().GetSource().GetCipd().GetPackage(),
			Version:     build.Proto.GetInfra().GetBuildbucket().GetAgent().GetSource().GetCipd().GetVersion(),
			Path:        ".",
		}},
	}
}

// computeEnvPrefixes returns env_prefixes key in swarming properties.
// Note: this function only considers v2 bbagent builds.
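// The returned pairs are sorted by key so the result is deterministic.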
func computeEnvPrefixes(build *model.Build) []*apipb.StringListPair {
	prefixesMap := map[string][]string{}
	for _, c := range build.Proto.GetInfra().GetSwarming().GetCaches() {
		if c.EnvVar != "" {
			if _, ok := prefixesMap[c.EnvVar]; !ok {
				prefixesMap[c.EnvVar] = []string{}
			}
			prefixesMap[c.EnvVar] = append(prefixesMap[c.EnvVar], filepath.Join(cacheDir, c.Path))
		}
	}
	var keys []string
	for key := range prefixesMap {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	prefixes := make([]*apipb.StringListPair, len(keys))
	for i, key := range keys {
		prefixes[i] = &apipb.StringListPair{
			Key:   key,
			Value: prefixesMap[key],
		}
	}
	return prefixes
}

// computeCommand computes the command for bbagent.
func computeCommand(build *model.Build) []string {
	if strings.Contains(build.ExperimentsString(), buildbucket.ExperimentBBAgentGetBuild) {
		return []string{
			"bbagent${EXECUTABLE_SUFFIX}",
			"-host",
			build.Proto.GetInfra().GetBuildbucket().GetHostname(),
			"-build-id",
			strconv.FormatInt(build.ID, 10),
		}
	}

	return []string{
		"bbagent${EXECUTABLE_SUFFIX}",
		bbinput.Encode(&pb.BBAgentArgs{
			Build:                  build.Proto,
			CacheDir:               build.Proto.GetInfra().GetBbagent().GetCacheDir(),
			KnownPublicGerritHosts: build.Proto.GetInfra().GetBuildbucket().GetKnownPublicGerritHosts(),
			PayloadPath:            build.Proto.GetInfra().GetBbagent().GetPayloadPath(),
		}),
	}
}

func max(a, b int) int {
	if a > b {
		return a
	}
	return b
}

// deepCopy deep copies src to dst using json marshaling for non-proto messages.
func deepCopy(src, dst any) error {
	srcBytes, err := json.Marshal(src)
	if err != nil {
		return err
	}
	return json.Unmarshal(srcBytes, dst)
}