go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/tasks/swarming_test.go (about) 1 // Copyright 2022 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tasks 16 17 import ( 18 "bytes" 19 "context" 20 "encoding/base64" 21 "encoding/json" 22 "fmt" 23 "io" 24 "path/filepath" 25 "testing" 26 "time" 27 28 "github.com/golang/mock/gomock" 29 "google.golang.org/api/googleapi" 30 "google.golang.org/protobuf/types/known/durationpb" 31 "google.golang.org/protobuf/types/known/timestamppb" 32 33 "go.chromium.org/luci/common/clock/testclock" 34 "go.chromium.org/luci/common/retry/transient" 35 "go.chromium.org/luci/common/tsmon" 36 "go.chromium.org/luci/gae/filter/txndefer" 37 "go.chromium.org/luci/gae/impl/memory" 38 "go.chromium.org/luci/gae/service/datastore" 39 "go.chromium.org/luci/server/caching" 40 "go.chromium.org/luci/server/caching/cachingtest" 41 "go.chromium.org/luci/server/secrets" 42 "go.chromium.org/luci/server/secrets/testsecrets" 43 "go.chromium.org/luci/server/tq" 44 apipb "go.chromium.org/luci/swarming/proto/api_v2" 45 46 "go.chromium.org/luci/buildbucket/appengine/internal/clients" 47 "go.chromium.org/luci/buildbucket/appengine/internal/metrics" 48 "go.chromium.org/luci/buildbucket/appengine/model" 49 taskdefs "go.chromium.org/luci/buildbucket/appengine/tasks/defs" 50 "go.chromium.org/luci/buildbucket/cmd/bbagent/bbinput" 51 pb "go.chromium.org/luci/buildbucket/proto" 52 "go.chromium.org/luci/buildbucket/protoutil" 53 54 . "github.com/smartystreets/goconvey/convey" 55 . "go.chromium.org/luci/common/testing/assertions" 56 ) 57 58 func TestTaskDef(t *testing.T) { 59 Convey("compute task slice", t, func() { 60 b := &model.Build{ 61 ID: 123, 62 Proto: &pb.Build{ 63 Id: 123, 64 SchedulingTimeout: &durationpb.Duration{ 65 Seconds: 3600, 66 }, 67 ExecutionTimeout: &durationpb.Duration{ 68 Seconds: 4800, 69 }, 70 GracePeriod: &durationpb.Duration{ 71 Seconds: 60, 72 }, 73 Infra: &pb.BuildInfra{ 74 Swarming: &pb.BuildInfra_Swarming{}, 75 Bbagent: &pb.BuildInfra_BBAgent{ 76 CacheDir: "cache", 77 PayloadPath: "kitchen-checkout", 78 }, 79 Buildbucket: &pb.BuildInfra_Buildbucket{ 80 Agent: &pb.BuildInfra_Buildbucket_Agent{ 81 Source: &pb.BuildInfra_Buildbucket_Agent_Source{ 82 DataType: &pb.BuildInfra_Buildbucket_Agent_Source_Cipd{ 83 Cipd: &pb.BuildInfra_Buildbucket_Agent_Source_CIPD{ 84 Package: "infra/tools/luci/bbagent/${platform}", 85 Version: "canary-version", 86 Server: "cipd server", 87 }, 88 }, 89 }, 90 CipdClientCache: &pb.CacheEntry{ 91 Name: "cipd_client_hash", 92 Path: "cipd_client", 93 }, 94 CipdPackagesCache: &pb.CacheEntry{ 95 Name: "cipd_cache_hash", 96 Path: "cipd_cache", 97 }, 98 }, 99 }, 100 }, 101 }, 102 } 103 Convey("only base slice", func() { 104 b.Proto.Infra.Swarming = &pb.BuildInfra_Swarming{ 105 Caches: []*pb.BuildInfra_Swarming_CacheEntry{ 106 {Name: "shared_builder_cache", Path: "builder"}, 107 }, 108 TaskDimensions: []*pb.RequestedDimension{ 109 {Key: "pool", Value: "Chrome"}, 110 }, 111 } 112 slices, err := computeTaskSlice(b) 113 So(err, ShouldBeNil) 114 So(len(slices), ShouldEqual, 1) 115 So(slices[0].Properties.Caches, ShouldResemble, []*apipb.CacheEntry{ 116 { 117 Path: filepath.Join("cache", "builder"), 118 Name: "shared_builder_cache", 119 }, 120 { 121 Path: filepath.Join("cache", "cipd_client"), 122 Name: "cipd_client_hash", 123 }, 124 { 125 Path: filepath.Join("cache", "cipd_cache"), 126 Name: "cipd_cache_hash", 127 }, 128 }) 129 So(slices[0].Properties.Dimensions, ShouldResemble, []*apipb.StringPair{ 130 { 131 Key: "pool", 132 Value: "Chrome", 133 }, 134 }) 135 }) 136 137 Convey("multiple dimensions and cache fallback", func() { 138 // Creates 4 task_slices by modifying the buildercfg in 2 ways: 139 // - Add two named caches, one expiring at 60 seconds, one at 360 seconds. 140 // - Add an optional builder dimension, expiring at 120 seconds. 141 // 142 // This ensures the combination of these features works correctly, and that 143 // multiple 'caches' dimensions can be injected. 144 b.Proto.Infra.Swarming = &pb.BuildInfra_Swarming{ 145 Caches: []*pb.BuildInfra_Swarming_CacheEntry{ 146 {Name: "shared_builder_cache", Path: "builder", WaitForWarmCache: &durationpb.Duration{Seconds: 60}}, 147 {Name: "second_cache", Path: "second", WaitForWarmCache: &durationpb.Duration{Seconds: 360}}, 148 }, 149 TaskDimensions: []*pb.RequestedDimension{ 150 {Key: "a", Value: "1", Expiration: &durationpb.Duration{Seconds: 120}}, 151 {Key: "a", Value: "2", Expiration: &durationpb.Duration{Seconds: 120}}, 152 {Key: "pool", Value: "Chrome"}, 153 }, 154 } 155 slices, err := computeTaskSlice(b) 156 So(err, ShouldBeNil) 157 So(len(slices), ShouldEqual, 4) 158 159 // All slices properties fields have the same value except dimensions. 160 for _, tSlice := range slices { 161 So(tSlice.Properties.ExecutionTimeoutSecs, ShouldEqual, 4800) 162 So(tSlice.Properties.GracePeriodSecs, ShouldEqual, 240) 163 So(tSlice.Properties.Caches, ShouldResemble, []*apipb.CacheEntry{ 164 {Path: filepath.Join("cache", "builder"), Name: "shared_builder_cache"}, 165 {Path: filepath.Join("cache", "second"), Name: "second_cache"}, 166 {Path: filepath.Join("cache", "cipd_client"), Name: "cipd_client_hash"}, 167 {Path: filepath.Join("cache", "cipd_cache"), Name: "cipd_cache_hash"}, 168 }) 169 So(tSlice.Properties.Env, ShouldResemble, []*apipb.StringPair{ 170 {Key: "BUILDBUCKET_EXPERIMENTAL", Value: "FALSE"}, 171 }) 172 } 173 174 So(slices[0].ExpirationSecs, ShouldEqual, 60) 175 // The dimensions are different. 'a' and 'caches' are injected. 176 So(slices[0].Properties.Dimensions, ShouldResemble, []*apipb.StringPair{ 177 {Key: "a", Value: "1"}, 178 {Key: "a", Value: "2"}, 179 {Key: "caches", Value: "second_cache"}, 180 {Key: "caches", Value: "shared_builder_cache"}, 181 {Key: "pool", Value: "Chrome"}, 182 }) 183 184 // 120 - 60 185 So(slices[1].ExpirationSecs, ShouldEqual, 60) 186 // The dimensions are different. 'a' and 'caches' are injected. 187 So(slices[1].Properties.Dimensions, ShouldResemble, []*apipb.StringPair{ 188 {Key: "a", Value: "1"}, 189 {Key: "a", Value: "2"}, 190 {Key: "caches", Value: "second_cache"}, 191 {Key: "pool", Value: "Chrome"}, 192 }) 193 194 // 360 - 120 195 So(slices[2].ExpirationSecs, ShouldEqual, 240) 196 // 'a' expired, one 'caches' remains. 197 So(slices[2].Properties.Dimensions, ShouldResemble, []*apipb.StringPair{ 198 {Key: "caches", Value: "second_cache"}, 199 {Key: "pool", Value: "Chrome"}, 200 }) 201 202 // 3600-360 203 So(slices[3].ExpirationSecs, ShouldEqual, 3240) 204 // # The cold fallback; the last 'caches' expired. 205 So(slices[3].Properties.Dimensions, ShouldResemble, []*apipb.StringPair{ 206 {Key: "pool", Value: "Chrome"}, 207 }) 208 }) 209 }) 210 211 Convey("compute bbagent command", t, func() { 212 b := &model.Build{ 213 ID: 123, 214 Proto: &pb.Build{ 215 Infra: &pb.BuildInfra{ 216 Buildbucket: &pb.BuildInfra_Buildbucket{ 217 Hostname: "bbhost.com", 218 }, 219 }, 220 }, 221 } 222 Convey("bbagent_getbuild experiment", func() { 223 b.Experiments = []string{"+luci.buildbucket.bbagent_getbuild"} 224 bbagentCmd := computeCommand(b) 225 So(bbagentCmd, ShouldResemble, []string{ 226 "bbagent${EXECUTABLE_SUFFIX}", 227 "-host", 228 "bbhost.com", 229 "-build-id", 230 "123", 231 }) 232 }) 233 234 Convey("no bbagent_getbuild experiment", func() { 235 b.Experiments = []string{"-luci.buildbucket.bbagent_getbuild"} 236 b.Proto.Infra.Bbagent = &pb.BuildInfra_BBAgent{ 237 CacheDir: "cache", 238 PayloadPath: "payload_path", 239 } 240 bbagentCmd := computeCommand(b) 241 expectedEncoded := bbinput.Encode(&pb.BBAgentArgs{ 242 Build: b.Proto, 243 CacheDir: "cache", 244 PayloadPath: "payload_path", 245 }) 246 So(bbagentCmd, ShouldResemble, []string{ 247 "bbagent${EXECUTABLE_SUFFIX}", 248 expectedEncoded, 249 }) 250 }) 251 }) 252 253 Convey("compute env_prefixes", t, func() { 254 b := &model.Build{ 255 ID: 123, 256 Proto: &pb.Build{ 257 Infra: &pb.BuildInfra{ 258 Swarming: &pb.BuildInfra_Swarming{}, 259 }, 260 }, 261 } 262 Convey("empty swarming cache", func() { 263 prefixes := computeEnvPrefixes(b) 264 So(prefixes, ShouldResemble, []*apipb.StringListPair{}) 265 }) 266 267 Convey("normal", func() { 268 b.Proto.Infra.Swarming.Caches = []*pb.BuildInfra_Swarming_CacheEntry{ 269 {Path: "vpython", Name: "vpython", EnvVar: "VPYTHON_VIRTUALENV_ROOT"}, 270 {Path: "abc", Name: "abc", EnvVar: "ABC"}, 271 } 272 prefixes := computeEnvPrefixes(b) 273 So(prefixes, ShouldResemble, []*apipb.StringListPair{ 274 {Key: "ABC", Value: []string{filepath.Join("cache", "abc")}}, 275 {Key: "VPYTHON_VIRTUALENV_ROOT", Value: []string{filepath.Join("cache", "vpython")}}, 276 }) 277 }) 278 }) 279 280 Convey("compute swarming new task req", t, func() { 281 ctx := memory.UseWithAppID(context.Background(), "dev~app-id") 282 ctx, _ = testclock.UseTime(ctx, time.Unix(1444945245, 0).UTC()) 283 b := &model.Build{ 284 ID: 123, 285 Project: "project", 286 BucketID: "bucket", 287 BuilderID: "builder", 288 Proto: &pb.Build{ 289 Id: 123, 290 Number: 1, 291 Builder: &pb.BuilderID{ 292 Project: "project", 293 Bucket: "bucket", 294 Builder: "builder", 295 }, 296 Infra: &pb.BuildInfra{ 297 Swarming: &pb.BuildInfra_Swarming{ 298 Priority: 20, 299 TaskServiceAccount: "abc", 300 Hostname: "swarm.com", 301 }, 302 Bbagent: &pb.BuildInfra_BBAgent{}, 303 Buildbucket: &pb.BuildInfra_Buildbucket{ 304 Hostname: "app-id.appspot.com", 305 Agent: &pb.BuildInfra_Buildbucket_Agent{ 306 Source: &pb.BuildInfra_Buildbucket_Agent_Source{ 307 DataType: &pb.BuildInfra_Buildbucket_Agent_Source_Cipd{ 308 Cipd: &pb.BuildInfra_Buildbucket_Agent_Source_CIPD{ 309 Package: "infra/tools/luci/bbagent/${platform}", 310 Version: "canary-version", 311 Server: "cipd server", 312 }, 313 }, 314 }, 315 }, 316 }, 317 }, 318 }, 319 } 320 321 req, err := computeSwarmingNewTaskReq(ctx, b) 322 // Strip out TaskSlices. It has been tested in other tests 323 req.TaskSlices = []*apipb.TaskSlice(nil) 324 So(err, ShouldBeNil) 325 ud, _ := json.Marshal(&userdata{ 326 BuildID: 123, 327 CreatedTS: 1444945245000000, 328 SwarmingHostname: "swarm.com", 329 }) 330 expected := &apipb.NewTaskRequest{ 331 RequestUuid: "203882df-ce4b-5012-b32a-2c1d29c321a7", 332 Name: "bb-123-builder-1", 333 Realm: "project:bucket", 334 Tags: []string{"buildbucket_bucket:bucket", "buildbucket_build_id:123", "buildbucket_hostname:app-id.appspot.com", "buildbucket_template_canary:0", "luci_project:project"}, 335 Priority: int32(20), 336 PubsubTopic: "projects/app-id/topics/swarming-go", 337 PubsubUserdata: string(ud), 338 ServiceAccount: "abc", 339 PoolTaskTemplate: apipb.NewTaskRequest_SKIP, 340 } 341 So(req, ShouldResemble, expected) 342 }) 343 } 344 345 func TestSyncBuild(t *testing.T) { 346 t.Parallel() 347 Convey("SyncBuild", t, func() { 348 ctl := gomock.NewController(t) 349 defer ctl.Finish() 350 now := testclock.TestRecentTimeUTC 351 mockSwarm := clients.NewMockSwarmingClient(ctl) 352 ctx, _ := testclock.UseTime(context.Background(), now) 353 ctx = context.WithValue(ctx, &clients.MockSwarmingClientKey, mockSwarm) 354 ctx = memory.UseWithAppID(ctx, "dev~app-id") 355 ctx = txndefer.FilterRDS(ctx) 356 ctx = metrics.WithServiceInfo(ctx, "svc", "job", "ins") 357 ctx = metrics.WithBuilder(ctx, "proj", "bucket", "builder") 358 datastore.GetTestable(ctx).AutoIndex(true) 359 datastore.GetTestable(ctx).Consistent(true) 360 ctx, sch := tq.TestingContext(ctx, nil) 361 store := &testsecrets.Store{ 362 Secrets: map[string]secrets.Secret{ 363 "key": {Active: []byte("stuff")}, 364 }, 365 } 366 ctx = secrets.Use(ctx, store) 367 ctx = secrets.GeneratePrimaryTinkAEADForTest(ctx) 368 ctx, _ = tsmon.WithDummyInMemory(ctx) 369 metricsStore := tsmon.Store(ctx) 370 371 b := &model.Build{ 372 ID: 123, 373 Proto: &pb.Build{ 374 Id: 123, 375 Status: pb.Status_SCHEDULED, 376 CreateTime: ×tamppb.Timestamp{Seconds: now.UnixNano() / 1000000000}, 377 SchedulingTimeout: &durationpb.Duration{ 378 Seconds: 3600, 379 }, 380 ExecutionTimeout: &durationpb.Duration{ 381 Seconds: 4800, 382 }, 383 GracePeriod: &durationpb.Duration{ 384 Seconds: 60, 385 }, 386 Builder: &pb.BuilderID{ 387 Project: "proj", 388 Bucket: "bucket", 389 Builder: "builder", 390 }, 391 }, 392 } 393 inf := &model.BuildInfra{ 394 ID: 1, 395 Build: datastore.KeyForObj(ctx, &model.Build{ID: 123}), 396 Proto: &pb.BuildInfra{ 397 Swarming: &pb.BuildInfra_Swarming{ 398 Hostname: "swarm", 399 Caches: []*pb.BuildInfra_Swarming_CacheEntry{ 400 {Name: "shared_builder_cache", Path: "builder", WaitForWarmCache: &durationpb.Duration{Seconds: 60}}, 401 {Name: "second_cache", Path: "second", WaitForWarmCache: &durationpb.Duration{Seconds: 360}}, 402 }, 403 TaskDimensions: []*pb.RequestedDimension{ 404 {Key: "a", Value: "1", Expiration: &durationpb.Duration{Seconds: 120}}, 405 {Key: "a", Value: "2", Expiration: &durationpb.Duration{Seconds: 120}}, 406 {Key: "pool", Value: "Chrome"}, 407 }, 408 }, 409 Bbagent: &pb.BuildInfra_BBAgent{ 410 CacheDir: "cache", 411 PayloadPath: "kitchen-checkout", 412 }, 413 Buildbucket: &pb.BuildInfra_Buildbucket{ 414 Agent: &pb.BuildInfra_Buildbucket_Agent{ 415 Source: &pb.BuildInfra_Buildbucket_Agent_Source{ 416 DataType: &pb.BuildInfra_Buildbucket_Agent_Source_Cipd{ 417 Cipd: &pb.BuildInfra_Buildbucket_Agent_Source_CIPD{ 418 Package: "infra/tools/luci/bbagent/${platform}", 419 Version: "canary-version", 420 Server: "cipd server", 421 }, 422 }, 423 }, 424 }, 425 }, 426 }, 427 } 428 bs := &model.BuildStatus{ 429 Build: datastore.KeyForObj(ctx, b), 430 Status: b.Proto.Status, 431 } 432 So(datastore.Put(ctx, b, inf, bs), ShouldBeNil) 433 Convey("swarming-build-create", func() { 434 435 Convey("build not found", func() { 436 err := SyncBuild(ctx, 789, 0) 437 So(err, ShouldErrLike, "build 789 or buildInfra not found") 438 }) 439 440 Convey("build too old", func() { 441 So(datastore.Put(ctx, &model.Build{ 442 ID: 111, 443 CreateTime: now.AddDate(0, 0, -3), 444 Proto: &pb.Build{ 445 Builder: &pb.BuilderID{}, 446 }, 447 }), ShouldBeNil) 448 449 So(datastore.Put(ctx, &model.BuildInfra{ 450 ID: 1, 451 Build: datastore.KeyForObj(ctx, &model.Build{ID: 111}), 452 }), ShouldBeNil) 453 err := SyncBuild(ctx, 111, 0) 454 So(err, ShouldBeNil) 455 So(sch.Tasks(), ShouldHaveLength, 0) 456 }) 457 458 Convey("build ended", func() { 459 So(datastore.Put(ctx, &model.Build{ 460 ID: 111, 461 Status: pb.Status_SUCCESS, 462 Proto: &pb.Build{ 463 Builder: &pb.BuilderID{}, 464 }, 465 }), ShouldBeNil) 466 So(datastore.Put(ctx, &model.BuildInfra{ 467 ID: 1, 468 Build: datastore.KeyForObj(ctx, &model.Build{ID: 111}), 469 }), ShouldBeNil) 470 err := SyncBuild(ctx, 111, 0) 471 So(err, ShouldBeNil) 472 So(sch.Tasks(), ShouldHaveLength, 0) 473 }) 474 475 Convey("create swarming success", func() { 476 mockSwarm.EXPECT().CreateTask(gomock.Any(), gomock.Any()).Return(&apipb.TaskRequestMetadataResponse{ 477 TaskId: "task123", 478 }, nil) 479 err := SyncBuild(ctx, 123, 0) 480 So(err, ShouldBeNil) 481 updatedBuild := &model.Build{ID: 123} 482 updatedInfra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, updatedBuild)} 483 So(datastore.Get(ctx, updatedBuild), ShouldBeNil) 484 So(datastore.Get(ctx, updatedInfra), ShouldBeNil) 485 So(updatedBuild.UpdateToken, ShouldNotBeEmpty) 486 So(updatedInfra.Proto.Swarming.TaskId, ShouldEqual, "task123") 487 So(sch.Tasks(), ShouldHaveLength, 1) 488 }) 489 490 Convey("create swarming http 400 err", func() { 491 mockSwarm.EXPECT().CreateTask(gomock.Any(), gomock.Any()).Return(nil, &googleapi.Error{Code: 400}) 492 err := SyncBuild(ctx, 123, 0) 493 So(err, ShouldBeNil) 494 failedBuild := &model.Build{ID: 123} 495 bldStatus := &model.BuildStatus{ 496 Build: datastore.KeyForObj(ctx, &model.Build{ID: 123}), 497 } 498 So(datastore.Get(ctx, failedBuild, bldStatus), ShouldBeNil) 499 So(failedBuild.Status, ShouldEqual, pb.Status_INFRA_FAILURE) 500 So(failedBuild.Proto.SummaryMarkdown, ShouldContainSubstring, "failed to create a swarming task: googleapi: got HTTP response code 400") 501 So(sch.Tasks(), ShouldHaveLength, 4) 502 So(bldStatus.Status, ShouldEqual, pb.Status_INFRA_FAILURE) 503 }) 504 505 Convey("create swarming http 500 err", func() { 506 mockSwarm.EXPECT().CreateTask(gomock.Any(), gomock.Any()).Return(nil, &googleapi.Error{Code: 500}) 507 err := SyncBuild(ctx, 123, 0) 508 So(err, ShouldErrLike, "failed to create a swarming task") 509 So(transient.Tag.In(err), ShouldBeTrue) 510 bld := &model.Build{ID: 123} 511 So(datastore.Get(ctx, bld), ShouldBeNil) 512 So(bld.Status, ShouldEqual, pb.Status_SCHEDULED) 513 }) 514 515 Convey("create swarming http 500 err give up", func() { 516 ctx1, _ := testclock.UseTime(ctx, now.Add(swarmingCreateTaskGiveUpTimeout)) 517 mockSwarm.EXPECT().CreateTask(gomock.Any(), gomock.Any()).Return(nil, &googleapi.Error{Code: 500}) 518 err := SyncBuild(ctx1, 123, 0) 519 So(err, ShouldBeNil) 520 failedBuild := &model.Build{ID: 123} 521 So(datastore.Get(ctx, failedBuild), ShouldBeNil) 522 So(failedBuild.Status, ShouldEqual, pb.Status_INFRA_FAILURE) 523 So(failedBuild.Proto.SummaryMarkdown, ShouldContainSubstring, "failed to create a swarming task: googleapi: got HTTP response code 500") 524 So(sch.Tasks(), ShouldHaveLength, 4) 525 }) 526 527 Convey("swarming task creation success but update build fail", func() { 528 mockSwarm.EXPECT().CreateTask(gomock.Any(), gomock.Any()).DoAndReturn(func(ctx context.Context, req *apipb.NewTaskRequest) (*apipb.TaskRequestMetadataResponse, error) { 529 // Hack to make the build update fail when trying to update build with the new task ID. 530 inf.Proto.Swarming.TaskId = "old task ID" 531 So(datastore.Put(ctx, inf), ShouldBeNil) 532 return &apipb.TaskRequestMetadataResponse{TaskId: "new task ID"}, nil 533 }) 534 535 err := SyncBuild(ctx, 123, 0) 536 So(err, ShouldErrLike, "failed to update build 123: build already has a task old task ID") 537 currentInfra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, &model.Build{ 538 ID: 123, 539 })} 540 So(datastore.Get(ctx, currentInfra), ShouldBeNil) 541 So(currentInfra.Proto.Swarming.TaskId, ShouldEqual, "old task ID") 542 So(sch.Tasks(), ShouldHaveLength, 0) 543 }) 544 }) 545 546 Convey("swarming sync", func() { 547 inf.Proto.Swarming.TaskId = "task_id" 548 So(datastore.Put(ctx, inf), ShouldBeNil) 549 550 Convey("non-existing task ID", func() { 551 mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Return(nil, &googleapi.Error{Code: 404}) 552 err := syncBuildWithTaskResult(ctx, 123, "task_id", mockSwarm) 553 So(err, ShouldBeNil) 554 failedBuild := &model.Build{ID: 123} 555 So(datastore.Get(ctx, failedBuild), ShouldBeNil) 556 So(failedBuild.Status, ShouldEqual, pb.Status_INFRA_FAILURE) 557 So(failedBuild.Proto.SummaryMarkdown, ShouldContainSubstring, "invalid swarming task task_id") 558 }) 559 560 Convey("swarming server 500", func() { 561 mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Return(nil, &googleapi.Error{Code: 500}) 562 err := syncBuildWithTaskResult(ctx, 123, "task_id", mockSwarm) 563 So(transient.Tag.In(err), ShouldBeTrue) 564 }) 565 566 Convey("empty task result", func() { 567 mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Return(nil, nil) 568 err := syncBuildWithTaskResult(ctx, 123, "task_id", mockSwarm) 569 So(err, ShouldBeNil) 570 failedBuild := &model.Build{ID: 123} 571 So(datastore.Get(ctx, failedBuild), ShouldBeNil) 572 So(failedBuild.Status, ShouldEqual, pb.Status_INFRA_FAILURE) 573 So(failedBuild.Proto.SummaryMarkdown, ShouldContainSubstring, "Swarming task task_id unexpectedly disappeared") 574 }) 575 576 Convey("invalid task result state", func() { 577 // syncBuildWithTaskResult should return Fatal error 578 mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Return(&apipb.TaskResultResponse{State: apipb.TaskState_INVALID}, nil) 579 err := syncBuildWithTaskResult(ctx, 123, "task_id", mockSwarm) 580 So(tq.Fatal.In(err), ShouldBeTrue) 581 bb := &model.Build{ID: 123} 582 So(datastore.Get(ctx, bb), ShouldBeNil) 583 So(bb.Status, ShouldEqual, pb.Status_SCHEDULED) // build status should not been impacted 584 585 // The swarming-build-sync flow shouldn't bubble up the Fatal error. 586 // It should ignore and enqueue the next generation of sync task. 587 mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Return(&apipb.TaskResultResponse{State: apipb.TaskState_INVALID}, nil) 588 err = SyncBuild(ctx, 123, 1) 589 So(err, ShouldBeNil) 590 bb = &model.Build{ID: 123} 591 So(datastore.Get(ctx, bb), ShouldBeNil) 592 So(bb.Status, ShouldEqual, pb.Status_SCHEDULED) 593 So(sch.Tasks(), ShouldHaveLength, 1) 594 So(sch.Tasks().Payloads()[0], ShouldResembleProto, &taskdefs.SyncSwarmingBuildTask{ 595 BuildId: 123, 596 Generation: 2, 597 }) 598 }) 599 600 Convey("cancel incomplete steps for an ended build", func() { 601 mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Return(&apipb.TaskResultResponse{ 602 State: apipb.TaskState_BOT_DIED, 603 StartedTs: ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}, 604 CompletedTs: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 605 }, nil) 606 steps := model.BuildSteps{ 607 ID: 1, 608 Build: datastore.KeyForObj(ctx, &model.Build{ID: 123}), 609 } 610 So(steps.FromProto([]*pb.Step{ 611 {Name: "step1", Status: pb.Status_SUCCESS}, 612 {Name: "step2", Status: pb.Status_STARTED}, 613 }), ShouldBeNil) 614 So(datastore.Put(ctx, &steps), ShouldBeNil) 615 616 err := syncBuildWithTaskResult(ctx, 123, "task_id", mockSwarm) 617 So(err, ShouldBeNil) 618 failedBuild := &model.Build{ID: 123} 619 So(datastore.Get(ctx, failedBuild), ShouldBeNil) 620 So(failedBuild.Status, ShouldEqual, pb.Status_INFRA_FAILURE) 621 allSteps := &model.BuildSteps{ 622 ID: 1, 623 Build: datastore.KeyForObj(ctx, &model.Build{ID: 123}), 624 } 625 So(datastore.Get(ctx, allSteps), ShouldBeNil) 626 mSteps, err := allSteps.ToProto(ctx) 627 So(err, ShouldBeNil) 628 So(mSteps, ShouldResembleProto, []*pb.Step{ 629 { 630 Name: "step1", 631 Status: pb.Status_SUCCESS, 632 }, 633 { 634 Name: "step2", 635 Status: pb.Status_CANCELED, 636 EndTime: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 637 }, 638 }) 639 }) 640 641 Convey("build has output status set to FAILURE", func() { 642 fakeTaskResult := &apipb.TaskResultResponse{ 643 State: apipb.TaskState_COMPLETED, 644 Failure: true, 645 } 646 mockSwarm.EXPECT().GetTaskResult(ctx, "task567").Return(fakeTaskResult, nil) 647 b := &model.Build{ 648 ID: 567, 649 Proto: &pb.Build{ 650 Builder: &pb.BuilderID{ 651 Project: "proj", 652 Bucket: "bucket", 653 Builder: "builder", 654 }, 655 CreateTime: ×tamppb.Timestamp{Seconds: now.UnixNano() / 1000000000}, 656 Status: pb.Status_STARTED, 657 Output: &pb.Build_Output{ 658 Status: pb.Status_FAILURE, 659 }, 660 }, 661 } 662 inf := &model.BuildInfra{ 663 ID: 1, 664 Build: datastore.KeyForObj(ctx, &model.Build{ID: 567}), 665 Proto: &pb.BuildInfra{ 666 Swarming: &pb.BuildInfra_Swarming{ 667 Hostname: "swarm", 668 TaskId: "task567", 669 }, 670 }, 671 } 672 bs := &model.BuildStatus{ 673 Build: datastore.KeyForObj(ctx, b), 674 Status: b.Proto.Status, 675 } 676 So(datastore.Put(ctx, b, inf, bs), ShouldBeNil) 677 err := SyncBuild(ctx, 567, 1) 678 So(err, ShouldBeNil) 679 So(datastore.Get(ctx, b), ShouldBeNil) 680 So(b.Proto.Status, ShouldEqual, pb.Status_FAILURE) 681 }) 682 683 Convey("build has output status set to CANCELED", func() { 684 fakeTaskResult := &apipb.TaskResultResponse{ 685 State: apipb.TaskState_COMPLETED, 686 Failure: true, 687 } 688 mockSwarm.EXPECT().GetTaskResult(ctx, "task567").Return(fakeTaskResult, nil) 689 b := &model.Build{ 690 ID: 567, 691 Proto: &pb.Build{ 692 Builder: &pb.BuilderID{ 693 Project: "proj", 694 Bucket: "bucket", 695 Builder: "builder", 696 }, 697 CreateTime: ×tamppb.Timestamp{Seconds: now.UnixNano() / 1000000000}, 698 Status: pb.Status_STARTED, 699 Output: &pb.Build_Output{ 700 Status: pb.Status_CANCELED, 701 }, 702 }, 703 } 704 inf := &model.BuildInfra{ 705 ID: 1, 706 Build: datastore.KeyForObj(ctx, &model.Build{ID: 567}), 707 Proto: &pb.BuildInfra{ 708 Swarming: &pb.BuildInfra_Swarming{ 709 Hostname: "swarm", 710 TaskId: "task567", 711 }, 712 }, 713 } 714 bs := &model.BuildStatus{ 715 Build: datastore.KeyForObj(ctx, b), 716 Status: b.Proto.Status, 717 } 718 So(datastore.Put(ctx, b, inf, bs), ShouldBeNil) 719 err := SyncBuild(ctx, 567, 1) 720 So(err, ShouldBeNil) 721 So(datastore.Get(ctx, b), ShouldBeNil) 722 So(b.Proto.Status, ShouldEqual, pb.Status_CANCELED) 723 }) 724 725 Convey("build has output status set to CANCELED while swarming task succeeded", func() { 726 fakeTaskResult := &apipb.TaskResultResponse{ 727 State: apipb.TaskState_COMPLETED, 728 } 729 mockSwarm.EXPECT().GetTaskResult(ctx, "task567").Return(fakeTaskResult, nil) 730 b := &model.Build{ 731 ID: 567, 732 Proto: &pb.Build{ 733 Builder: &pb.BuilderID{ 734 Project: "proj", 735 Bucket: "bucket", 736 Builder: "builder", 737 }, 738 CreateTime: ×tamppb.Timestamp{Seconds: now.UnixNano() / 1000000000}, 739 Status: pb.Status_STARTED, 740 Output: &pb.Build_Output{ 741 Status: pb.Status_CANCELED, 742 }, 743 }, 744 } 745 inf := &model.BuildInfra{ 746 ID: 1, 747 Build: datastore.KeyForObj(ctx, &model.Build{ID: 567}), 748 Proto: &pb.BuildInfra{ 749 Swarming: &pb.BuildInfra_Swarming{ 750 Hostname: "swarm", 751 TaskId: "task567", 752 }, 753 }, 754 } 755 bs := &model.BuildStatus{ 756 Build: datastore.KeyForObj(ctx, b), 757 Status: b.Proto.Status, 758 } 759 So(datastore.Put(ctx, b, inf, bs), ShouldBeNil) 760 err := SyncBuild(ctx, 567, 1) 761 So(err, ShouldBeNil) 762 So(datastore.Get(ctx, b), ShouldBeNil) 763 So(b.Proto.Status, ShouldEqual, pb.Status_CANCELED) 764 }) 765 766 Convey("task has no resource", func() { 767 fakeTaskResult := &apipb.TaskResultResponse{ 768 State: apipb.TaskState_NO_RESOURCE, 769 AbandonedTs: ×tamppb.Timestamp{Seconds: now.UnixNano() / 1000000000}, 770 } 771 mockSwarm.EXPECT().GetTaskResult(ctx, "task567").Return(fakeTaskResult, nil) 772 b := &model.Build{ 773 ID: 567, 774 Proto: &pb.Build{ 775 Builder: &pb.BuilderID{ 776 Project: "proj", 777 Bucket: "bucket", 778 Builder: "builder", 779 }, 780 CreateTime: ×tamppb.Timestamp{Seconds: now.UnixNano() / 1000000000}, 781 Status: pb.Status_SCHEDULED, 782 }, 783 } 784 inf := &model.BuildInfra{ 785 ID: 1, 786 Build: datastore.KeyForObj(ctx, &model.Build{ID: 567}), 787 Proto: &pb.BuildInfra{ 788 Swarming: &pb.BuildInfra_Swarming{ 789 Hostname: "swarm", 790 TaskId: "task567", 791 }, 792 }, 793 } 794 bs := &model.BuildStatus{ 795 Build: datastore.KeyForObj(ctx, b), 796 Status: b.Proto.Status, 797 } 798 So(datastore.Put(ctx, b, inf, bs), ShouldBeNil) 799 err := SyncBuild(ctx, 567, 1) 800 So(err, ShouldBeNil) 801 So(datastore.Get(ctx, b), ShouldBeNil) 802 So(b.Proto.Status, ShouldEqual, pb.Status_INFRA_FAILURE) 803 So(b.Proto.StartTime, ShouldBeNil) 804 }) 805 806 var cases = []struct { 807 fakeTaskResult *apipb.TaskResultResponse 808 expected *expectedBuildFields 809 }{ 810 { 811 fakeTaskResult: &apipb.TaskResultResponse{State: apipb.TaskState_PENDING}, 812 expected: &expectedBuildFields{ 813 status: pb.Status_SCHEDULED, 814 }, 815 }, 816 { 817 fakeTaskResult: &apipb.TaskResultResponse{ 818 State: apipb.TaskState_RUNNING, 819 StartedTs: ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}, 820 }, 821 expected: &expectedBuildFields{ 822 status: pb.Status_STARTED, 823 startT: ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}, 824 }, 825 }, 826 { 827 fakeTaskResult: &apipb.TaskResultResponse{ 828 State: apipb.TaskState_COMPLETED, 829 StartedTs: ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}, 830 CompletedTs: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 831 }, 832 expected: &expectedBuildFields{ 833 status: pb.Status_SUCCESS, 834 startT: ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}, 835 endT: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 836 }, 837 }, 838 { 839 fakeTaskResult: &apipb.TaskResultResponse{ 840 State: apipb.TaskState_COMPLETED, 841 StartedTs: ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}, 842 CompletedTs: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 843 BotDimensions: []*apipb.StringListPair{ 844 { 845 Key: "os", 846 Value: []string{"Ubuntu", "Trusty"}, 847 }, 848 { 849 Key: "pool", 850 Value: []string{"luci.chromium.try"}, 851 }, 852 { 853 Key: "id", 854 Value: []string{"bot1"}, 855 }, 856 { 857 Key: "empty", 858 }, 859 }, 860 }, 861 expected: &expectedBuildFields{ 862 status: pb.Status_SUCCESS, 863 startT: ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}, 864 endT: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 865 botDimensions: []*pb.StringPair{ 866 {Key: "id", Value: "bot1"}, 867 {Key: "os", Value: "Trusty"}, 868 {Key: "os", Value: "Ubuntu"}, 869 {Key: "pool", Value: "luci.chromium.try"}, 870 }, 871 }, 872 }, 873 { 874 fakeTaskResult: &apipb.TaskResultResponse{ 875 State: apipb.TaskState_COMPLETED, 876 Failure: true, 877 StartedTs: ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}, 878 CompletedTs: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 879 }, 880 expected: &expectedBuildFields{ 881 status: pb.Status_INFRA_FAILURE, 882 startT: ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}, 883 endT: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 884 }, 885 }, 886 { 887 fakeTaskResult: &apipb.TaskResultResponse{ 888 State: apipb.TaskState_BOT_DIED, 889 StartedTs: ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}, 890 CompletedTs: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 891 }, 892 expected: &expectedBuildFields{ 893 status: pb.Status_INFRA_FAILURE, 894 startT: ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}, 895 endT: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 896 }, 897 }, 898 { 899 fakeTaskResult: &apipb.TaskResultResponse{ 900 State: apipb.TaskState_TIMED_OUT, 901 StartedTs: ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}, 902 CompletedTs: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 903 }, 904 expected: &expectedBuildFields{ 905 status: pb.Status_INFRA_FAILURE, 906 isTimeOut: true, 907 startT: ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}, 908 endT: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 909 }, 910 }, 911 { 912 fakeTaskResult: &apipb.TaskResultResponse{ 913 State: apipb.TaskState_EXPIRED, 914 AbandonedTs: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 915 }, 916 expected: &expectedBuildFields{ 917 status: pb.Status_INFRA_FAILURE, 918 isResourceExhaustion: true, 919 isTimeOut: true, 920 endT: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 921 }, 922 }, 923 { 924 fakeTaskResult: &apipb.TaskResultResponse{ 925 State: apipb.TaskState_KILLED, 926 AbandonedTs: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 927 }, 928 expected: &expectedBuildFields{ 929 status: pb.Status_CANCELED, 930 endT: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 931 }, 932 }, 933 { 934 fakeTaskResult: &apipb.TaskResultResponse{ 935 State: apipb.TaskState_NO_RESOURCE, 936 AbandonedTs: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 937 }, 938 expected: &expectedBuildFields{ 939 status: pb.Status_INFRA_FAILURE, 940 isResourceExhaustion: true, 941 endT: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 942 }, 943 }, 944 { 945 fakeTaskResult: &apipb.TaskResultResponse{ 946 State: apipb.TaskState_NO_RESOURCE, 947 AbandonedTs: ×tamppb.Timestamp{Seconds: now.UnixNano() / 1000000000}, 948 }, 949 expected: &expectedBuildFields{ 950 status: pb.Status_INFRA_FAILURE, 951 isResourceExhaustion: true, 952 endT: ×tamppb.Timestamp{Seconds: now.UnixNano() / 1000000000}, 953 }, 954 }, 955 } 956 for i, tCase := range cases { 957 Convey(fmt.Sprintf("test %d - task %s", i, tCase.fakeTaskResult.State), func() { 958 mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Return(tCase.fakeTaskResult, nil) 959 err := SyncBuild(ctx, 123, 1) 960 So(err, ShouldBeNil) 961 syncedBuild := &model.Build{ID: 123} 962 So(datastore.Get(ctx, syncedBuild), ShouldBeNil) 963 So(syncedBuild.Status, ShouldEqual, tCase.expected.status) 964 if tCase.expected.isResourceExhaustion { 965 So(syncedBuild.Proto.StatusDetails.ResourceExhaustion, ShouldResembleProto, &pb.StatusDetails_ResourceExhaustion{}) 966 } else { 967 So(syncedBuild.Proto.StatusDetails.GetResourceExhaustion(), ShouldBeNil) 968 } 969 if tCase.expected.isTimeOut { 970 So(syncedBuild.Proto.StatusDetails.Timeout, ShouldResembleProto, &pb.StatusDetails_Timeout{}) 971 } else { 972 So(syncedBuild.Proto.StatusDetails.GetTimeout(), ShouldBeNil) 973 } 974 if tCase.expected.startT != nil { 975 So(syncedBuild.Proto.StartTime, ShouldResembleProto, tCase.expected.startT) 976 } 977 if tCase.expected.endT != nil { 978 So(syncedBuild.Proto.EndTime, ShouldResembleProto, tCase.expected.endT) 979 } 980 if tCase.expected.botDimensions != nil { 981 syncedInfra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, syncedBuild)} 982 So(datastore.Get(ctx, syncedInfra), ShouldBeNil) 983 So(syncedInfra.Proto.Swarming.BotDimensions, ShouldResembleProto, tCase.expected.botDimensions) 984 } 985 if protoutil.IsEnded(syncedBuild.Status) { 986 // FinalizeResultDB, ExportBigQuery, NotifyPubSub, NotifyPubSubGoProxy and a continuation sync task. 987 So(sch.Tasks(), ShouldHaveLength, 5) 988 989 v2fs := []any{pb.Status_name[int32(syncedBuild.Status)], "None"} 990 So(metricsStore.Get(ctx, metrics.V2.BuildCountCompleted, time.Time{}, v2fs), ShouldEqual, 1) 991 992 } else if syncedBuild.Status == pb.Status_STARTED { 993 // NotifyPubSub, NotifyPubSubGoProxy and a continuation sync task. 994 So(sch.Tasks(), ShouldHaveLength, 3) 995 So(metricsStore.Get(ctx, metrics.V2.BuildCountStarted, time.Time{}, []any{"None"}), ShouldEqual, 1) 996 } 997 syncedBuildStatus := &model.BuildStatus{Build: datastore.KeyForObj(ctx, syncedBuild)} 998 So(datastore.Get(ctx, syncedBuildStatus), ShouldBeNil) 999 So(syncedBuildStatus.Status, ShouldEqual, syncedBuild.Proto.Status) 1000 }) 1001 } 1002 }) 1003 }) 1004 1005 } 1006 1007 func TestHandleCancelSwarmingTask(t *testing.T) { 1008 t.Parallel() 1009 Convey("HandleCancelSwarmingTask", t, func() { 1010 ctl := gomock.NewController(t) 1011 defer ctl.Finish() 1012 now := testclock.TestRecentTimeUTC 1013 mockSwarm := clients.NewMockSwarmingClient(ctl) 1014 ctx, _ := testclock.UseTime(context.Background(), now) 1015 ctx = context.WithValue(ctx, &clients.MockSwarmingClientKey, mockSwarm) 1016 ctx = memory.UseWithAppID(ctx, "dev~app-id") 1017 ctx = txndefer.FilterRDS(ctx) 1018 ctx = metrics.WithServiceInfo(ctx, "svc", "job", "ins") 1019 datastore.GetTestable(ctx).AutoIndex(true) 1020 datastore.GetTestable(ctx).Consistent(true) 1021 1022 Convey("wrong", func() { 1023 Convey("empty hostname", func() { 1024 err := HandleCancelSwarmingTask(ctx, "", "task123", "project:bucket") 1025 So(err, ShouldErrLike, "hostname is empty") 1026 So(tq.Fatal.In(err), ShouldBeTrue) 1027 }) 1028 1029 Convey("empty taskID", func() { 1030 err := HandleCancelSwarmingTask(ctx, "hostname", "", "project:bucket") 1031 So(err, ShouldErrLike, "taskID is empty") 1032 So(tq.Fatal.In(err), ShouldBeTrue) 1033 }) 1034 1035 Convey("wrong realm", func() { 1036 err := HandleCancelSwarmingTask(ctx, "hostname", "task123", "bad_realm") 1037 So(err, ShouldErrLike, `bad global realm name "bad_realm"`) 1038 So(tq.Fatal.In(err), ShouldBeTrue) 1039 }) 1040 1041 Convey("swarming http 500", func() { 1042 mockSwarm.EXPECT().CancelTask(ctx, gomock.Any()).Return(nil, &googleapi.Error{Code: 500, Message: "swarming internal error"}) 1043 err := HandleCancelSwarmingTask(ctx, "hostname", "task123", "project:bucket") 1044 1045 So(err, ShouldErrLike, "transient error in cancelling the task task123") 1046 So(transient.Tag.In(err), ShouldBeTrue) 1047 }) 1048 1049 Convey("swarming http <500", func() { 1050 mockSwarm.EXPECT().CancelTask(ctx, gomock.Any()).Return(nil, &googleapi.Error{Code: 400, Message: "bad request"}) 1051 err := HandleCancelSwarmingTask(ctx, "hostname", "task123", "project:bucket") 1052 1053 So(err, ShouldErrLike, "fatal error in cancelling the task task123") 1054 So(tq.Fatal.In(err), ShouldBeTrue) 1055 }) 1056 }) 1057 1058 Convey("success", func() { 1059 Convey("response.ok", func() { 1060 mockSwarm.EXPECT().CancelTask(ctx, gomock.Any()).Return(&apipb.CancelResponse{Canceled: true}, nil) 1061 So(HandleCancelSwarmingTask(ctx, "hostname", "task123", "project:bucket"), ShouldBeNil) 1062 }) 1063 1064 Convey("!response.ok", func() { 1065 mockSwarm.EXPECT().CancelTask(ctx, gomock.Any()).Return(&apipb.CancelResponse{Canceled: false, WasRunning: false}, nil) 1066 So(HandleCancelSwarmingTask(ctx, "hostname", "task123", "project:bucket"), ShouldBeNil) 1067 }) 1068 }) 1069 }) 1070 } 1071 1072 func TestSubNotify(t *testing.T) { 1073 t.Parallel() 1074 Convey("SubNotify", t, func() { 1075 ctl := gomock.NewController(t) 1076 defer ctl.Finish() 1077 now := testclock.TestRecentTimeUTC 1078 mockSwarm := clients.NewMockSwarmingClient(ctl) 1079 ctx, _ := testclock.UseTime(context.Background(), now) 1080 ctx = context.WithValue(ctx, &clients.MockSwarmingClientKey, mockSwarm) 1081 ctx = memory.UseWithAppID(ctx, "dev~app-id") 1082 ctx = txndefer.FilterRDS(ctx) 1083 ctx = metrics.WithServiceInfo(ctx, "svc", "job", "ins") 1084 ctx = metrics.WithBuilder(ctx, "proj", "bucket", "builder") 1085 datastore.GetTestable(ctx).AutoIndex(true) 1086 datastore.GetTestable(ctx).Consistent(true) 1087 ctx, _ = tsmon.WithDummyInMemory(ctx) 1088 store := tsmon.Store(ctx) 1089 ctx, _ = tq.TestingContext(ctx, nil) 1090 ctx = cachingtest.WithGlobalCache(ctx, map[string]caching.BlobCache{ 1091 "swarming-pubsub-msg-id": cachingtest.NewBlobCache(), 1092 }) 1093 ctx, sch := tq.TestingContext(ctx, nil) 1094 1095 b := &model.Build{ 1096 ID: 123, 1097 Proto: &pb.Build{ 1098 Id: 123, 1099 Status: pb.Status_SCHEDULED, 1100 CreateTime: ×tamppb.Timestamp{Seconds: now.UnixNano() / 1000000000}, 1101 SchedulingTimeout: &durationpb.Duration{ 1102 Seconds: 3600, 1103 }, 1104 ExecutionTimeout: &durationpb.Duration{ 1105 Seconds: 4800, 1106 }, 1107 GracePeriod: &durationpb.Duration{ 1108 Seconds: 60, 1109 }, 1110 Builder: &pb.BuilderID{ 1111 Project: "proj", 1112 Bucket: "bucket", 1113 Builder: "builder", 1114 }, 1115 }, 1116 } 1117 inf := &model.BuildInfra{ 1118 ID: 1, 1119 Build: datastore.KeyForObj(ctx, &model.Build{ID: 123}), 1120 Proto: &pb.BuildInfra{ 1121 Swarming: &pb.BuildInfra_Swarming{ 1122 Hostname: "swarm", 1123 TaskId: "task123", 1124 Caches: []*pb.BuildInfra_Swarming_CacheEntry{ 1125 {Name: "shared_builder_cache", Path: "builder", WaitForWarmCache: &durationpb.Duration{Seconds: 60}}, 1126 {Name: "second_cache", Path: "second", WaitForWarmCache: &durationpb.Duration{Seconds: 360}}, 1127 }, 1128 TaskDimensions: []*pb.RequestedDimension{ 1129 {Key: "a", Value: "1", Expiration: &durationpb.Duration{Seconds: 120}}, 1130 {Key: "a", Value: "2", Expiration: &durationpb.Duration{Seconds: 120}}, 1131 {Key: "pool", Value: "Chrome"}, 1132 }, 1133 }, 1134 Bbagent: &pb.BuildInfra_BBAgent{ 1135 CacheDir: "cache", 1136 PayloadPath: "kitchen-checkout", 1137 }, 1138 Buildbucket: &pb.BuildInfra_Buildbucket{ 1139 Agent: &pb.BuildInfra_Buildbucket_Agent{ 1140 Source: &pb.BuildInfra_Buildbucket_Agent_Source{ 1141 DataType: &pb.BuildInfra_Buildbucket_Agent_Source_Cipd{ 1142 Cipd: &pb.BuildInfra_Buildbucket_Agent_Source_CIPD{ 1143 Package: "infra/tools/luci/bbagent/${platform}", 1144 Version: "canary-version", 1145 Server: "cipd server", 1146 }, 1147 }, 1148 }, 1149 }, 1150 }, 1151 }, 1152 } 1153 bs := &model.BuildStatus{ 1154 Build: datastore.KeyForObj(ctx, b), 1155 Status: b.Proto.Status, 1156 } 1157 So(datastore.Put(ctx, b, inf, bs), ShouldBeNil) 1158 1159 Convey("bad msg data", func() { 1160 body := makeSwarmingPubsubMsg(&userdata{ 1161 BuildID: 999, 1162 CreatedTS: 1448841600000, 1163 SwarmingHostname: "swarm", 1164 }, "", "msg1") 1165 err := SubNotify(ctx, body) 1166 So(err, ShouldErrLike, "task_id not found in message data") 1167 So(transient.Tag.In(err), ShouldBeFalse) 1168 1169 body = makeSwarmingPubsubMsg(&userdata{ 1170 CreatedTS: 1448841600000, 1171 SwarmingHostname: "swarm", 1172 }, "task123", "msg1") 1173 err = SubNotify(ctx, body) 1174 So(err, ShouldErrLike, "invalid build_id 0") 1175 1176 body = makeSwarmingPubsubMsg(&userdata{ 1177 BuildID: 999, 1178 SwarmingHostname: "swarm", 1179 }, "task123", "msg1") 1180 err = SubNotify(ctx, body) 1181 So(err, ShouldErrLike, "invalid created_ts 0") 1182 1183 body = makeSwarmingPubsubMsg(&userdata{ 1184 BuildID: 999, 1185 CreatedTS: 1448841600000, 1186 SwarmingHostname: " ", 1187 }, "task123", "msg1") 1188 err = SubNotify(ctx, body) 1189 So(err, ShouldErrLike, "swarming hostname not found in userdata") 1190 1191 body = makeSwarmingPubsubMsg(&userdata{ 1192 BuildID: 999, 1193 CreatedTS: 1448841600000, 1194 SwarmingHostname: "https://swarm.com", 1195 }, "task123", "msg1") 1196 err = SubNotify(ctx, body) 1197 So(err, ShouldErrLike, "swarming hostname https://swarm.com must not contain '://'") 1198 }) 1199 1200 Convey("build not found", func() { 1201 old := now.Add(-time.Minute).UnixNano() / int64(time.Microsecond) 1202 body := makeSwarmingPubsubMsg(&userdata{ 1203 BuildID: 999, 1204 CreatedTS: old, 1205 SwarmingHostname: "swarm", 1206 }, "task123", "msg1") 1207 err := SubNotify(ctx, body) 1208 So(err, ShouldErrLike, "Build 999 or BuildInfra for task https://swarm/task?id=task123 not found") 1209 So(transient.Tag.In(err), ShouldBeFalse) 1210 1211 recent := now.Add(-50*time.Second).UnixNano() / int64(time.Microsecond) 1212 body = makeSwarmingPubsubMsg(&userdata{ 1213 BuildID: 999, 1214 CreatedTS: recent, 1215 SwarmingHostname: "swarm", 1216 }, "task123", "msg1") 1217 err = SubNotify(ctx, body) 1218 So(err, ShouldErrLike, "Build 999 or BuildInfra for task https://swarm/task?id=task123 not found yet") 1219 So(transient.Tag.In(err), ShouldBeTrue) 1220 }) 1221 1222 Convey("different swarming hostname", func() { 1223 1224 body := makeSwarmingPubsubMsg(&userdata{ 1225 BuildID: 123, 1226 CreatedTS: 1517260502000000, 1227 SwarmingHostname: "swarm2", 1228 }, "task123", "msg1") 1229 err := SubNotify(ctx, body) 1230 So(err, ShouldErrLike, "swarming_hostname swarm of build 123 does not match swarm2") 1231 So(transient.Tag.In(err), ShouldBeFalse) 1232 }) 1233 1234 Convey("different task id", func() { 1235 body := makeSwarmingPubsubMsg(&userdata{ 1236 BuildID: 123, 1237 CreatedTS: 1517260502000000, 1238 SwarmingHostname: "swarm", 1239 }, "task345", "msg1") 1240 err := SubNotify(ctx, body) 1241 So(err, ShouldErrLike, "swarming_task_id task123 of build 123 does not match task345") 1242 So(transient.Tag.In(err), ShouldBeFalse) 1243 }) 1244 1245 Convey("swarming 500s error", func() { 1246 body := makeSwarmingPubsubMsg(&userdata{ 1247 BuildID: 123, 1248 CreatedTS: 1517260502000000, 1249 SwarmingHostname: "swarm", 1250 }, "task123", "msg1") 1251 mockSwarm.EXPECT().GetTaskResult(ctx, "task123").Return(nil, &googleapi.Error{Code: 500, Message: "swarming internal error"}) 1252 err := SubNotify(ctx, body) 1253 So(err, ShouldErrLike, "googleapi: Error 500: swarming internal error") 1254 So(transient.Tag.In(err), ShouldBeTrue) 1255 1256 cache := caching.GlobalCache(ctx, "swarming-pubsub-msg-id") 1257 _, err = cache.Get(ctx, "msg1") 1258 So(err, ShouldEqual, caching.ErrCacheMiss) 1259 }) 1260 1261 Convey("status already ended", func() { 1262 b.Proto.Status = pb.Status_SUCCESS 1263 So(datastore.Put(ctx, b), ShouldBeNil) 1264 1265 body := makeSwarmingPubsubMsg(&userdata{ 1266 BuildID: 123, 1267 CreatedTS: 1517260502000000, 1268 SwarmingHostname: "swarm", 1269 }, "task123", "msg1") 1270 err := SubNotify(ctx, body) 1271 So(err, ShouldBeNil) 1272 mockSwarm.EXPECT().CreateTask(gomock.Any(), gomock.Any()).Times(0) 1273 1274 So(sch.Tasks(), ShouldHaveLength, 0) 1275 }) 1276 1277 Convey("status changed to success", func() { 1278 body := makeSwarmingPubsubMsg(&userdata{ 1279 BuildID: 123, 1280 CreatedTS: 1517260502000000, 1281 SwarmingHostname: "swarm", 1282 }, "task123", "msg1") 1283 mockSwarm.EXPECT().GetTaskResult(ctx, "task123").Return(&apipb.TaskResultResponse{ 1284 State: apipb.TaskState_COMPLETED, 1285 StartedTs: ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}, 1286 CompletedTs: ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}, 1287 BotDimensions: []*apipb.StringListPair{ 1288 { 1289 Key: "new_key", 1290 Value: []string{"new_val"}, 1291 }, 1292 }, 1293 }, nil) 1294 err := SubNotify(ctx, body) 1295 So(err, ShouldBeNil) 1296 syncedBuild := &model.Build{ID: 123} 1297 So(datastore.Get(ctx, syncedBuild), ShouldBeNil) 1298 So(syncedBuild.Status, ShouldEqual, pb.Status_SUCCESS) 1299 So(syncedBuild.Proto.StartTime, ShouldResembleProto, ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}) 1300 So(syncedBuild.Proto.EndTime, ShouldResembleProto, ×tamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000}) 1301 1302 syncedInfra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, syncedBuild)} 1303 So(datastore.Get(ctx, syncedInfra), ShouldBeNil) 1304 So(syncedInfra.Proto.Swarming.BotDimensions, ShouldResembleProto, []*pb.StringPair{ 1305 { 1306 Key: "new_key", 1307 Value: "new_val", 1308 }, 1309 }) 1310 syncedBuildStatus := &model.BuildStatus{Build: datastore.KeyForObj(ctx, syncedBuild)} 1311 So(datastore.Get(ctx, syncedBuildStatus), ShouldBeNil) 1312 So(syncedBuildStatus.Status, ShouldEqual, pb.Status_SUCCESS) 1313 1314 cache := caching.GlobalCache(ctx, "swarming-pubsub-msg-id") 1315 cached, err := cache.Get(ctx, "msg1") 1316 So(err, ShouldBeNil) 1317 So(cached, ShouldResemble, []byte{1}) 1318 1319 // FinalizeResultDB, ExportBigQuery, NotifyPubSub, NotifyPubSubGoProxy tasks. 1320 So(sch.Tasks(), ShouldHaveLength, 4) 1321 1322 // BuildCompleted metric should be set to 1 with SUCCESS. 1323 v2fs := []any{pb.Status_name[int32(syncedBuild.Status)], "None"} 1324 So(store.Get(ctx, metrics.V2.BuildCountCompleted, time.Time{}, v2fs), ShouldEqual, 1) 1325 }) 1326 1327 Convey("status unchanged(in STARTED) while bot dimensions changed", func() { 1328 b.Proto.Status = pb.Status_STARTED 1329 bs.Status = b.Proto.Status 1330 So(datastore.Put(ctx, b, bs), ShouldBeNil) 1331 body := makeSwarmingPubsubMsg(&userdata{ 1332 BuildID: 123, 1333 CreatedTS: 1517260502000000, 1334 SwarmingHostname: "swarm", 1335 }, "task123", "msg1") 1336 mockSwarm.EXPECT().GetTaskResult(ctx, "task123").Return(&apipb.TaskResultResponse{ 1337 State: apipb.TaskState_RUNNING, 1338 StartedTs: ×tamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000}, 1339 BotDimensions: []*apipb.StringListPair{ 1340 { 1341 Key: "new_key", 1342 Value: []string{"new_val"}, 1343 }, 1344 }, 1345 }, nil) 1346 err := SubNotify(ctx, body) 1347 So(err, ShouldBeNil) 1348 syncedBuild := &model.Build{ID: 123} 1349 So(datastore.Get(ctx, syncedBuild), ShouldBeNil) 1350 So(syncedBuild.Status, ShouldEqual, pb.Status_STARTED) 1351 1352 syncedInfra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, syncedBuild)} 1353 So(datastore.Get(ctx, syncedInfra), ShouldBeNil) 1354 So(syncedInfra.Proto.Swarming.BotDimensions, ShouldResembleProto, []*pb.StringPair{{ 1355 Key: "new_key", 1356 Value: "new_val", 1357 }}) 1358 syncedBuildStatus := &model.BuildStatus{Build: datastore.KeyForObj(ctx, syncedBuild)} 1359 So(datastore.Get(ctx, syncedBuildStatus), ShouldBeNil) 1360 So(syncedBuildStatus.Status, ShouldEqual, pb.Status_STARTED) 1361 1362 So(sch.Tasks(), ShouldHaveLength, 0) 1363 }) 1364 1365 Convey("status unchanged(not in STARTED) while bot dimensions changed", func() { 1366 b.Proto.Status = pb.Status_STARTED 1367 bs.Status = b.Proto.Status 1368 So(datastore.Put(ctx, b, bs), ShouldBeNil) 1369 body := makeSwarmingPubsubMsg(&userdata{ 1370 BuildID: 123, 1371 CreatedTS: 1517260502000000, 1372 SwarmingHostname: "swarm", 1373 }, "task123", "msg1") 1374 mockSwarm.EXPECT().GetTaskResult(ctx, "task123").Return(&apipb.TaskResultResponse{ 1375 State: apipb.TaskState_PENDING, 1376 BotDimensions: []*apipb.StringListPair{ 1377 { 1378 Key: "new_key", 1379 Value: []string{"new_val"}, 1380 }, 1381 }, 1382 }, nil) 1383 err := SubNotify(ctx, body) 1384 So(err, ShouldBeNil) 1385 syncedBuild := &model.Build{ID: 123} 1386 So(datastore.Get(ctx, syncedBuild), ShouldBeNil) 1387 So(syncedBuild.Status, ShouldEqual, pb.Status_STARTED) 1388 1389 currentInfra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, syncedBuild)} 1390 So(datastore.Get(ctx, currentInfra), ShouldBeNil) 1391 So(currentInfra.Proto.Swarming.BotDimensions, ShouldBeEmpty) 1392 1393 syncedBuildStatus := &model.BuildStatus{Build: datastore.KeyForObj(ctx, syncedBuild)} 1394 So(datastore.Get(ctx, syncedBuildStatus), ShouldBeNil) 1395 So(syncedBuildStatus.Status, ShouldEqual, pb.Status_STARTED) 1396 1397 So(sch.Tasks(), ShouldHaveLength, 0) 1398 }) 1399 1400 Convey("duplicate message", func() { 1401 cache := caching.GlobalCache(ctx, "swarming-pubsub-msg-id") 1402 err := cache.Set(ctx, "msg123", []byte{1}, 0*time.Second) 1403 So(err, ShouldBeNil) 1404 1405 body := makeSwarmingPubsubMsg(&userdata{ 1406 BuildID: 123, 1407 CreatedTS: 1517260502000000, 1408 SwarmingHostname: "swarm", 1409 }, "task123", "msg123") 1410 mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Times(0) 1411 err = SubNotify(ctx, body) 1412 So(err, ShouldBeNil) 1413 }) 1414 }) 1415 } 1416 1417 func makeSwarmingPubsubMsg(userdata *userdata, taskID string, msgID string) io.Reader { 1418 ud, _ := json.Marshal(userdata) 1419 data := struct { 1420 TaskID string `json:"task_id"` 1421 Userdata string `json:"userdata"` 1422 }{TaskID: taskID, Userdata: string(ud)} 1423 bd, _ := json.Marshal(data) 1424 msg := struct { 1425 Message struct { 1426 Data string 1427 MessageID string 1428 } 1429 }{struct { 1430 Data string 1431 MessageID string 1432 }{Data: base64.StdEncoding.EncodeToString(bd), MessageID: msgID}} 1433 jmsg, _ := json.Marshal(msg) 1434 return bytes.NewReader(jmsg) 1435 } 1436 1437 type expectedBuildFields struct { 1438 status pb.Status 1439 startT *timestamppb.Timestamp 1440 endT *timestamppb.Timestamp 1441 isTimeOut bool 1442 isResourceExhaustion bool 1443 botDimensions []*pb.StringPair 1444 }