go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/tasks/swarming_test.go (about)

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tasks
    16  
    17  import (
    18  	"bytes"
    19  	"context"
    20  	"encoding/base64"
    21  	"encoding/json"
    22  	"fmt"
    23  	"io"
    24  	"path/filepath"
    25  	"testing"
    26  	"time"
    27  
    28  	"github.com/golang/mock/gomock"
    29  	"google.golang.org/api/googleapi"
    30  	"google.golang.org/protobuf/types/known/durationpb"
    31  	"google.golang.org/protobuf/types/known/timestamppb"
    32  
    33  	"go.chromium.org/luci/common/clock/testclock"
    34  	"go.chromium.org/luci/common/retry/transient"
    35  	"go.chromium.org/luci/common/tsmon"
    36  	"go.chromium.org/luci/gae/filter/txndefer"
    37  	"go.chromium.org/luci/gae/impl/memory"
    38  	"go.chromium.org/luci/gae/service/datastore"
    39  	"go.chromium.org/luci/server/caching"
    40  	"go.chromium.org/luci/server/caching/cachingtest"
    41  	"go.chromium.org/luci/server/secrets"
    42  	"go.chromium.org/luci/server/secrets/testsecrets"
    43  	"go.chromium.org/luci/server/tq"
    44  	apipb "go.chromium.org/luci/swarming/proto/api_v2"
    45  
    46  	"go.chromium.org/luci/buildbucket/appengine/internal/clients"
    47  	"go.chromium.org/luci/buildbucket/appengine/internal/metrics"
    48  	"go.chromium.org/luci/buildbucket/appengine/model"
    49  	taskdefs "go.chromium.org/luci/buildbucket/appengine/tasks/defs"
    50  	"go.chromium.org/luci/buildbucket/cmd/bbagent/bbinput"
    51  	pb "go.chromium.org/luci/buildbucket/proto"
    52  	"go.chromium.org/luci/buildbucket/protoutil"
    53  
    54  	. "github.com/smartystreets/goconvey/convey"
    55  	. "go.chromium.org/luci/common/testing/assertions"
    56  )
    57  
    58  func TestTaskDef(t *testing.T) {
    59  	Convey("compute task slice", t, func() {
    60  		b := &model.Build{
    61  			ID: 123,
    62  			Proto: &pb.Build{
    63  				Id: 123,
    64  				SchedulingTimeout: &durationpb.Duration{
    65  					Seconds: 3600,
    66  				},
    67  				ExecutionTimeout: &durationpb.Duration{
    68  					Seconds: 4800,
    69  				},
    70  				GracePeriod: &durationpb.Duration{
    71  					Seconds: 60,
    72  				},
    73  				Infra: &pb.BuildInfra{
    74  					Swarming: &pb.BuildInfra_Swarming{},
    75  					Bbagent: &pb.BuildInfra_BBAgent{
    76  						CacheDir:    "cache",
    77  						PayloadPath: "kitchen-checkout",
    78  					},
    79  					Buildbucket: &pb.BuildInfra_Buildbucket{
    80  						Agent: &pb.BuildInfra_Buildbucket_Agent{
    81  							Source: &pb.BuildInfra_Buildbucket_Agent_Source{
    82  								DataType: &pb.BuildInfra_Buildbucket_Agent_Source_Cipd{
    83  									Cipd: &pb.BuildInfra_Buildbucket_Agent_Source_CIPD{
    84  										Package: "infra/tools/luci/bbagent/${platform}",
    85  										Version: "canary-version",
    86  										Server:  "cipd server",
    87  									},
    88  								},
    89  							},
    90  							CipdClientCache: &pb.CacheEntry{
    91  								Name: "cipd_client_hash",
    92  								Path: "cipd_client",
    93  							},
    94  							CipdPackagesCache: &pb.CacheEntry{
    95  								Name: "cipd_cache_hash",
    96  								Path: "cipd_cache",
    97  							},
    98  						},
    99  					},
   100  				},
   101  			},
   102  		}
   103  		Convey("only base slice", func() {
   104  			b.Proto.Infra.Swarming = &pb.BuildInfra_Swarming{
   105  				Caches: []*pb.BuildInfra_Swarming_CacheEntry{
   106  					{Name: "shared_builder_cache", Path: "builder"},
   107  				},
   108  				TaskDimensions: []*pb.RequestedDimension{
   109  					{Key: "pool", Value: "Chrome"},
   110  				},
   111  			}
   112  			slices, err := computeTaskSlice(b)
   113  			So(err, ShouldBeNil)
   114  			So(len(slices), ShouldEqual, 1)
   115  			So(slices[0].Properties.Caches, ShouldResemble, []*apipb.CacheEntry{
   116  				{
   117  					Path: filepath.Join("cache", "builder"),
   118  					Name: "shared_builder_cache",
   119  				},
   120  				{
   121  					Path: filepath.Join("cache", "cipd_client"),
   122  					Name: "cipd_client_hash",
   123  				},
   124  				{
   125  					Path: filepath.Join("cache", "cipd_cache"),
   126  					Name: "cipd_cache_hash",
   127  				},
   128  			})
   129  			So(slices[0].Properties.Dimensions, ShouldResemble, []*apipb.StringPair{
   130  				{
   131  					Key:   "pool",
   132  					Value: "Chrome",
   133  				},
   134  			})
   135  		})
   136  
   137  		Convey("multiple dimensions and cache fallback", func() {
   138  			// Creates 4 task_slices by modifying the buildercfg in 2 ways:
   139  			//  - Add two named caches, one expiring at 60 seconds, one at 360 seconds.
   140  			//  - Add an optional builder dimension, expiring at 120 seconds.
   141  			//
   142  			// This ensures the combination of these features works correctly, and that
   143  			// multiple 'caches' dimensions can be injected.
   144  			b.Proto.Infra.Swarming = &pb.BuildInfra_Swarming{
   145  				Caches: []*pb.BuildInfra_Swarming_CacheEntry{
   146  					{Name: "shared_builder_cache", Path: "builder", WaitForWarmCache: &durationpb.Duration{Seconds: 60}},
   147  					{Name: "second_cache", Path: "second", WaitForWarmCache: &durationpb.Duration{Seconds: 360}},
   148  				},
   149  				TaskDimensions: []*pb.RequestedDimension{
   150  					{Key: "a", Value: "1", Expiration: &durationpb.Duration{Seconds: 120}},
   151  					{Key: "a", Value: "2", Expiration: &durationpb.Duration{Seconds: 120}},
   152  					{Key: "pool", Value: "Chrome"},
   153  				},
   154  			}
   155  			slices, err := computeTaskSlice(b)
   156  			So(err, ShouldBeNil)
   157  			So(len(slices), ShouldEqual, 4)
   158  
   159  			// All slices properties fields have the same value except dimensions.
   160  			for _, tSlice := range slices {
   161  				So(tSlice.Properties.ExecutionTimeoutSecs, ShouldEqual, 4800)
   162  				So(tSlice.Properties.GracePeriodSecs, ShouldEqual, 240)
   163  				So(tSlice.Properties.Caches, ShouldResemble, []*apipb.CacheEntry{
   164  					{Path: filepath.Join("cache", "builder"), Name: "shared_builder_cache"},
   165  					{Path: filepath.Join("cache", "second"), Name: "second_cache"},
   166  					{Path: filepath.Join("cache", "cipd_client"), Name: "cipd_client_hash"},
   167  					{Path: filepath.Join("cache", "cipd_cache"), Name: "cipd_cache_hash"},
   168  				})
   169  				So(tSlice.Properties.Env, ShouldResemble, []*apipb.StringPair{
   170  					{Key: "BUILDBUCKET_EXPERIMENTAL", Value: "FALSE"},
   171  				})
   172  			}
   173  
   174  			So(slices[0].ExpirationSecs, ShouldEqual, 60)
   175  			// The dimensions are different. 'a' and 'caches' are injected.
   176  			So(slices[0].Properties.Dimensions, ShouldResemble, []*apipb.StringPair{
   177  				{Key: "a", Value: "1"},
   178  				{Key: "a", Value: "2"},
   179  				{Key: "caches", Value: "second_cache"},
   180  				{Key: "caches", Value: "shared_builder_cache"},
   181  				{Key: "pool", Value: "Chrome"},
   182  			})
   183  
   184  			// 120 - 60
   185  			So(slices[1].ExpirationSecs, ShouldEqual, 60)
   186  			// The dimensions are different. 'a' and 'caches' are injected.
   187  			So(slices[1].Properties.Dimensions, ShouldResemble, []*apipb.StringPair{
   188  				{Key: "a", Value: "1"},
   189  				{Key: "a", Value: "2"},
   190  				{Key: "caches", Value: "second_cache"},
   191  				{Key: "pool", Value: "Chrome"},
   192  			})
   193  
   194  			// 360 - 120
   195  			So(slices[2].ExpirationSecs, ShouldEqual, 240)
   196  			// 'a' expired, one 'caches' remains.
   197  			So(slices[2].Properties.Dimensions, ShouldResemble, []*apipb.StringPair{
   198  				{Key: "caches", Value: "second_cache"},
   199  				{Key: "pool", Value: "Chrome"},
   200  			})
   201  
   202  			// 3600-360
   203  			So(slices[3].ExpirationSecs, ShouldEqual, 3240)
   204  			// # The cold fallback; the last 'caches' expired.
   205  			So(slices[3].Properties.Dimensions, ShouldResemble, []*apipb.StringPair{
   206  				{Key: "pool", Value: "Chrome"},
   207  			})
   208  		})
   209  	})
   210  
   211  	Convey("compute bbagent command", t, func() {
   212  		b := &model.Build{
   213  			ID: 123,
   214  			Proto: &pb.Build{
   215  				Infra: &pb.BuildInfra{
   216  					Buildbucket: &pb.BuildInfra_Buildbucket{
   217  						Hostname: "bbhost.com",
   218  					},
   219  				},
   220  			},
   221  		}
   222  		Convey("bbagent_getbuild experiment", func() {
   223  			b.Experiments = []string{"+luci.buildbucket.bbagent_getbuild"}
   224  			bbagentCmd := computeCommand(b)
   225  			So(bbagentCmd, ShouldResemble, []string{
   226  				"bbagent${EXECUTABLE_SUFFIX}",
   227  				"-host",
   228  				"bbhost.com",
   229  				"-build-id",
   230  				"123",
   231  			})
   232  		})
   233  
   234  		Convey("no bbagent_getbuild experiment", func() {
   235  			b.Experiments = []string{"-luci.buildbucket.bbagent_getbuild"}
   236  			b.Proto.Infra.Bbagent = &pb.BuildInfra_BBAgent{
   237  				CacheDir:    "cache",
   238  				PayloadPath: "payload_path",
   239  			}
   240  			bbagentCmd := computeCommand(b)
   241  			expectedEncoded := bbinput.Encode(&pb.BBAgentArgs{
   242  				Build:       b.Proto,
   243  				CacheDir:    "cache",
   244  				PayloadPath: "payload_path",
   245  			})
   246  			So(bbagentCmd, ShouldResemble, []string{
   247  				"bbagent${EXECUTABLE_SUFFIX}",
   248  				expectedEncoded,
   249  			})
   250  		})
   251  	})
   252  
   253  	Convey("compute env_prefixes", t, func() {
   254  		b := &model.Build{
   255  			ID: 123,
   256  			Proto: &pb.Build{
   257  				Infra: &pb.BuildInfra{
   258  					Swarming: &pb.BuildInfra_Swarming{},
   259  				},
   260  			},
   261  		}
   262  		Convey("empty swarming cache", func() {
   263  			prefixes := computeEnvPrefixes(b)
   264  			So(prefixes, ShouldResemble, []*apipb.StringListPair{})
   265  		})
   266  
   267  		Convey("normal", func() {
   268  			b.Proto.Infra.Swarming.Caches = []*pb.BuildInfra_Swarming_CacheEntry{
   269  				{Path: "vpython", Name: "vpython", EnvVar: "VPYTHON_VIRTUALENV_ROOT"},
   270  				{Path: "abc", Name: "abc", EnvVar: "ABC"},
   271  			}
   272  			prefixes := computeEnvPrefixes(b)
   273  			So(prefixes, ShouldResemble, []*apipb.StringListPair{
   274  				{Key: "ABC", Value: []string{filepath.Join("cache", "abc")}},
   275  				{Key: "VPYTHON_VIRTUALENV_ROOT", Value: []string{filepath.Join("cache", "vpython")}},
   276  			})
   277  		})
   278  	})
   279  
   280  	Convey("compute swarming new task req", t, func() {
   281  		ctx := memory.UseWithAppID(context.Background(), "dev~app-id")
   282  		ctx, _ = testclock.UseTime(ctx, time.Unix(1444945245, 0).UTC())
   283  		b := &model.Build{
   284  			ID:        123,
   285  			Project:   "project",
   286  			BucketID:  "bucket",
   287  			BuilderID: "builder",
   288  			Proto: &pb.Build{
   289  				Id:     123,
   290  				Number: 1,
   291  				Builder: &pb.BuilderID{
   292  					Project: "project",
   293  					Bucket:  "bucket",
   294  					Builder: "builder",
   295  				},
   296  				Infra: &pb.BuildInfra{
   297  					Swarming: &pb.BuildInfra_Swarming{
   298  						Priority:           20,
   299  						TaskServiceAccount: "abc",
   300  						Hostname:           "swarm.com",
   301  					},
   302  					Bbagent: &pb.BuildInfra_BBAgent{},
   303  					Buildbucket: &pb.BuildInfra_Buildbucket{
   304  						Hostname: "app-id.appspot.com",
   305  						Agent: &pb.BuildInfra_Buildbucket_Agent{
   306  							Source: &pb.BuildInfra_Buildbucket_Agent_Source{
   307  								DataType: &pb.BuildInfra_Buildbucket_Agent_Source_Cipd{
   308  									Cipd: &pb.BuildInfra_Buildbucket_Agent_Source_CIPD{
   309  										Package: "infra/tools/luci/bbagent/${platform}",
   310  										Version: "canary-version",
   311  										Server:  "cipd server",
   312  									},
   313  								},
   314  							},
   315  						},
   316  					},
   317  				},
   318  			},
   319  		}
   320  
   321  		req, err := computeSwarmingNewTaskReq(ctx, b)
   322  		// Strip out TaskSlices. It has been tested in other tests
   323  		req.TaskSlices = []*apipb.TaskSlice(nil)
   324  		So(err, ShouldBeNil)
   325  		ud, _ := json.Marshal(&userdata{
   326  			BuildID:          123,
   327  			CreatedTS:        1444945245000000,
   328  			SwarmingHostname: "swarm.com",
   329  		})
   330  		expected := &apipb.NewTaskRequest{
   331  			RequestUuid:      "203882df-ce4b-5012-b32a-2c1d29c321a7",
   332  			Name:             "bb-123-builder-1",
   333  			Realm:            "project:bucket",
   334  			Tags:             []string{"buildbucket_bucket:bucket", "buildbucket_build_id:123", "buildbucket_hostname:app-id.appspot.com", "buildbucket_template_canary:0", "luci_project:project"},
   335  			Priority:         int32(20),
   336  			PubsubTopic:      "projects/app-id/topics/swarming-go",
   337  			PubsubUserdata:   string(ud),
   338  			ServiceAccount:   "abc",
   339  			PoolTaskTemplate: apipb.NewTaskRequest_SKIP,
   340  		}
   341  		So(req, ShouldResemble, expected)
   342  	})
   343  }
   344  
   345  func TestSyncBuild(t *testing.T) {
   346  	t.Parallel()
   347  	Convey("SyncBuild", t, func() {
   348  		ctl := gomock.NewController(t)
   349  		defer ctl.Finish()
   350  		now := testclock.TestRecentTimeUTC
   351  		mockSwarm := clients.NewMockSwarmingClient(ctl)
   352  		ctx, _ := testclock.UseTime(context.Background(), now)
   353  		ctx = context.WithValue(ctx, &clients.MockSwarmingClientKey, mockSwarm)
   354  		ctx = memory.UseWithAppID(ctx, "dev~app-id")
   355  		ctx = txndefer.FilterRDS(ctx)
   356  		ctx = metrics.WithServiceInfo(ctx, "svc", "job", "ins")
   357  		ctx = metrics.WithBuilder(ctx, "proj", "bucket", "builder")
   358  		datastore.GetTestable(ctx).AutoIndex(true)
   359  		datastore.GetTestable(ctx).Consistent(true)
   360  		ctx, sch := tq.TestingContext(ctx, nil)
   361  		store := &testsecrets.Store{
   362  			Secrets: map[string]secrets.Secret{
   363  				"key": {Active: []byte("stuff")},
   364  			},
   365  		}
   366  		ctx = secrets.Use(ctx, store)
   367  		ctx = secrets.GeneratePrimaryTinkAEADForTest(ctx)
   368  		ctx, _ = tsmon.WithDummyInMemory(ctx)
   369  		metricsStore := tsmon.Store(ctx)
   370  
   371  		b := &model.Build{
   372  			ID: 123,
   373  			Proto: &pb.Build{
   374  				Id:         123,
   375  				Status:     pb.Status_SCHEDULED,
   376  				CreateTime: &timestamppb.Timestamp{Seconds: now.UnixNano() / 1000000000},
   377  				SchedulingTimeout: &durationpb.Duration{
   378  					Seconds: 3600,
   379  				},
   380  				ExecutionTimeout: &durationpb.Duration{
   381  					Seconds: 4800,
   382  				},
   383  				GracePeriod: &durationpb.Duration{
   384  					Seconds: 60,
   385  				},
   386  				Builder: &pb.BuilderID{
   387  					Project: "proj",
   388  					Bucket:  "bucket",
   389  					Builder: "builder",
   390  				},
   391  			},
   392  		}
   393  		inf := &model.BuildInfra{
   394  			ID:    1,
   395  			Build: datastore.KeyForObj(ctx, &model.Build{ID: 123}),
   396  			Proto: &pb.BuildInfra{
   397  				Swarming: &pb.BuildInfra_Swarming{
   398  					Hostname: "swarm",
   399  					Caches: []*pb.BuildInfra_Swarming_CacheEntry{
   400  						{Name: "shared_builder_cache", Path: "builder", WaitForWarmCache: &durationpb.Duration{Seconds: 60}},
   401  						{Name: "second_cache", Path: "second", WaitForWarmCache: &durationpb.Duration{Seconds: 360}},
   402  					},
   403  					TaskDimensions: []*pb.RequestedDimension{
   404  						{Key: "a", Value: "1", Expiration: &durationpb.Duration{Seconds: 120}},
   405  						{Key: "a", Value: "2", Expiration: &durationpb.Duration{Seconds: 120}},
   406  						{Key: "pool", Value: "Chrome"},
   407  					},
   408  				},
   409  				Bbagent: &pb.BuildInfra_BBAgent{
   410  					CacheDir:    "cache",
   411  					PayloadPath: "kitchen-checkout",
   412  				},
   413  				Buildbucket: &pb.BuildInfra_Buildbucket{
   414  					Agent: &pb.BuildInfra_Buildbucket_Agent{
   415  						Source: &pb.BuildInfra_Buildbucket_Agent_Source{
   416  							DataType: &pb.BuildInfra_Buildbucket_Agent_Source_Cipd{
   417  								Cipd: &pb.BuildInfra_Buildbucket_Agent_Source_CIPD{
   418  									Package: "infra/tools/luci/bbagent/${platform}",
   419  									Version: "canary-version",
   420  									Server:  "cipd server",
   421  								},
   422  							},
   423  						},
   424  					},
   425  				},
   426  			},
   427  		}
   428  		bs := &model.BuildStatus{
   429  			Build:  datastore.KeyForObj(ctx, b),
   430  			Status: b.Proto.Status,
   431  		}
   432  		So(datastore.Put(ctx, b, inf, bs), ShouldBeNil)
   433  		Convey("swarming-build-create", func() {
   434  
   435  			Convey("build not found", func() {
   436  				err := SyncBuild(ctx, 789, 0)
   437  				So(err, ShouldErrLike, "build 789 or buildInfra not found")
   438  			})
   439  
   440  			Convey("build too old", func() {
   441  				So(datastore.Put(ctx, &model.Build{
   442  					ID:         111,
   443  					CreateTime: now.AddDate(0, 0, -3),
   444  					Proto: &pb.Build{
   445  						Builder: &pb.BuilderID{},
   446  					},
   447  				}), ShouldBeNil)
   448  
   449  				So(datastore.Put(ctx, &model.BuildInfra{
   450  					ID:    1,
   451  					Build: datastore.KeyForObj(ctx, &model.Build{ID: 111}),
   452  				}), ShouldBeNil)
   453  				err := SyncBuild(ctx, 111, 0)
   454  				So(err, ShouldBeNil)
   455  				So(sch.Tasks(), ShouldHaveLength, 0)
   456  			})
   457  
   458  			Convey("build ended", func() {
   459  				So(datastore.Put(ctx, &model.Build{
   460  					ID:     111,
   461  					Status: pb.Status_SUCCESS,
   462  					Proto: &pb.Build{
   463  						Builder: &pb.BuilderID{},
   464  					},
   465  				}), ShouldBeNil)
   466  				So(datastore.Put(ctx, &model.BuildInfra{
   467  					ID:    1,
   468  					Build: datastore.KeyForObj(ctx, &model.Build{ID: 111}),
   469  				}), ShouldBeNil)
   470  				err := SyncBuild(ctx, 111, 0)
   471  				So(err, ShouldBeNil)
   472  				So(sch.Tasks(), ShouldHaveLength, 0)
   473  			})
   474  
   475  			Convey("create swarming success", func() {
   476  				mockSwarm.EXPECT().CreateTask(gomock.Any(), gomock.Any()).Return(&apipb.TaskRequestMetadataResponse{
   477  					TaskId: "task123",
   478  				}, nil)
   479  				err := SyncBuild(ctx, 123, 0)
   480  				So(err, ShouldBeNil)
   481  				updatedBuild := &model.Build{ID: 123}
   482  				updatedInfra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, updatedBuild)}
   483  				So(datastore.Get(ctx, updatedBuild), ShouldBeNil)
   484  				So(datastore.Get(ctx, updatedInfra), ShouldBeNil)
   485  				So(updatedBuild.UpdateToken, ShouldNotBeEmpty)
   486  				So(updatedInfra.Proto.Swarming.TaskId, ShouldEqual, "task123")
   487  				So(sch.Tasks(), ShouldHaveLength, 1)
   488  			})
   489  
   490  			Convey("create swarming http 400 err", func() {
   491  				mockSwarm.EXPECT().CreateTask(gomock.Any(), gomock.Any()).Return(nil, &googleapi.Error{Code: 400})
   492  				err := SyncBuild(ctx, 123, 0)
   493  				So(err, ShouldBeNil)
   494  				failedBuild := &model.Build{ID: 123}
   495  				bldStatus := &model.BuildStatus{
   496  					Build: datastore.KeyForObj(ctx, &model.Build{ID: 123}),
   497  				}
   498  				So(datastore.Get(ctx, failedBuild, bldStatus), ShouldBeNil)
   499  				So(failedBuild.Status, ShouldEqual, pb.Status_INFRA_FAILURE)
   500  				So(failedBuild.Proto.SummaryMarkdown, ShouldContainSubstring, "failed to create a swarming task: googleapi: got HTTP response code 400")
   501  				So(sch.Tasks(), ShouldHaveLength, 4)
   502  				So(bldStatus.Status, ShouldEqual, pb.Status_INFRA_FAILURE)
   503  			})
   504  
   505  			Convey("create swarming http 500 err", func() {
   506  				mockSwarm.EXPECT().CreateTask(gomock.Any(), gomock.Any()).Return(nil, &googleapi.Error{Code: 500})
   507  				err := SyncBuild(ctx, 123, 0)
   508  				So(err, ShouldErrLike, "failed to create a swarming task")
   509  				So(transient.Tag.In(err), ShouldBeTrue)
   510  				bld := &model.Build{ID: 123}
   511  				So(datastore.Get(ctx, bld), ShouldBeNil)
   512  				So(bld.Status, ShouldEqual, pb.Status_SCHEDULED)
   513  			})
   514  
   515  			Convey("create swarming http 500 err give up", func() {
   516  				ctx1, _ := testclock.UseTime(ctx, now.Add(swarmingCreateTaskGiveUpTimeout))
   517  				mockSwarm.EXPECT().CreateTask(gomock.Any(), gomock.Any()).Return(nil, &googleapi.Error{Code: 500})
   518  				err := SyncBuild(ctx1, 123, 0)
   519  				So(err, ShouldBeNil)
   520  				failedBuild := &model.Build{ID: 123}
   521  				So(datastore.Get(ctx, failedBuild), ShouldBeNil)
   522  				So(failedBuild.Status, ShouldEqual, pb.Status_INFRA_FAILURE)
   523  				So(failedBuild.Proto.SummaryMarkdown, ShouldContainSubstring, "failed to create a swarming task: googleapi: got HTTP response code 500")
   524  				So(sch.Tasks(), ShouldHaveLength, 4)
   525  			})
   526  
   527  			Convey("swarming task creation success but update build fail", func() {
   528  				mockSwarm.EXPECT().CreateTask(gomock.Any(), gomock.Any()).DoAndReturn(func(ctx context.Context, req *apipb.NewTaskRequest) (*apipb.TaskRequestMetadataResponse, error) {
   529  					// Hack to make the build update fail when trying to update build with the new task ID.
   530  					inf.Proto.Swarming.TaskId = "old task ID"
   531  					So(datastore.Put(ctx, inf), ShouldBeNil)
   532  					return &apipb.TaskRequestMetadataResponse{TaskId: "new task ID"}, nil
   533  				})
   534  
   535  				err := SyncBuild(ctx, 123, 0)
   536  				So(err, ShouldErrLike, "failed to update build 123: build already has a task old task ID")
   537  				currentInfra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, &model.Build{
   538  					ID: 123,
   539  				})}
   540  				So(datastore.Get(ctx, currentInfra), ShouldBeNil)
   541  				So(currentInfra.Proto.Swarming.TaskId, ShouldEqual, "old task ID")
   542  				So(sch.Tasks(), ShouldHaveLength, 0)
   543  			})
   544  		})
   545  
   546  		Convey("swarming sync", func() {
   547  			inf.Proto.Swarming.TaskId = "task_id"
   548  			So(datastore.Put(ctx, inf), ShouldBeNil)
   549  
   550  			Convey("non-existing task ID", func() {
   551  				mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Return(nil, &googleapi.Error{Code: 404})
   552  				err := syncBuildWithTaskResult(ctx, 123, "task_id", mockSwarm)
   553  				So(err, ShouldBeNil)
   554  				failedBuild := &model.Build{ID: 123}
   555  				So(datastore.Get(ctx, failedBuild), ShouldBeNil)
   556  				So(failedBuild.Status, ShouldEqual, pb.Status_INFRA_FAILURE)
   557  				So(failedBuild.Proto.SummaryMarkdown, ShouldContainSubstring, "invalid swarming task task_id")
   558  			})
   559  
   560  			Convey("swarming server 500", func() {
   561  				mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Return(nil, &googleapi.Error{Code: 500})
   562  				err := syncBuildWithTaskResult(ctx, 123, "task_id", mockSwarm)
   563  				So(transient.Tag.In(err), ShouldBeTrue)
   564  			})
   565  
   566  			Convey("empty task result", func() {
   567  				mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Return(nil, nil)
   568  				err := syncBuildWithTaskResult(ctx, 123, "task_id", mockSwarm)
   569  				So(err, ShouldBeNil)
   570  				failedBuild := &model.Build{ID: 123}
   571  				So(datastore.Get(ctx, failedBuild), ShouldBeNil)
   572  				So(failedBuild.Status, ShouldEqual, pb.Status_INFRA_FAILURE)
   573  				So(failedBuild.Proto.SummaryMarkdown, ShouldContainSubstring, "Swarming task task_id unexpectedly disappeared")
   574  			})
   575  
   576  			Convey("invalid task result state", func() {
   577  				// syncBuildWithTaskResult should return Fatal error
   578  				mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Return(&apipb.TaskResultResponse{State: apipb.TaskState_INVALID}, nil)
   579  				err := syncBuildWithTaskResult(ctx, 123, "task_id", mockSwarm)
   580  				So(tq.Fatal.In(err), ShouldBeTrue)
   581  				bb := &model.Build{ID: 123}
   582  				So(datastore.Get(ctx, bb), ShouldBeNil)
   583  				So(bb.Status, ShouldEqual, pb.Status_SCHEDULED) // build status should not been impacted
   584  
   585  				// The swarming-build-sync flow shouldn't bubble up the Fatal error.
   586  				// It should ignore and enqueue the next generation of sync task.
   587  				mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Return(&apipb.TaskResultResponse{State: apipb.TaskState_INVALID}, nil)
   588  				err = SyncBuild(ctx, 123, 1)
   589  				So(err, ShouldBeNil)
   590  				bb = &model.Build{ID: 123}
   591  				So(datastore.Get(ctx, bb), ShouldBeNil)
   592  				So(bb.Status, ShouldEqual, pb.Status_SCHEDULED)
   593  				So(sch.Tasks(), ShouldHaveLength, 1)
   594  				So(sch.Tasks().Payloads()[0], ShouldResembleProto, &taskdefs.SyncSwarmingBuildTask{
   595  					BuildId:    123,
   596  					Generation: 2,
   597  				})
   598  			})
   599  
   600  			Convey("cancel incomplete steps for an ended build", func() {
   601  				mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Return(&apipb.TaskResultResponse{
   602  					State:       apipb.TaskState_BOT_DIED,
   603  					StartedTs:   &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000},
   604  					CompletedTs: &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   605  				}, nil)
   606  				steps := model.BuildSteps{
   607  					ID:    1,
   608  					Build: datastore.KeyForObj(ctx, &model.Build{ID: 123}),
   609  				}
   610  				So(steps.FromProto([]*pb.Step{
   611  					{Name: "step1", Status: pb.Status_SUCCESS},
   612  					{Name: "step2", Status: pb.Status_STARTED},
   613  				}), ShouldBeNil)
   614  				So(datastore.Put(ctx, &steps), ShouldBeNil)
   615  
   616  				err := syncBuildWithTaskResult(ctx, 123, "task_id", mockSwarm)
   617  				So(err, ShouldBeNil)
   618  				failedBuild := &model.Build{ID: 123}
   619  				So(datastore.Get(ctx, failedBuild), ShouldBeNil)
   620  				So(failedBuild.Status, ShouldEqual, pb.Status_INFRA_FAILURE)
   621  				allSteps := &model.BuildSteps{
   622  					ID:    1,
   623  					Build: datastore.KeyForObj(ctx, &model.Build{ID: 123}),
   624  				}
   625  				So(datastore.Get(ctx, allSteps), ShouldBeNil)
   626  				mSteps, err := allSteps.ToProto(ctx)
   627  				So(err, ShouldBeNil)
   628  				So(mSteps, ShouldResembleProto, []*pb.Step{
   629  					{
   630  						Name:   "step1",
   631  						Status: pb.Status_SUCCESS,
   632  					},
   633  					{
   634  						Name:    "step2",
   635  						Status:  pb.Status_CANCELED,
   636  						EndTime: &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   637  					},
   638  				})
   639  			})
   640  
   641  			Convey("build has output status set to FAILURE", func() {
   642  				fakeTaskResult := &apipb.TaskResultResponse{
   643  					State:   apipb.TaskState_COMPLETED,
   644  					Failure: true,
   645  				}
   646  				mockSwarm.EXPECT().GetTaskResult(ctx, "task567").Return(fakeTaskResult, nil)
   647  				b := &model.Build{
   648  					ID: 567,
   649  					Proto: &pb.Build{
   650  						Builder: &pb.BuilderID{
   651  							Project: "proj",
   652  							Bucket:  "bucket",
   653  							Builder: "builder",
   654  						},
   655  						CreateTime: &timestamppb.Timestamp{Seconds: now.UnixNano() / 1000000000},
   656  						Status:     pb.Status_STARTED,
   657  						Output: &pb.Build_Output{
   658  							Status: pb.Status_FAILURE,
   659  						},
   660  					},
   661  				}
   662  				inf := &model.BuildInfra{
   663  					ID:    1,
   664  					Build: datastore.KeyForObj(ctx, &model.Build{ID: 567}),
   665  					Proto: &pb.BuildInfra{
   666  						Swarming: &pb.BuildInfra_Swarming{
   667  							Hostname: "swarm",
   668  							TaskId:   "task567",
   669  						},
   670  					},
   671  				}
   672  				bs := &model.BuildStatus{
   673  					Build:  datastore.KeyForObj(ctx, b),
   674  					Status: b.Proto.Status,
   675  				}
   676  				So(datastore.Put(ctx, b, inf, bs), ShouldBeNil)
   677  				err := SyncBuild(ctx, 567, 1)
   678  				So(err, ShouldBeNil)
   679  				So(datastore.Get(ctx, b), ShouldBeNil)
   680  				So(b.Proto.Status, ShouldEqual, pb.Status_FAILURE)
   681  			})
   682  
   683  			Convey("build has output status set to CANCELED", func() {
   684  				fakeTaskResult := &apipb.TaskResultResponse{
   685  					State:   apipb.TaskState_COMPLETED,
   686  					Failure: true,
   687  				}
   688  				mockSwarm.EXPECT().GetTaskResult(ctx, "task567").Return(fakeTaskResult, nil)
   689  				b := &model.Build{
   690  					ID: 567,
   691  					Proto: &pb.Build{
   692  						Builder: &pb.BuilderID{
   693  							Project: "proj",
   694  							Bucket:  "bucket",
   695  							Builder: "builder",
   696  						},
   697  						CreateTime: &timestamppb.Timestamp{Seconds: now.UnixNano() / 1000000000},
   698  						Status:     pb.Status_STARTED,
   699  						Output: &pb.Build_Output{
   700  							Status: pb.Status_CANCELED,
   701  						},
   702  					},
   703  				}
   704  				inf := &model.BuildInfra{
   705  					ID:    1,
   706  					Build: datastore.KeyForObj(ctx, &model.Build{ID: 567}),
   707  					Proto: &pb.BuildInfra{
   708  						Swarming: &pb.BuildInfra_Swarming{
   709  							Hostname: "swarm",
   710  							TaskId:   "task567",
   711  						},
   712  					},
   713  				}
   714  				bs := &model.BuildStatus{
   715  					Build:  datastore.KeyForObj(ctx, b),
   716  					Status: b.Proto.Status,
   717  				}
   718  				So(datastore.Put(ctx, b, inf, bs), ShouldBeNil)
   719  				err := SyncBuild(ctx, 567, 1)
   720  				So(err, ShouldBeNil)
   721  				So(datastore.Get(ctx, b), ShouldBeNil)
   722  				So(b.Proto.Status, ShouldEqual, pb.Status_CANCELED)
   723  			})
   724  
   725  			Convey("build has output status set to CANCELED while swarming task succeeded", func() {
   726  				fakeTaskResult := &apipb.TaskResultResponse{
   727  					State: apipb.TaskState_COMPLETED,
   728  				}
   729  				mockSwarm.EXPECT().GetTaskResult(ctx, "task567").Return(fakeTaskResult, nil)
   730  				b := &model.Build{
   731  					ID: 567,
   732  					Proto: &pb.Build{
   733  						Builder: &pb.BuilderID{
   734  							Project: "proj",
   735  							Bucket:  "bucket",
   736  							Builder: "builder",
   737  						},
   738  						CreateTime: &timestamppb.Timestamp{Seconds: now.UnixNano() / 1000000000},
   739  						Status:     pb.Status_STARTED,
   740  						Output: &pb.Build_Output{
   741  							Status: pb.Status_CANCELED,
   742  						},
   743  					},
   744  				}
   745  				inf := &model.BuildInfra{
   746  					ID:    1,
   747  					Build: datastore.KeyForObj(ctx, &model.Build{ID: 567}),
   748  					Proto: &pb.BuildInfra{
   749  						Swarming: &pb.BuildInfra_Swarming{
   750  							Hostname: "swarm",
   751  							TaskId:   "task567",
   752  						},
   753  					},
   754  				}
   755  				bs := &model.BuildStatus{
   756  					Build:  datastore.KeyForObj(ctx, b),
   757  					Status: b.Proto.Status,
   758  				}
   759  				So(datastore.Put(ctx, b, inf, bs), ShouldBeNil)
   760  				err := SyncBuild(ctx, 567, 1)
   761  				So(err, ShouldBeNil)
   762  				So(datastore.Get(ctx, b), ShouldBeNil)
   763  				So(b.Proto.Status, ShouldEqual, pb.Status_CANCELED)
   764  			})
   765  
   766  			Convey("task has no resource", func() {
   767  				fakeTaskResult := &apipb.TaskResultResponse{
   768  					State:       apipb.TaskState_NO_RESOURCE,
   769  					AbandonedTs: &timestamppb.Timestamp{Seconds: now.UnixNano() / 1000000000},
   770  				}
   771  				mockSwarm.EXPECT().GetTaskResult(ctx, "task567").Return(fakeTaskResult, nil)
   772  				b := &model.Build{
   773  					ID: 567,
   774  					Proto: &pb.Build{
   775  						Builder: &pb.BuilderID{
   776  							Project: "proj",
   777  							Bucket:  "bucket",
   778  							Builder: "builder",
   779  						},
   780  						CreateTime: &timestamppb.Timestamp{Seconds: now.UnixNano() / 1000000000},
   781  						Status:     pb.Status_SCHEDULED,
   782  					},
   783  				}
   784  				inf := &model.BuildInfra{
   785  					ID:    1,
   786  					Build: datastore.KeyForObj(ctx, &model.Build{ID: 567}),
   787  					Proto: &pb.BuildInfra{
   788  						Swarming: &pb.BuildInfra_Swarming{
   789  							Hostname: "swarm",
   790  							TaskId:   "task567",
   791  						},
   792  					},
   793  				}
   794  				bs := &model.BuildStatus{
   795  					Build:  datastore.KeyForObj(ctx, b),
   796  					Status: b.Proto.Status,
   797  				}
   798  				So(datastore.Put(ctx, b, inf, bs), ShouldBeNil)
   799  				err := SyncBuild(ctx, 567, 1)
   800  				So(err, ShouldBeNil)
   801  				So(datastore.Get(ctx, b), ShouldBeNil)
   802  				So(b.Proto.Status, ShouldEqual, pb.Status_INFRA_FAILURE)
   803  				So(b.Proto.StartTime, ShouldBeNil)
   804  			})
   805  
   806  			var cases = []struct {
   807  				fakeTaskResult *apipb.TaskResultResponse
   808  				expected       *expectedBuildFields
   809  			}{
   810  				{
   811  					fakeTaskResult: &apipb.TaskResultResponse{State: apipb.TaskState_PENDING},
   812  					expected: &expectedBuildFields{
   813  						status: pb.Status_SCHEDULED,
   814  					},
   815  				},
   816  				{
   817  					fakeTaskResult: &apipb.TaskResultResponse{
   818  						State:     apipb.TaskState_RUNNING,
   819  						StartedTs: &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000},
   820  					},
   821  					expected: &expectedBuildFields{
   822  						status: pb.Status_STARTED,
   823  						startT: &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000},
   824  					},
   825  				},
   826  				{
   827  					fakeTaskResult: &apipb.TaskResultResponse{
   828  						State:       apipb.TaskState_COMPLETED,
   829  						StartedTs:   &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000},
   830  						CompletedTs: &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   831  					},
   832  					expected: &expectedBuildFields{
   833  						status: pb.Status_SUCCESS,
   834  						startT: &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000},
   835  						endT:   &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   836  					},
   837  				},
   838  				{
   839  					fakeTaskResult: &apipb.TaskResultResponse{
   840  						State:       apipb.TaskState_COMPLETED,
   841  						StartedTs:   &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000},
   842  						CompletedTs: &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   843  						BotDimensions: []*apipb.StringListPair{
   844  							{
   845  								Key:   "os",
   846  								Value: []string{"Ubuntu", "Trusty"},
   847  							},
   848  							{
   849  								Key:   "pool",
   850  								Value: []string{"luci.chromium.try"},
   851  							},
   852  							{
   853  								Key:   "id",
   854  								Value: []string{"bot1"},
   855  							},
   856  							{
   857  								Key: "empty",
   858  							},
   859  						},
   860  					},
   861  					expected: &expectedBuildFields{
   862  						status: pb.Status_SUCCESS,
   863  						startT: &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000},
   864  						endT:   &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   865  						botDimensions: []*pb.StringPair{
   866  							{Key: "id", Value: "bot1"},
   867  							{Key: "os", Value: "Trusty"},
   868  							{Key: "os", Value: "Ubuntu"},
   869  							{Key: "pool", Value: "luci.chromium.try"},
   870  						},
   871  					},
   872  				},
   873  				{
   874  					fakeTaskResult: &apipb.TaskResultResponse{
   875  						State:       apipb.TaskState_COMPLETED,
   876  						Failure:     true,
   877  						StartedTs:   &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000},
   878  						CompletedTs: &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   879  					},
   880  					expected: &expectedBuildFields{
   881  						status: pb.Status_INFRA_FAILURE,
   882  						startT: &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000},
   883  						endT:   &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   884  					},
   885  				},
   886  				{
   887  					fakeTaskResult: &apipb.TaskResultResponse{
   888  						State:       apipb.TaskState_BOT_DIED,
   889  						StartedTs:   &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000},
   890  						CompletedTs: &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   891  					},
   892  					expected: &expectedBuildFields{
   893  						status: pb.Status_INFRA_FAILURE,
   894  						startT: &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000},
   895  						endT:   &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   896  					},
   897  				},
   898  				{
   899  					fakeTaskResult: &apipb.TaskResultResponse{
   900  						State:       apipb.TaskState_TIMED_OUT,
   901  						StartedTs:   &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000},
   902  						CompletedTs: &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   903  					},
   904  					expected: &expectedBuildFields{
   905  						status:    pb.Status_INFRA_FAILURE,
   906  						isTimeOut: true,
   907  						startT:    &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000},
   908  						endT:      &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   909  					},
   910  				},
   911  				{
   912  					fakeTaskResult: &apipb.TaskResultResponse{
   913  						State:       apipb.TaskState_EXPIRED,
   914  						AbandonedTs: &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   915  					},
   916  					expected: &expectedBuildFields{
   917  						status:               pb.Status_INFRA_FAILURE,
   918  						isResourceExhaustion: true,
   919  						isTimeOut:            true,
   920  						endT:                 &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   921  					},
   922  				},
   923  				{
   924  					fakeTaskResult: &apipb.TaskResultResponse{
   925  						State:       apipb.TaskState_KILLED,
   926  						AbandonedTs: &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   927  					},
   928  					expected: &expectedBuildFields{
   929  						status: pb.Status_CANCELED,
   930  						endT:   &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   931  					},
   932  				},
   933  				{
   934  					fakeTaskResult: &apipb.TaskResultResponse{
   935  						State:       apipb.TaskState_NO_RESOURCE,
   936  						AbandonedTs: &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   937  					},
   938  					expected: &expectedBuildFields{
   939  						status:               pb.Status_INFRA_FAILURE,
   940  						isResourceExhaustion: true,
   941  						endT:                 &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
   942  					},
   943  				},
   944  				{
   945  					fakeTaskResult: &apipb.TaskResultResponse{
   946  						State:       apipb.TaskState_NO_RESOURCE,
   947  						AbandonedTs: &timestamppb.Timestamp{Seconds: now.UnixNano() / 1000000000},
   948  					},
   949  					expected: &expectedBuildFields{
   950  						status:               pb.Status_INFRA_FAILURE,
   951  						isResourceExhaustion: true,
   952  						endT:                 &timestamppb.Timestamp{Seconds: now.UnixNano() / 1000000000},
   953  					},
   954  				},
   955  			}
   956  			for i, tCase := range cases {
   957  				Convey(fmt.Sprintf("test %d - task %s", i, tCase.fakeTaskResult.State), func() {
   958  					mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Return(tCase.fakeTaskResult, nil)
   959  					err := SyncBuild(ctx, 123, 1)
   960  					So(err, ShouldBeNil)
   961  					syncedBuild := &model.Build{ID: 123}
   962  					So(datastore.Get(ctx, syncedBuild), ShouldBeNil)
   963  					So(syncedBuild.Status, ShouldEqual, tCase.expected.status)
   964  					if tCase.expected.isResourceExhaustion {
   965  						So(syncedBuild.Proto.StatusDetails.ResourceExhaustion, ShouldResembleProto, &pb.StatusDetails_ResourceExhaustion{})
   966  					} else {
   967  						So(syncedBuild.Proto.StatusDetails.GetResourceExhaustion(), ShouldBeNil)
   968  					}
   969  					if tCase.expected.isTimeOut {
   970  						So(syncedBuild.Proto.StatusDetails.Timeout, ShouldResembleProto, &pb.StatusDetails_Timeout{})
   971  					} else {
   972  						So(syncedBuild.Proto.StatusDetails.GetTimeout(), ShouldBeNil)
   973  					}
   974  					if tCase.expected.startT != nil {
   975  						So(syncedBuild.Proto.StartTime, ShouldResembleProto, tCase.expected.startT)
   976  					}
   977  					if tCase.expected.endT != nil {
   978  						So(syncedBuild.Proto.EndTime, ShouldResembleProto, tCase.expected.endT)
   979  					}
   980  					if tCase.expected.botDimensions != nil {
   981  						syncedInfra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, syncedBuild)}
   982  						So(datastore.Get(ctx, syncedInfra), ShouldBeNil)
   983  						So(syncedInfra.Proto.Swarming.BotDimensions, ShouldResembleProto, tCase.expected.botDimensions)
   984  					}
   985  					if protoutil.IsEnded(syncedBuild.Status) {
   986  						// FinalizeResultDB, ExportBigQuery, NotifyPubSub, NotifyPubSubGoProxy and a continuation sync task.
   987  						So(sch.Tasks(), ShouldHaveLength, 5)
   988  
   989  						v2fs := []any{pb.Status_name[int32(syncedBuild.Status)], "None"}
   990  						So(metricsStore.Get(ctx, metrics.V2.BuildCountCompleted, time.Time{}, v2fs), ShouldEqual, 1)
   991  
   992  					} else if syncedBuild.Status == pb.Status_STARTED {
   993  						// NotifyPubSub, NotifyPubSubGoProxy and a continuation sync task.
   994  						So(sch.Tasks(), ShouldHaveLength, 3)
   995  						So(metricsStore.Get(ctx, metrics.V2.BuildCountStarted, time.Time{}, []any{"None"}), ShouldEqual, 1)
   996  					}
   997  					syncedBuildStatus := &model.BuildStatus{Build: datastore.KeyForObj(ctx, syncedBuild)}
   998  					So(datastore.Get(ctx, syncedBuildStatus), ShouldBeNil)
   999  					So(syncedBuildStatus.Status, ShouldEqual, syncedBuild.Proto.Status)
  1000  				})
  1001  			}
  1002  		})
  1003  	})
  1004  
  1005  }
  1006  
  1007  func TestHandleCancelSwarmingTask(t *testing.T) {
  1008  	t.Parallel()
  1009  	Convey("HandleCancelSwarmingTask", t, func() {
  1010  		ctl := gomock.NewController(t)
  1011  		defer ctl.Finish()
  1012  		now := testclock.TestRecentTimeUTC
  1013  		mockSwarm := clients.NewMockSwarmingClient(ctl)
  1014  		ctx, _ := testclock.UseTime(context.Background(), now)
  1015  		ctx = context.WithValue(ctx, &clients.MockSwarmingClientKey, mockSwarm)
  1016  		ctx = memory.UseWithAppID(ctx, "dev~app-id")
  1017  		ctx = txndefer.FilterRDS(ctx)
  1018  		ctx = metrics.WithServiceInfo(ctx, "svc", "job", "ins")
  1019  		datastore.GetTestable(ctx).AutoIndex(true)
  1020  		datastore.GetTestable(ctx).Consistent(true)
  1021  
  1022  		Convey("wrong", func() {
  1023  			Convey("empty hostname", func() {
  1024  				err := HandleCancelSwarmingTask(ctx, "", "task123", "project:bucket")
  1025  				So(err, ShouldErrLike, "hostname is empty")
  1026  				So(tq.Fatal.In(err), ShouldBeTrue)
  1027  			})
  1028  
  1029  			Convey("empty taskID", func() {
  1030  				err := HandleCancelSwarmingTask(ctx, "hostname", "", "project:bucket")
  1031  				So(err, ShouldErrLike, "taskID is empty")
  1032  				So(tq.Fatal.In(err), ShouldBeTrue)
  1033  			})
  1034  
  1035  			Convey("wrong realm", func() {
  1036  				err := HandleCancelSwarmingTask(ctx, "hostname", "task123", "bad_realm")
  1037  				So(err, ShouldErrLike, `bad global realm name "bad_realm"`)
  1038  				So(tq.Fatal.In(err), ShouldBeTrue)
  1039  			})
  1040  
  1041  			Convey("swarming http 500", func() {
  1042  				mockSwarm.EXPECT().CancelTask(ctx, gomock.Any()).Return(nil, &googleapi.Error{Code: 500, Message: "swarming internal error"})
  1043  				err := HandleCancelSwarmingTask(ctx, "hostname", "task123", "project:bucket")
  1044  
  1045  				So(err, ShouldErrLike, "transient error in cancelling the task task123")
  1046  				So(transient.Tag.In(err), ShouldBeTrue)
  1047  			})
  1048  
  1049  			Convey("swarming http <500", func() {
  1050  				mockSwarm.EXPECT().CancelTask(ctx, gomock.Any()).Return(nil, &googleapi.Error{Code: 400, Message: "bad request"})
  1051  				err := HandleCancelSwarmingTask(ctx, "hostname", "task123", "project:bucket")
  1052  
  1053  				So(err, ShouldErrLike, "fatal error in cancelling the task task123")
  1054  				So(tq.Fatal.In(err), ShouldBeTrue)
  1055  			})
  1056  		})
  1057  
  1058  		Convey("success", func() {
  1059  			Convey("response.ok", func() {
  1060  				mockSwarm.EXPECT().CancelTask(ctx, gomock.Any()).Return(&apipb.CancelResponse{Canceled: true}, nil)
  1061  				So(HandleCancelSwarmingTask(ctx, "hostname", "task123", "project:bucket"), ShouldBeNil)
  1062  			})
  1063  
  1064  			Convey("!response.ok", func() {
  1065  				mockSwarm.EXPECT().CancelTask(ctx, gomock.Any()).Return(&apipb.CancelResponse{Canceled: false, WasRunning: false}, nil)
  1066  				So(HandleCancelSwarmingTask(ctx, "hostname", "task123", "project:bucket"), ShouldBeNil)
  1067  			})
  1068  		})
  1069  	})
  1070  }
  1071  
  1072  func TestSubNotify(t *testing.T) {
  1073  	t.Parallel()
  1074  	Convey("SubNotify", t, func() {
  1075  		ctl := gomock.NewController(t)
  1076  		defer ctl.Finish()
  1077  		now := testclock.TestRecentTimeUTC
  1078  		mockSwarm := clients.NewMockSwarmingClient(ctl)
  1079  		ctx, _ := testclock.UseTime(context.Background(), now)
  1080  		ctx = context.WithValue(ctx, &clients.MockSwarmingClientKey, mockSwarm)
  1081  		ctx = memory.UseWithAppID(ctx, "dev~app-id")
  1082  		ctx = txndefer.FilterRDS(ctx)
  1083  		ctx = metrics.WithServiceInfo(ctx, "svc", "job", "ins")
  1084  		ctx = metrics.WithBuilder(ctx, "proj", "bucket", "builder")
  1085  		datastore.GetTestable(ctx).AutoIndex(true)
  1086  		datastore.GetTestable(ctx).Consistent(true)
  1087  		ctx, _ = tsmon.WithDummyInMemory(ctx)
  1088  		store := tsmon.Store(ctx)
  1089  		ctx, _ = tq.TestingContext(ctx, nil)
  1090  		ctx = cachingtest.WithGlobalCache(ctx, map[string]caching.BlobCache{
  1091  			"swarming-pubsub-msg-id": cachingtest.NewBlobCache(),
  1092  		})
  1093  		ctx, sch := tq.TestingContext(ctx, nil)
  1094  
  1095  		b := &model.Build{
  1096  			ID: 123,
  1097  			Proto: &pb.Build{
  1098  				Id:         123,
  1099  				Status:     pb.Status_SCHEDULED,
  1100  				CreateTime: &timestamppb.Timestamp{Seconds: now.UnixNano() / 1000000000},
  1101  				SchedulingTimeout: &durationpb.Duration{
  1102  					Seconds: 3600,
  1103  				},
  1104  				ExecutionTimeout: &durationpb.Duration{
  1105  					Seconds: 4800,
  1106  				},
  1107  				GracePeriod: &durationpb.Duration{
  1108  					Seconds: 60,
  1109  				},
  1110  				Builder: &pb.BuilderID{
  1111  					Project: "proj",
  1112  					Bucket:  "bucket",
  1113  					Builder: "builder",
  1114  				},
  1115  			},
  1116  		}
  1117  		inf := &model.BuildInfra{
  1118  			ID:    1,
  1119  			Build: datastore.KeyForObj(ctx, &model.Build{ID: 123}),
  1120  			Proto: &pb.BuildInfra{
  1121  				Swarming: &pb.BuildInfra_Swarming{
  1122  					Hostname: "swarm",
  1123  					TaskId:   "task123",
  1124  					Caches: []*pb.BuildInfra_Swarming_CacheEntry{
  1125  						{Name: "shared_builder_cache", Path: "builder", WaitForWarmCache: &durationpb.Duration{Seconds: 60}},
  1126  						{Name: "second_cache", Path: "second", WaitForWarmCache: &durationpb.Duration{Seconds: 360}},
  1127  					},
  1128  					TaskDimensions: []*pb.RequestedDimension{
  1129  						{Key: "a", Value: "1", Expiration: &durationpb.Duration{Seconds: 120}},
  1130  						{Key: "a", Value: "2", Expiration: &durationpb.Duration{Seconds: 120}},
  1131  						{Key: "pool", Value: "Chrome"},
  1132  					},
  1133  				},
  1134  				Bbagent: &pb.BuildInfra_BBAgent{
  1135  					CacheDir:    "cache",
  1136  					PayloadPath: "kitchen-checkout",
  1137  				},
  1138  				Buildbucket: &pb.BuildInfra_Buildbucket{
  1139  					Agent: &pb.BuildInfra_Buildbucket_Agent{
  1140  						Source: &pb.BuildInfra_Buildbucket_Agent_Source{
  1141  							DataType: &pb.BuildInfra_Buildbucket_Agent_Source_Cipd{
  1142  								Cipd: &pb.BuildInfra_Buildbucket_Agent_Source_CIPD{
  1143  									Package: "infra/tools/luci/bbagent/${platform}",
  1144  									Version: "canary-version",
  1145  									Server:  "cipd server",
  1146  								},
  1147  							},
  1148  						},
  1149  					},
  1150  				},
  1151  			},
  1152  		}
  1153  		bs := &model.BuildStatus{
  1154  			Build:  datastore.KeyForObj(ctx, b),
  1155  			Status: b.Proto.Status,
  1156  		}
  1157  		So(datastore.Put(ctx, b, inf, bs), ShouldBeNil)
  1158  
  1159  		Convey("bad msg data", func() {
  1160  			body := makeSwarmingPubsubMsg(&userdata{
  1161  				BuildID:          999,
  1162  				CreatedTS:        1448841600000,
  1163  				SwarmingHostname: "swarm",
  1164  			}, "", "msg1")
  1165  			err := SubNotify(ctx, body)
  1166  			So(err, ShouldErrLike, "task_id not found in message data")
  1167  			So(transient.Tag.In(err), ShouldBeFalse)
  1168  
  1169  			body = makeSwarmingPubsubMsg(&userdata{
  1170  				CreatedTS:        1448841600000,
  1171  				SwarmingHostname: "swarm",
  1172  			}, "task123", "msg1")
  1173  			err = SubNotify(ctx, body)
  1174  			So(err, ShouldErrLike, "invalid build_id 0")
  1175  
  1176  			body = makeSwarmingPubsubMsg(&userdata{
  1177  				BuildID:          999,
  1178  				SwarmingHostname: "swarm",
  1179  			}, "task123", "msg1")
  1180  			err = SubNotify(ctx, body)
  1181  			So(err, ShouldErrLike, "invalid created_ts 0")
  1182  
  1183  			body = makeSwarmingPubsubMsg(&userdata{
  1184  				BuildID:          999,
  1185  				CreatedTS:        1448841600000,
  1186  				SwarmingHostname: " ",
  1187  			}, "task123", "msg1")
  1188  			err = SubNotify(ctx, body)
  1189  			So(err, ShouldErrLike, "swarming hostname not found in userdata")
  1190  
  1191  			body = makeSwarmingPubsubMsg(&userdata{
  1192  				BuildID:          999,
  1193  				CreatedTS:        1448841600000,
  1194  				SwarmingHostname: "https://swarm.com",
  1195  			}, "task123", "msg1")
  1196  			err = SubNotify(ctx, body)
  1197  			So(err, ShouldErrLike, "swarming hostname https://swarm.com must not contain '://'")
  1198  		})
  1199  
  1200  		Convey("build not found", func() {
  1201  			old := now.Add(-time.Minute).UnixNano() / int64(time.Microsecond)
  1202  			body := makeSwarmingPubsubMsg(&userdata{
  1203  				BuildID:          999,
  1204  				CreatedTS:        old,
  1205  				SwarmingHostname: "swarm",
  1206  			}, "task123", "msg1")
  1207  			err := SubNotify(ctx, body)
  1208  			So(err, ShouldErrLike, "Build 999 or BuildInfra for task https://swarm/task?id=task123 not found")
  1209  			So(transient.Tag.In(err), ShouldBeFalse)
  1210  
  1211  			recent := now.Add(-50*time.Second).UnixNano() / int64(time.Microsecond)
  1212  			body = makeSwarmingPubsubMsg(&userdata{
  1213  				BuildID:          999,
  1214  				CreatedTS:        recent,
  1215  				SwarmingHostname: "swarm",
  1216  			}, "task123", "msg1")
  1217  			err = SubNotify(ctx, body)
  1218  			So(err, ShouldErrLike, "Build 999 or BuildInfra for task https://swarm/task?id=task123 not found yet")
  1219  			So(transient.Tag.In(err), ShouldBeTrue)
  1220  		})
  1221  
  1222  		Convey("different swarming hostname", func() {
  1223  
  1224  			body := makeSwarmingPubsubMsg(&userdata{
  1225  				BuildID:          123,
  1226  				CreatedTS:        1517260502000000,
  1227  				SwarmingHostname: "swarm2",
  1228  			}, "task123", "msg1")
  1229  			err := SubNotify(ctx, body)
  1230  			So(err, ShouldErrLike, "swarming_hostname swarm of build 123 does not match swarm2")
  1231  			So(transient.Tag.In(err), ShouldBeFalse)
  1232  		})
  1233  
  1234  		Convey("different task id", func() {
  1235  			body := makeSwarmingPubsubMsg(&userdata{
  1236  				BuildID:          123,
  1237  				CreatedTS:        1517260502000000,
  1238  				SwarmingHostname: "swarm",
  1239  			}, "task345", "msg1")
  1240  			err := SubNotify(ctx, body)
  1241  			So(err, ShouldErrLike, "swarming_task_id task123 of build 123 does not match task345")
  1242  			So(transient.Tag.In(err), ShouldBeFalse)
  1243  		})
  1244  
  1245  		Convey("swarming 500s error", func() {
  1246  			body := makeSwarmingPubsubMsg(&userdata{
  1247  				BuildID:          123,
  1248  				CreatedTS:        1517260502000000,
  1249  				SwarmingHostname: "swarm",
  1250  			}, "task123", "msg1")
  1251  			mockSwarm.EXPECT().GetTaskResult(ctx, "task123").Return(nil, &googleapi.Error{Code: 500, Message: "swarming internal error"})
  1252  			err := SubNotify(ctx, body)
  1253  			So(err, ShouldErrLike, "googleapi: Error 500: swarming internal error")
  1254  			So(transient.Tag.In(err), ShouldBeTrue)
  1255  
  1256  			cache := caching.GlobalCache(ctx, "swarming-pubsub-msg-id")
  1257  			_, err = cache.Get(ctx, "msg1")
  1258  			So(err, ShouldEqual, caching.ErrCacheMiss)
  1259  		})
  1260  
  1261  		Convey("status already ended", func() {
  1262  			b.Proto.Status = pb.Status_SUCCESS
  1263  			So(datastore.Put(ctx, b), ShouldBeNil)
  1264  
  1265  			body := makeSwarmingPubsubMsg(&userdata{
  1266  				BuildID:          123,
  1267  				CreatedTS:        1517260502000000,
  1268  				SwarmingHostname: "swarm",
  1269  			}, "task123", "msg1")
  1270  			err := SubNotify(ctx, body)
  1271  			So(err, ShouldBeNil)
  1272  			mockSwarm.EXPECT().CreateTask(gomock.Any(), gomock.Any()).Times(0)
  1273  
  1274  			So(sch.Tasks(), ShouldHaveLength, 0)
  1275  		})
  1276  
  1277  		Convey("status changed to success", func() {
  1278  			body := makeSwarmingPubsubMsg(&userdata{
  1279  				BuildID:          123,
  1280  				CreatedTS:        1517260502000000,
  1281  				SwarmingHostname: "swarm",
  1282  			}, "task123", "msg1")
  1283  			mockSwarm.EXPECT().GetTaskResult(ctx, "task123").Return(&apipb.TaskResultResponse{
  1284  				State:       apipb.TaskState_COMPLETED,
  1285  				StartedTs:   &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000},
  1286  				CompletedTs: &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000},
  1287  				BotDimensions: []*apipb.StringListPair{
  1288  					{
  1289  						Key:   "new_key",
  1290  						Value: []string{"new_val"},
  1291  					},
  1292  				},
  1293  			}, nil)
  1294  			err := SubNotify(ctx, body)
  1295  			So(err, ShouldBeNil)
  1296  			syncedBuild := &model.Build{ID: 123}
  1297  			So(datastore.Get(ctx, syncedBuild), ShouldBeNil)
  1298  			So(syncedBuild.Status, ShouldEqual, pb.Status_SUCCESS)
  1299  			So(syncedBuild.Proto.StartTime, ShouldResembleProto, &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000})
  1300  			So(syncedBuild.Proto.EndTime, ShouldResembleProto, &timestamppb.Timestamp{Seconds: 1517271318, Nanos: 162860000})
  1301  
  1302  			syncedInfra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, syncedBuild)}
  1303  			So(datastore.Get(ctx, syncedInfra), ShouldBeNil)
  1304  			So(syncedInfra.Proto.Swarming.BotDimensions, ShouldResembleProto, []*pb.StringPair{
  1305  				{
  1306  					Key:   "new_key",
  1307  					Value: "new_val",
  1308  				},
  1309  			})
  1310  			syncedBuildStatus := &model.BuildStatus{Build: datastore.KeyForObj(ctx, syncedBuild)}
  1311  			So(datastore.Get(ctx, syncedBuildStatus), ShouldBeNil)
  1312  			So(syncedBuildStatus.Status, ShouldEqual, pb.Status_SUCCESS)
  1313  
  1314  			cache := caching.GlobalCache(ctx, "swarming-pubsub-msg-id")
  1315  			cached, err := cache.Get(ctx, "msg1")
  1316  			So(err, ShouldBeNil)
  1317  			So(cached, ShouldResemble, []byte{1})
  1318  
  1319  			// FinalizeResultDB, ExportBigQuery, NotifyPubSub, NotifyPubSubGoProxy tasks.
  1320  			So(sch.Tasks(), ShouldHaveLength, 4)
  1321  
  1322  			// BuildCompleted metric should be set to 1 with SUCCESS.
  1323  			v2fs := []any{pb.Status_name[int32(syncedBuild.Status)], "None"}
  1324  			So(store.Get(ctx, metrics.V2.BuildCountCompleted, time.Time{}, v2fs), ShouldEqual, 1)
  1325  		})
  1326  
  1327  		Convey("status unchanged(in STARTED) while bot dimensions changed", func() {
  1328  			b.Proto.Status = pb.Status_STARTED
  1329  			bs.Status = b.Proto.Status
  1330  			So(datastore.Put(ctx, b, bs), ShouldBeNil)
  1331  			body := makeSwarmingPubsubMsg(&userdata{
  1332  				BuildID:          123,
  1333  				CreatedTS:        1517260502000000,
  1334  				SwarmingHostname: "swarm",
  1335  			}, "task123", "msg1")
  1336  			mockSwarm.EXPECT().GetTaskResult(ctx, "task123").Return(&apipb.TaskResultResponse{
  1337  				State:     apipb.TaskState_RUNNING,
  1338  				StartedTs: &timestamppb.Timestamp{Seconds: 1517260502, Nanos: 649750000},
  1339  				BotDimensions: []*apipb.StringListPair{
  1340  					{
  1341  						Key:   "new_key",
  1342  						Value: []string{"new_val"},
  1343  					},
  1344  				},
  1345  			}, nil)
  1346  			err := SubNotify(ctx, body)
  1347  			So(err, ShouldBeNil)
  1348  			syncedBuild := &model.Build{ID: 123}
  1349  			So(datastore.Get(ctx, syncedBuild), ShouldBeNil)
  1350  			So(syncedBuild.Status, ShouldEqual, pb.Status_STARTED)
  1351  
  1352  			syncedInfra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, syncedBuild)}
  1353  			So(datastore.Get(ctx, syncedInfra), ShouldBeNil)
  1354  			So(syncedInfra.Proto.Swarming.BotDimensions, ShouldResembleProto, []*pb.StringPair{{
  1355  				Key:   "new_key",
  1356  				Value: "new_val",
  1357  			}})
  1358  			syncedBuildStatus := &model.BuildStatus{Build: datastore.KeyForObj(ctx, syncedBuild)}
  1359  			So(datastore.Get(ctx, syncedBuildStatus), ShouldBeNil)
  1360  			So(syncedBuildStatus.Status, ShouldEqual, pb.Status_STARTED)
  1361  
  1362  			So(sch.Tasks(), ShouldHaveLength, 0)
  1363  		})
  1364  
  1365  		Convey("status unchanged(not in STARTED) while bot dimensions changed", func() {
  1366  			b.Proto.Status = pb.Status_STARTED
  1367  			bs.Status = b.Proto.Status
  1368  			So(datastore.Put(ctx, b, bs), ShouldBeNil)
  1369  			body := makeSwarmingPubsubMsg(&userdata{
  1370  				BuildID:          123,
  1371  				CreatedTS:        1517260502000000,
  1372  				SwarmingHostname: "swarm",
  1373  			}, "task123", "msg1")
  1374  			mockSwarm.EXPECT().GetTaskResult(ctx, "task123").Return(&apipb.TaskResultResponse{
  1375  				State: apipb.TaskState_PENDING,
  1376  				BotDimensions: []*apipb.StringListPair{
  1377  					{
  1378  						Key:   "new_key",
  1379  						Value: []string{"new_val"},
  1380  					},
  1381  				},
  1382  			}, nil)
  1383  			err := SubNotify(ctx, body)
  1384  			So(err, ShouldBeNil)
  1385  			syncedBuild := &model.Build{ID: 123}
  1386  			So(datastore.Get(ctx, syncedBuild), ShouldBeNil)
  1387  			So(syncedBuild.Status, ShouldEqual, pb.Status_STARTED)
  1388  
  1389  			currentInfra := &model.BuildInfra{Build: datastore.KeyForObj(ctx, syncedBuild)}
  1390  			So(datastore.Get(ctx, currentInfra), ShouldBeNil)
  1391  			So(currentInfra.Proto.Swarming.BotDimensions, ShouldBeEmpty)
  1392  
  1393  			syncedBuildStatus := &model.BuildStatus{Build: datastore.KeyForObj(ctx, syncedBuild)}
  1394  			So(datastore.Get(ctx, syncedBuildStatus), ShouldBeNil)
  1395  			So(syncedBuildStatus.Status, ShouldEqual, pb.Status_STARTED)
  1396  
  1397  			So(sch.Tasks(), ShouldHaveLength, 0)
  1398  		})
  1399  
  1400  		Convey("duplicate message", func() {
  1401  			cache := caching.GlobalCache(ctx, "swarming-pubsub-msg-id")
  1402  			err := cache.Set(ctx, "msg123", []byte{1}, 0*time.Second)
  1403  			So(err, ShouldBeNil)
  1404  
  1405  			body := makeSwarmingPubsubMsg(&userdata{
  1406  				BuildID:          123,
  1407  				CreatedTS:        1517260502000000,
  1408  				SwarmingHostname: "swarm",
  1409  			}, "task123", "msg123")
  1410  			mockSwarm.EXPECT().GetTaskResult(ctx, "task_id").Times(0)
  1411  			err = SubNotify(ctx, body)
  1412  			So(err, ShouldBeNil)
  1413  		})
  1414  	})
  1415  }
  1416  
  1417  func makeSwarmingPubsubMsg(userdata *userdata, taskID string, msgID string) io.Reader {
  1418  	ud, _ := json.Marshal(userdata)
  1419  	data := struct {
  1420  		TaskID   string `json:"task_id"`
  1421  		Userdata string `json:"userdata"`
  1422  	}{TaskID: taskID, Userdata: string(ud)}
  1423  	bd, _ := json.Marshal(data)
  1424  	msg := struct {
  1425  		Message struct {
  1426  			Data      string
  1427  			MessageID string
  1428  		}
  1429  	}{struct {
  1430  		Data      string
  1431  		MessageID string
  1432  	}{Data: base64.StdEncoding.EncodeToString(bd), MessageID: msgID}}
  1433  	jmsg, _ := json.Marshal(msg)
  1434  	return bytes.NewReader(jmsg)
  1435  }
  1436  
  1437  type expectedBuildFields struct {
  1438  	status               pb.Status
  1439  	startT               *timestamppb.Timestamp
  1440  	endT                 *timestamppb.Timestamp
  1441  	isTimeOut            bool
  1442  	isResourceExhaustion bool
  1443  	botDimensions        []*pb.StringPair
  1444  }