go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/rpc/start_build.go (about) 1 // Copyright 2023 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package rpc 16 17 import ( 18 "context" 19 "fmt" 20 "strings" 21 22 "google.golang.org/grpc/codes" 23 24 "go.chromium.org/luci/common/clock" 25 "go.chromium.org/luci/common/errors" 26 "go.chromium.org/luci/common/logging" 27 "go.chromium.org/luci/common/proto/protowalk" 28 "go.chromium.org/luci/gae/service/datastore" 29 "go.chromium.org/luci/grpc/appstatus" 30 31 "go.chromium.org/luci/buildbucket" 32 "go.chromium.org/luci/buildbucket/appengine/common" 33 "go.chromium.org/luci/buildbucket/appengine/internal/buildstatus" 34 "go.chromium.org/luci/buildbucket/appengine/internal/buildtoken" 35 "go.chromium.org/luci/buildbucket/appengine/internal/metrics" 36 "go.chromium.org/luci/buildbucket/appengine/model" 37 "go.chromium.org/luci/buildbucket/appengine/tasks" 38 pb "go.chromium.org/luci/buildbucket/proto" 39 "go.chromium.org/luci/buildbucket/protoutil" 40 ) 41 42 func validateStartBuildRequest(ctx context.Context, req *pb.StartBuildRequest) error { 43 if procRes := protowalk.Fields(req, &protowalk.RequiredProcessor{}); procRes != nil { 44 if resStrs := procRes.Strings(); len(resStrs) > 0 { 45 logging.Infof(ctx, strings.Join(resStrs, ". ")) 46 } 47 return procRes.Err() 48 } 49 return nil 50 } 51 52 // startBuildOnSwarming starts a build if it runs on swarming. 53 // 54 // For builds on Swarming, the swarming task id and update_token have been 55 // saved in datastore during task creation. 56 func startBuildOnSwarming(ctx context.Context, req *pb.StartBuildRequest, tok string) (*model.Build, bool, error) { 57 var b *model.Build 58 buildStatusChanged := false 59 txErr := datastore.RunInTransaction(ctx, func(ctx context.Context) error { 60 entities, err := common.GetBuildEntities(ctx, req.BuildId, model.BuildKind, model.BuildInfraKind) 61 if err != nil { 62 return errors.Annotate(err, "failed to get build %d", req.BuildId).Err() 63 } 64 b = entities[0].(*model.Build) 65 infra := entities[1].(*model.BuildInfra) 66 67 if infra.Proto.GetSwarming() == nil { 68 return appstatus.Errorf(codes.Internal, "the build %d does not run on swarming", req.BuildId) 69 } 70 71 // First StartBuild request. 72 if b.StartBuildRequestID == "" { 73 if infra.Proto.Swarming.TaskId != req.TaskId && infra.Proto.Swarming.TaskId != "" { 74 // Duplicated task. 75 return buildbucket.DuplicateTask.Apply(appstatus.Errorf(codes.AlreadyExists, "build %d has associated with task %q", req.BuildId, infra.Proto.Swarming.TaskId)) 76 } 77 78 if protoutil.IsEnded(b.Status) { 79 // The build has ended. 80 // For example the StartBuild request reaches Buildbucket late, when the task 81 // has crashed (e.g. BOT_DIED). 82 return appstatus.Errorf(codes.FailedPrecondition, "cannot start ended build %d", b.ID) 83 } 84 85 var toPut []any 86 if infra.Proto.Swarming.TaskId == "" { 87 // In rare cases that a StartBuild request for a build can reach Buildbucket 88 // before tasks.CreateSwarmingBuildTask: e.g. 89 // 1. CreateSwarmingBuildTask for a build succeeds to create a 90 // swarming task for the build, but it fails to save the task to datastore, 91 // then this task needs to retry 92 // 2. in the meantime swarming starts to run the task created in 1, sends 93 // a StartBuild request 94 // 3. after StartBuild, the retried CreateSwarmingBuildTask tries to 95 // update datastore again with the same task id (because now creating task 96 // is idempotent). 97 infra.Proto.Swarming.TaskId = req.TaskId 98 b.UpdateToken = tok 99 toPut = append(toPut, infra) 100 } 101 102 // Start the build. 103 b.StartBuildRequestID = req.RequestId 104 toPut = append(toPut, b) 105 if b.Proto.Output == nil { 106 b.Proto.Output = &pb.Build_Output{} 107 } 108 b.Proto.Output.Status = pb.Status_STARTED 109 statusUpdater := buildstatus.Updater{ 110 Build: b, 111 OutputStatus: &buildstatus.StatusWithDetails{Status: pb.Status_STARTED}, 112 UpdateTime: clock.Now(ctx), 113 PostProcess: tasks.SendOnBuildStatusChange, 114 } 115 var bs *model.BuildStatus 116 bs, err = statusUpdater.Do(ctx) 117 if err != nil { 118 return appstatus.Errorf(codes.Internal, "failed to update status for build %d: %s", b.ID, err) 119 } 120 if bs != nil { 121 buildStatusChanged = true 122 toPut = append(toPut, bs) 123 } 124 125 if err = datastore.Put(ctx, toPut...); err != nil { 126 return appstatus.Errorf(codes.Internal, "failed to start build %d: %s", b.ID, err) 127 } 128 return nil 129 } 130 return checkSubsequentRequest(req, b.StartBuildRequestID, infra.Proto.Swarming.TaskId) 131 }, nil) 132 if txErr != nil { 133 return nil, false, txErr 134 } 135 return b, buildStatusChanged, nil 136 } 137 138 func startBuildOnBackendOnFirstReq(ctx context.Context, req *pb.StartBuildRequest, b *model.Build, infra *model.BuildInfra) (bool, error) { 139 taskID := infra.Proto.Backend.Task.GetId() 140 if taskID.GetId() != "" && taskID.GetId() != req.TaskId { 141 // The build has been associated with another task, possible from a previous 142 // RegisterBuildTask call from a different task. 143 return false, buildbucket.DuplicateTask.Apply(appstatus.Errorf(codes.AlreadyExists, "build %d has associated with task %q", req.BuildId, taskID.Id)) 144 } 145 146 if protoutil.IsEnded(b.Status) { 147 // The build has ended. 148 // For example the StartBuild request reaches Buildbucket late, when the task 149 // has crashed (e.g. BOT_DIED). 150 return false, appstatus.Errorf(codes.FailedPrecondition, "cannot start ended build %d", b.ID) 151 } 152 153 if b.Status == pb.Status_STARTED { 154 // The build has started. 155 // Currently for builds on backend this should not happen. 156 return false, appstatus.Errorf(codes.FailedPrecondition, "cannot start started build %d", b.ID) 157 } 158 159 // Start the build. 160 toSave := []any{b} 161 b.StartBuildRequestID = req.RequestId 162 updateBuildToken, err := buildtoken.GenerateToken(ctx, b.ID, pb.TokenBody_BUILD) 163 if err != nil { 164 return false, errors.Annotate(err, "failed to generate BUILD token for build %d", b.ID).Err() 165 } 166 b.UpdateToken = updateBuildToken 167 if b.Proto.Output == nil { 168 b.Proto.Output = &pb.Build_Output{} 169 } 170 b.Proto.Output.Status = pb.Status_STARTED 171 statusUpdater := buildstatus.Updater{ 172 Build: b, 173 OutputStatus: &buildstatus.StatusWithDetails{Status: pb.Status_STARTED}, 174 UpdateTime: clock.Now(ctx), 175 PostProcess: tasks.SendOnBuildStatusChange, 176 } 177 var bs *model.BuildStatus 178 bs, err = statusUpdater.Do(ctx) 179 if err != nil { 180 return false, appstatus.Errorf(codes.Internal, "failed to update status for build %d: %s", b.ID, err) 181 } 182 if bs != nil { 183 toSave = append(toSave, bs) 184 } 185 186 if taskID.GetId() == "" { 187 // First handshake, associate the task with the build. 188 taskID.Id = req.TaskId 189 toSave = append(toSave, infra) 190 } 191 err = datastore.Put(ctx, toSave) 192 if err != nil { 193 return false, errors.Annotate(err, "failed to start build %d: %s", b.ID, err).Err() 194 } 195 196 return true, nil 197 } 198 199 func startBuildOnBackend(ctx context.Context, req *pb.StartBuildRequest) (*model.Build, bool, error) { 200 var b *model.Build 201 buildStatusChanged := false 202 txErr := datastore.RunInTransaction(ctx, func(ctx context.Context) error { 203 entities, err := common.GetBuildEntities(ctx, req.BuildId, model.BuildKind, model.BuildInfraKind) 204 if err != nil { 205 return errors.Annotate(err, "failed to get build %d", req.BuildId).Err() 206 } 207 b = entities[0].(*model.Build) 208 infra := entities[1].(*model.BuildInfra) 209 210 if infra.Proto.GetBackend().GetTask() == nil { 211 return errors.Reason("the build %d does not run on task backend", req.BuildId).Err() 212 } 213 214 if b.StartBuildRequestID == "" { 215 // First StartBuild for the build. 216 buildStatusChanged, err = startBuildOnBackendOnFirstReq(ctx, req, b, infra) 217 return err 218 } 219 220 return checkSubsequentRequest(req, b.StartBuildRequestID, infra.Proto.Backend.Task.GetId().GetId()) 221 }, nil) 222 if txErr != nil { 223 return nil, false, txErr 224 } 225 return b, buildStatusChanged, nil 226 } 227 228 func checkSubsequentRequest(req *pb.StartBuildRequest, savedReqID, savedTaskID string) error { 229 // Subsequent StartBuild request. 230 if savedReqID != req.RequestId { 231 // Different request id, deduplicate. 232 return buildbucket.DuplicateTask.Apply(appstatus.Errorf(codes.AlreadyExists, "build %d has recorded another StartBuild with request id %q", req.BuildId, savedReqID)) 233 } 234 235 if savedTaskID != req.TaskId { 236 // Same request id, different task id. 237 return errors.Reason("build %d has associated with task id %q with StartBuild request id %q", req.BuildId, savedTaskID, savedReqID).Tag(buildbucket.TaskWithCollidedRequestID).Err() 238 } 239 240 // Idempotent 241 return nil 242 } 243 244 // StartBuild handles a request to start a build. Implements pb.BuildsServer. 245 func (*Builds) StartBuild(ctx context.Context, req *pb.StartBuildRequest) (*pb.StartBuildResponse, error) { 246 if err := validateStartBuildRequest(ctx, req); err != nil { 247 return nil, appstatus.BadRequest(err) 248 } 249 250 var b *model.Build 251 var buildStatusChanged bool 252 var err error 253 254 // a token is required 255 rawToken, err := getBuildbucketToken(ctx, false) 256 if err != nil { 257 return nil, err 258 } 259 260 // token can either be BUILD or START_BUILD 261 tok, err := buildtoken.ParseToTokenBody(ctx, rawToken, req.BuildId, pb.TokenBody_START_BUILD, pb.TokenBody_BUILD) 262 if err != nil { 263 return nil, err 264 } 265 266 switch tok.Purpose { 267 case pb.TokenBody_BUILD: 268 b, buildStatusChanged, err = startBuildOnSwarming(ctx, req, rawToken) 269 case pb.TokenBody_START_BUILD: 270 b, buildStatusChanged, err = startBuildOnBackend(ctx, req) 271 default: 272 panic(fmt.Sprintf("impossible: invalid token purpose: %s", tok.Purpose)) 273 } 274 if err != nil { 275 return nil, err 276 } 277 278 if buildStatusChanged { 279 // Update metrics. 280 logging.Infof(ctx, "Build %d: started", b.ID) 281 metrics.BuildStarted(ctx, b) 282 } 283 284 mask, err := model.NewBuildMask("", nil, &pb.BuildMask{AllFields: true}) 285 if err != nil { 286 return nil, errors.Annotate(err, "failed to construct build mask").Err() 287 } 288 bp, err := b.ToProto(ctx, mask, nil) 289 if err != nil { 290 return nil, errors.Annotate(err, "failed to generate build proto from model").Err() 291 } 292 293 return &pb.StartBuildResponse{Build: bp, UpdateBuildToken: b.UpdateToken}, nil 294 }