go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/rpc/start_build.go (about)

     1  // Copyright 2023 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package rpc
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"strings"
    21  
    22  	"google.golang.org/grpc/codes"
    23  
    24  	"go.chromium.org/luci/common/clock"
    25  	"go.chromium.org/luci/common/errors"
    26  	"go.chromium.org/luci/common/logging"
    27  	"go.chromium.org/luci/common/proto/protowalk"
    28  	"go.chromium.org/luci/gae/service/datastore"
    29  	"go.chromium.org/luci/grpc/appstatus"
    30  
    31  	"go.chromium.org/luci/buildbucket"
    32  	"go.chromium.org/luci/buildbucket/appengine/common"
    33  	"go.chromium.org/luci/buildbucket/appengine/internal/buildstatus"
    34  	"go.chromium.org/luci/buildbucket/appengine/internal/buildtoken"
    35  	"go.chromium.org/luci/buildbucket/appengine/internal/metrics"
    36  	"go.chromium.org/luci/buildbucket/appengine/model"
    37  	"go.chromium.org/luci/buildbucket/appengine/tasks"
    38  	pb "go.chromium.org/luci/buildbucket/proto"
    39  	"go.chromium.org/luci/buildbucket/protoutil"
    40  )
    41  
    42  func validateStartBuildRequest(ctx context.Context, req *pb.StartBuildRequest) error {
    43  	if procRes := protowalk.Fields(req, &protowalk.RequiredProcessor{}); procRes != nil {
    44  		if resStrs := procRes.Strings(); len(resStrs) > 0 {
    45  			logging.Infof(ctx, strings.Join(resStrs, ". "))
    46  		}
    47  		return procRes.Err()
    48  	}
    49  	return nil
    50  }
    51  
    52  // startBuildOnSwarming starts a build if it runs on swarming.
    53  //
    54  // For builds on Swarming, the swarming task id and update_token have been
    55  // saved in datastore during task creation.
    56  func startBuildOnSwarming(ctx context.Context, req *pb.StartBuildRequest, tok string) (*model.Build, bool, error) {
    57  	var b *model.Build
    58  	buildStatusChanged := false
    59  	txErr := datastore.RunInTransaction(ctx, func(ctx context.Context) error {
    60  		entities, err := common.GetBuildEntities(ctx, req.BuildId, model.BuildKind, model.BuildInfraKind)
    61  		if err != nil {
    62  			return errors.Annotate(err, "failed to get build %d", req.BuildId).Err()
    63  		}
    64  		b = entities[0].(*model.Build)
    65  		infra := entities[1].(*model.BuildInfra)
    66  
    67  		if infra.Proto.GetSwarming() == nil {
    68  			return appstatus.Errorf(codes.Internal, "the build %d does not run on swarming", req.BuildId)
    69  		}
    70  
    71  		// First StartBuild request.
    72  		if b.StartBuildRequestID == "" {
    73  			if infra.Proto.Swarming.TaskId != req.TaskId && infra.Proto.Swarming.TaskId != "" {
    74  				// Duplicated task.
    75  				return buildbucket.DuplicateTask.Apply(appstatus.Errorf(codes.AlreadyExists, "build %d has associated with task %q", req.BuildId, infra.Proto.Swarming.TaskId))
    76  			}
    77  
    78  			if protoutil.IsEnded(b.Status) {
    79  				// The build has ended.
    80  				// For example the StartBuild request reaches Buildbucket late, when the task
    81  				// has crashed (e.g. BOT_DIED).
    82  				return appstatus.Errorf(codes.FailedPrecondition, "cannot start ended build %d", b.ID)
    83  			}
    84  
    85  			var toPut []any
    86  			if infra.Proto.Swarming.TaskId == "" {
    87  				// In rare cases that a StartBuild request for a build can reach Buildbucket
    88  				// before tasks.CreateSwarmingBuildTask: e.g.
    89  				// 1. CreateSwarmingBuildTask for a build succeeds to create a
    90  				//    swarming task for the build, but it fails to save the task to datastore,
    91  				//    then this task needs to retry
    92  				// 2. in the meantime swarming starts to run the task created in 1, sends
    93  				//    a StartBuild request
    94  				// 3. after StartBuild, the retried CreateSwarmingBuildTask tries to
    95  				//    update datastore again with the same task id (because now creating task
    96  				//    is idempotent).
    97  				infra.Proto.Swarming.TaskId = req.TaskId
    98  				b.UpdateToken = tok
    99  				toPut = append(toPut, infra)
   100  			}
   101  
   102  			// Start the build.
   103  			b.StartBuildRequestID = req.RequestId
   104  			toPut = append(toPut, b)
   105  			if b.Proto.Output == nil {
   106  				b.Proto.Output = &pb.Build_Output{}
   107  			}
   108  			b.Proto.Output.Status = pb.Status_STARTED
   109  			statusUpdater := buildstatus.Updater{
   110  				Build:        b,
   111  				OutputStatus: &buildstatus.StatusWithDetails{Status: pb.Status_STARTED},
   112  				UpdateTime:   clock.Now(ctx),
   113  				PostProcess:  tasks.SendOnBuildStatusChange,
   114  			}
   115  			var bs *model.BuildStatus
   116  			bs, err = statusUpdater.Do(ctx)
   117  			if err != nil {
   118  				return appstatus.Errorf(codes.Internal, "failed to update status for build %d: %s", b.ID, err)
   119  			}
   120  			if bs != nil {
   121  				buildStatusChanged = true
   122  				toPut = append(toPut, bs)
   123  			}
   124  
   125  			if err = datastore.Put(ctx, toPut...); err != nil {
   126  				return appstatus.Errorf(codes.Internal, "failed to start build %d: %s", b.ID, err)
   127  			}
   128  			return nil
   129  		}
   130  		return checkSubsequentRequest(req, b.StartBuildRequestID, infra.Proto.Swarming.TaskId)
   131  	}, nil)
   132  	if txErr != nil {
   133  		return nil, false, txErr
   134  	}
   135  	return b, buildStatusChanged, nil
   136  }
   137  
   138  func startBuildOnBackendOnFirstReq(ctx context.Context, req *pb.StartBuildRequest, b *model.Build, infra *model.BuildInfra) (bool, error) {
   139  	taskID := infra.Proto.Backend.Task.GetId()
   140  	if taskID.GetId() != "" && taskID.GetId() != req.TaskId {
   141  		// The build has been associated with another task, possible from a previous
   142  		// RegisterBuildTask call from a different task.
   143  		return false, buildbucket.DuplicateTask.Apply(appstatus.Errorf(codes.AlreadyExists, "build %d has associated with task %q", req.BuildId, taskID.Id))
   144  	}
   145  
   146  	if protoutil.IsEnded(b.Status) {
   147  		// The build has ended.
   148  		// For example the StartBuild request reaches Buildbucket late, when the task
   149  		// has crashed (e.g. BOT_DIED).
   150  		return false, appstatus.Errorf(codes.FailedPrecondition, "cannot start ended build %d", b.ID)
   151  	}
   152  
   153  	if b.Status == pb.Status_STARTED {
   154  		// The build has started.
   155  		// Currently for builds on backend this should not happen.
   156  		return false, appstatus.Errorf(codes.FailedPrecondition, "cannot start started build %d", b.ID)
   157  	}
   158  
   159  	// Start the build.
   160  	toSave := []any{b}
   161  	b.StartBuildRequestID = req.RequestId
   162  	updateBuildToken, err := buildtoken.GenerateToken(ctx, b.ID, pb.TokenBody_BUILD)
   163  	if err != nil {
   164  		return false, errors.Annotate(err, "failed to generate BUILD token for build %d", b.ID).Err()
   165  	}
   166  	b.UpdateToken = updateBuildToken
   167  	if b.Proto.Output == nil {
   168  		b.Proto.Output = &pb.Build_Output{}
   169  	}
   170  	b.Proto.Output.Status = pb.Status_STARTED
   171  	statusUpdater := buildstatus.Updater{
   172  		Build:        b,
   173  		OutputStatus: &buildstatus.StatusWithDetails{Status: pb.Status_STARTED},
   174  		UpdateTime:   clock.Now(ctx),
   175  		PostProcess:  tasks.SendOnBuildStatusChange,
   176  	}
   177  	var bs *model.BuildStatus
   178  	bs, err = statusUpdater.Do(ctx)
   179  	if err != nil {
   180  		return false, appstatus.Errorf(codes.Internal, "failed to update status for build %d: %s", b.ID, err)
   181  	}
   182  	if bs != nil {
   183  		toSave = append(toSave, bs)
   184  	}
   185  
   186  	if taskID.GetId() == "" {
   187  		// First handshake, associate the task with the build.
   188  		taskID.Id = req.TaskId
   189  		toSave = append(toSave, infra)
   190  	}
   191  	err = datastore.Put(ctx, toSave)
   192  	if err != nil {
   193  		return false, errors.Annotate(err, "failed to start build %d: %s", b.ID, err).Err()
   194  	}
   195  
   196  	return true, nil
   197  }
   198  
   199  func startBuildOnBackend(ctx context.Context, req *pb.StartBuildRequest) (*model.Build, bool, error) {
   200  	var b *model.Build
   201  	buildStatusChanged := false
   202  	txErr := datastore.RunInTransaction(ctx, func(ctx context.Context) error {
   203  		entities, err := common.GetBuildEntities(ctx, req.BuildId, model.BuildKind, model.BuildInfraKind)
   204  		if err != nil {
   205  			return errors.Annotate(err, "failed to get build %d", req.BuildId).Err()
   206  		}
   207  		b = entities[0].(*model.Build)
   208  		infra := entities[1].(*model.BuildInfra)
   209  
   210  		if infra.Proto.GetBackend().GetTask() == nil {
   211  			return errors.Reason("the build %d does not run on task backend", req.BuildId).Err()
   212  		}
   213  
   214  		if b.StartBuildRequestID == "" {
   215  			// First StartBuild for the build.
   216  			buildStatusChanged, err = startBuildOnBackendOnFirstReq(ctx, req, b, infra)
   217  			return err
   218  		}
   219  
   220  		return checkSubsequentRequest(req, b.StartBuildRequestID, infra.Proto.Backend.Task.GetId().GetId())
   221  	}, nil)
   222  	if txErr != nil {
   223  		return nil, false, txErr
   224  	}
   225  	return b, buildStatusChanged, nil
   226  }
   227  
   228  func checkSubsequentRequest(req *pb.StartBuildRequest, savedReqID, savedTaskID string) error {
   229  	// Subsequent StartBuild request.
   230  	if savedReqID != req.RequestId {
   231  		// Different request id, deduplicate.
   232  		return buildbucket.DuplicateTask.Apply(appstatus.Errorf(codes.AlreadyExists, "build %d has recorded another StartBuild with request id %q", req.BuildId, savedReqID))
   233  	}
   234  
   235  	if savedTaskID != req.TaskId {
   236  		// Same request id, different task id.
   237  		return errors.Reason("build %d has associated with task id %q with StartBuild request id %q", req.BuildId, savedTaskID, savedReqID).Tag(buildbucket.TaskWithCollidedRequestID).Err()
   238  	}
   239  
   240  	// Idempotent
   241  	return nil
   242  }
   243  
   244  // StartBuild handles a request to start a build. Implements pb.BuildsServer.
   245  func (*Builds) StartBuild(ctx context.Context, req *pb.StartBuildRequest) (*pb.StartBuildResponse, error) {
   246  	if err := validateStartBuildRequest(ctx, req); err != nil {
   247  		return nil, appstatus.BadRequest(err)
   248  	}
   249  
   250  	var b *model.Build
   251  	var buildStatusChanged bool
   252  	var err error
   253  
   254  	// a token is required
   255  	rawToken, err := getBuildbucketToken(ctx, false)
   256  	if err != nil {
   257  		return nil, err
   258  	}
   259  
   260  	// token can either be BUILD or START_BUILD
   261  	tok, err := buildtoken.ParseToTokenBody(ctx, rawToken, req.BuildId, pb.TokenBody_START_BUILD, pb.TokenBody_BUILD)
   262  	if err != nil {
   263  		return nil, err
   264  	}
   265  
   266  	switch tok.Purpose {
   267  	case pb.TokenBody_BUILD:
   268  		b, buildStatusChanged, err = startBuildOnSwarming(ctx, req, rawToken)
   269  	case pb.TokenBody_START_BUILD:
   270  		b, buildStatusChanged, err = startBuildOnBackend(ctx, req)
   271  	default:
   272  		panic(fmt.Sprintf("impossible: invalid token purpose: %s", tok.Purpose))
   273  	}
   274  	if err != nil {
   275  		return nil, err
   276  	}
   277  
   278  	if buildStatusChanged {
   279  		// Update metrics.
   280  		logging.Infof(ctx, "Build %d: started", b.ID)
   281  		metrics.BuildStarted(ctx, b)
   282  	}
   283  
   284  	mask, err := model.NewBuildMask("", nil, &pb.BuildMask{AllFields: true})
   285  	if err != nil {
   286  		return nil, errors.Annotate(err, "failed to construct build mask").Err()
   287  	}
   288  	bp, err := b.ToProto(ctx, mask, nil)
   289  	if err != nil {
   290  		return nil, errors.Annotate(err, "failed to generate build proto from model").Err()
   291  	}
   292  
   293  	return &pb.StartBuildResponse{Build: bp, UpdateBuildToken: b.UpdateToken}, nil
   294  }