go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/tasks/cancel.go (about)

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tasks
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"strings"
    21  	"time"
    22  
    23  	"golang.org/x/sync/errgroup"
    24  	"google.golang.org/protobuf/types/known/timestamppb"
    25  
    26  	"go.chromium.org/luci/auth/identity"
    27  	"go.chromium.org/luci/common/clock"
    28  	"go.chromium.org/luci/common/errors"
    29  	"go.chromium.org/luci/common/logging"
    30  	"go.chromium.org/luci/gae/service/datastore"
    31  	"go.chromium.org/luci/server/auth"
    32  	"go.chromium.org/luci/server/tq"
    33  
    34  	"go.chromium.org/luci/buildbucket"
    35  	"go.chromium.org/luci/buildbucket/appengine/internal/metrics"
    36  	"go.chromium.org/luci/buildbucket/appengine/internal/perm"
    37  	"go.chromium.org/luci/buildbucket/appengine/model"
    38  	taskdefs "go.chromium.org/luci/buildbucket/appengine/tasks/defs"
    39  	pb "go.chromium.org/luci/buildbucket/proto"
    40  	"go.chromium.org/luci/buildbucket/protoutil"
    41  )
    42  
    43  // StartCancel starts canceling a build and schedules the delayed task to finally cancel it
    44  func StartCancel(ctx context.Context, bID int64, summary string) (*model.Build, error) {
    45  	bld := &model.Build{ID: bID}
    46  	err := datastore.RunInTransaction(ctx, func(ctx context.Context) error {
    47  		switch err := datastore.Get(ctx, bld); {
    48  		case err == datastore.ErrNoSuchEntity:
    49  			return perm.NotFoundErr(ctx)
    50  		case err != nil:
    51  			return errors.Annotate(err, "failed to fetch build: %d", bld.ID).Err()
    52  		case protoutil.IsEnded(bld.Proto.Status):
    53  			return nil
    54  		case bld.Proto.CancelTime != nil:
    55  			return nil
    56  		}
    57  		now := timestamppb.New(clock.Now(ctx).UTC())
    58  		bld.Proto.CancelTime = now
    59  		bld.Proto.UpdateTime = now
    60  		bld.Proto.CancellationMarkdown = summary
    61  		canceledBy := "buildbucket"
    62  		if auth.CurrentIdentity(ctx) != identity.AnonymousIdentity {
    63  			canceledBy = string(auth.CurrentIdentity(ctx))
    64  		}
    65  
    66  		bld.Proto.CanceledBy = canceledBy
    67  		if err := datastore.Put(ctx, bld); err != nil {
    68  			return errors.Annotate(err, "failed to store build: %d", bld.ID).Err()
    69  		}
    70  		// Enqueue the task to finally cancel the build.
    71  		if err := ScheduleCancelBuildTask(ctx, bID, buildbucket.MinUpdateBuildInterval+bld.Proto.GracePeriod.AsDuration()); err != nil {
    72  			return errors.Annotate(err, "failed to enqueue cancel task for build: %d", bld.ID).Err()
    73  		}
    74  		return nil
    75  	}, nil)
    76  	if err != nil {
    77  		return bld, errors.Annotate(err, "failed to set the build to CANCELING: %d", bID).Err()
    78  	}
    79  
    80  	// TODO(crbug.com/1031205): alternatively, we could just map out the entire
    81  	// cancellation tree and then feed those build IDs into a pool to do bulk
    82  	// cancel. We could add a bool argument to StartCancel to control if the
    83  	// function should cancel the entire tree or just the build itself.
    84  	// Discussion: https://chromium-review.googlesource.com/c/infra/luci/luci-go/+/3402796/comments/8aba3108_b4ca9f76
    85  	if err := CancelChildren(ctx, bID); err != nil {
    86  		// Failures of canceling children should not block canceling parent.
    87  		logging.Debugf(ctx, "failed to cancel children of %d: %s", bID, err)
    88  	}
    89  
    90  	return bld, err
    91  }
    92  
    93  // CancelChildren cancels a build's children.
    94  // NOTE: This process is best-effort; Builds call UpdateBuild at a minimum
    95  // frequency and Buildbucket will inform them if they should start the cancel
    96  // process (by checking the parent build, if any).
    97  // So, even if this fails, the next UpdateBuild will catch it.
    98  func CancelChildren(ctx context.Context, bID int64) error {
    99  	// Look for the build's children to cancel.
   100  	children, err := childrenToCancel(ctx, bID)
   101  	if err != nil {
   102  		return err
   103  	}
   104  	if len(children) == 0 {
   105  		return nil
   106  	}
   107  
   108  	eg, ctx := errgroup.WithContext(ctx)
   109  	summary := fmt.Sprintf("cancel since the parent %d is canceled", bID)
   110  
   111  	for _, child := range children {
   112  		child := child
   113  		eg.Go(func() error {
   114  			_, err := StartCancel(ctx, child.ID, summary)
   115  			return err
   116  		})
   117  	}
   118  	return eg.Wait()
   119  }
   120  
   121  // childrenToCancel returns the child build ids that should be canceled with
   122  // the parent.
   123  func childrenToCancel(ctx context.Context, bID int64) (children []*model.Build, err error) {
   124  	q := datastore.NewQuery(model.BuildKind).Eq("parent_id", bID)
   125  	err = datastore.Run(ctx, q, func(bld *model.Build) error {
   126  		switch {
   127  		case protoutil.IsEnded(bld.Proto.Status):
   128  			return nil
   129  		case bld.Proto.CanOutliveParent:
   130  			return nil
   131  		default:
   132  			children = append(children, bld)
   133  			return nil
   134  		}
   135  	})
   136  	return
   137  }
   138  
   139  // Cancel actually cancels a build.
   140  func Cancel(ctx context.Context, bID int64) (*model.Build, error) {
   141  	bld := &model.Build{ID: bID}
   142  	canceled := false
   143  	err := datastore.RunInTransaction(ctx, func(ctx context.Context) error {
   144  		canceled = false // reset canceled in case of retries.
   145  		inf := &model.BuildInfra{Build: datastore.KeyForObj(ctx, bld)}
   146  		stp := &model.BuildSteps{Build: inf.Build}
   147  		bs := &model.BuildStatus{Build: inf.Build}
   148  
   149  		cancelSteps := true
   150  		if err := datastore.Get(ctx, bld, inf, stp, bs); err != nil {
   151  			switch merr, ok := err.(errors.MultiError); {
   152  			case !ok:
   153  				return errors.Annotate(err, "failed to fetch build: %d", bld.ID).Err()
   154  			case merr[0] == datastore.ErrNoSuchEntity:
   155  				return perm.NotFoundErr(ctx)
   156  			case merr[0] != nil:
   157  				return errors.Annotate(merr[0], "failed to fetch build: %d", bld.ID).Err()
   158  			case merr[1] != nil && merr[1] != datastore.ErrNoSuchEntity:
   159  				return errors.Annotate(merr[1], "failed to fetch build infra: %d", bld.ID).Err()
   160  			case merr[2] != nil && merr[2] != datastore.ErrNoSuchEntity:
   161  				return errors.Annotate(merr[2], "failed to fetch build steps: %d", bld.ID).Err()
   162  			case merr[3] != nil && merr[3] != datastore.ErrNoSuchEntity:
   163  				// TODO(crbug.com/1430324): also check ErrNoSuchEntity.
   164  				return errors.Annotate(merr[3], "failed to fetch build status: %d", bld.ID).Err()
   165  			case merr[2] == datastore.ErrNoSuchEntity:
   166  				cancelSteps = false
   167  			}
   168  		}
   169  		if protoutil.IsEnded(bld.Proto.Status) {
   170  			return nil
   171  		}
   172  
   173  		if sw := inf.Proto.GetSwarming(); sw.GetHostname() != "" && sw.TaskId != "" {
   174  			if err := CancelSwarmingTask(ctx, &taskdefs.CancelSwarmingTaskGo{
   175  				Hostname: sw.Hostname,
   176  				TaskId:   sw.TaskId,
   177  				Realm:    bld.Realm(),
   178  			}); err != nil {
   179  				return errors.Annotate(err, "failed to enqueue swarming task cancellation task: %d", bld.ID).Err()
   180  			}
   181  		}
   182  		if bk := inf.Proto.GetBackend(); bk.GetTask().GetId().GetId() != "" && bk.GetTask().GetId().GetTarget() != "" {
   183  			if err := CancelBackendTask(ctx, &taskdefs.CancelBackendTask{
   184  				Target:  bk.Task.Id.Target,
   185  				TaskId:  bk.Task.Id.Id,
   186  				Project: bld.Project,
   187  			}); err != nil {
   188  				return errors.Annotate(err, "failed to enqueue backend task cancelation task: %d", bld.ID).Err()
   189  			}
   190  		}
   191  		if rdb := inf.Proto.GetResultdb(); rdb.GetHostname() != "" && rdb.Invocation != "" {
   192  			if err := FinalizeResultDB(ctx, &taskdefs.FinalizeResultDBGo{
   193  				BuildId: bld.ID,
   194  			}); err != nil {
   195  				return errors.Annotate(err, "failed to enqueue resultdb finalization task: %d", bld.ID).Err()
   196  			}
   197  		}
   198  		if err := ExportBigQuery(ctx, bld.ID, strings.Contains(bld.ExperimentsString(), buildbucket.ExperimentBqExporterGo)); err != nil {
   199  			return errors.Annotate(err, "failed to enqueue bigquery export task: %d", bld.ID).Err()
   200  		}
   201  		if err := NotifyPubSub(ctx, bld); err != nil {
   202  			return errors.Annotate(err, "failed to enqueue pubsub notification task: %d", bld.ID).Err()
   203  		}
   204  
   205  		now := clock.Now(ctx).UTC()
   206  
   207  		bld.Leasee = nil
   208  		bld.LeaseExpirationDate = time.Time{}
   209  		bld.LeaseKey = 0
   210  
   211  		protoutil.SetStatus(now, bld.Proto, pb.Status_CANCELED)
   212  		logging.Debugf(ctx, fmt.Sprintf("Build %d status has now been set as canceled.", bld.ID))
   213  		canceled = true
   214  		toPut := []any{bld}
   215  
   216  		if bs.Status != pb.Status_STATUS_UNSPECIFIED {
   217  			bs.Status = pb.Status_CANCELED
   218  			toPut = append(toPut, bs)
   219  		}
   220  
   221  		if cancelSteps {
   222  			switch changed, err := stp.CancelIncomplete(ctx, timestamppb.New(now)); {
   223  			case err != nil:
   224  				return errors.Annotate(err, "failed to mark steps cancelled: %d", bld.ID).Err()
   225  			case changed:
   226  				toPut = append(toPut, stp)
   227  			}
   228  		}
   229  
   230  		if err := datastore.Put(ctx, toPut...); err != nil {
   231  			return errors.Annotate(err, "failed to store build: %d", bld.ID).Err()
   232  		}
   233  		return nil
   234  	}, nil)
   235  	if err != nil {
   236  		return nil, err
   237  	}
   238  	if protoutil.IsEnded(bld.Status) && canceled {
   239  		metrics.BuildCompleted(ctx, bld)
   240  	}
   241  
   242  	return bld, nil
   243  }
   244  
   245  // ScheduleCancelBuildTask enqueues a CancelBuildTask.
   246  func ScheduleCancelBuildTask(ctx context.Context, bID int64, delay time.Duration) error {
   247  	return tq.AddTask(ctx, &tq.Task{
   248  		Title: fmt.Sprintf("cancel-%d", bID),
   249  		Payload: &taskdefs.CancelBuildTask{
   250  			BuildId: bID,
   251  		},
   252  		Delay: delay,
   253  	})
   254  }