go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/tasks/check_build_liveness.go (about)

     1  // Copyright 2023 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tasks
    16  
    17  import (
    18  	"context"
    19  	"time"
    20  
    21  	"google.golang.org/protobuf/types/known/timestamppb"
    22  
    23  	"go.chromium.org/luci/common/clock"
    24  	"go.chromium.org/luci/common/errors"
    25  	"go.chromium.org/luci/common/logging"
    26  	"go.chromium.org/luci/common/retry/transient"
    27  	"go.chromium.org/luci/gae/service/datastore"
    28  
    29  	"go.chromium.org/luci/buildbucket/appengine/common"
    30  	"go.chromium.org/luci/buildbucket/appengine/internal/buildstatus"
    31  	"go.chromium.org/luci/buildbucket/appengine/internal/metrics"
    32  	"go.chromium.org/luci/buildbucket/appengine/model"
    33  	pb "go.chromium.org/luci/buildbucket/proto"
    34  	"go.chromium.org/luci/buildbucket/protoutil"
    35  )
    36  
    37  // CheckLiveness is to check if the given build has received any updates during
    38  // the timeout period.
    39  func CheckLiveness(ctx context.Context, buildID int64, heartbeatTimeout uint32) error {
    40  	bld, err := common.GetBuild(ctx, buildID)
    41  	if err != nil {
    42  		return errors.Annotate(err, "failed to get build %d", buildID).Err()
    43  	}
    44  
    45  	if protoutil.IsEnded(bld.Status) {
    46  		// No need to check for an ended build.
    47  		return nil
    48  	}
    49  	// Enqueue a continuation CheckBuildLiveness task
    50  	if !isTimeout(ctx, bld, heartbeatTimeout) {
    51  		// lefted excution timeout.
    52  		delay := bld.Proto.ExecutionTimeout.AsDuration() - (clock.Now(ctx).Sub(bld.Proto.StartTime.AsTime()))
    53  		if heartbeatTimeout > 0 && uint32(delay.Seconds()) > heartbeatTimeout {
    54  			delay = time.Duration(heartbeatTimeout) * time.Second
    55  		}
    56  		return transient.Tag.Apply(CheckBuildLiveness(ctx, buildID, heartbeatTimeout, delay))
    57  	}
    58  
    59  	// Time out. Should mark the build as INFRA_FAILURE.
    60  	enqueueTask := false
    61  	txnErr := datastore.RunInTransaction(ctx, func(ctx context.Context) error {
    62  		// Fetch and check the build again before failing it. In case the build is
    63  		// changed during the short time window from the first check to now.
    64  		//
    65  		// Fetch build steps as well. Ignore any ErrNoSuchEntity error.
    66  		// `step.CancelIncomplete` will return false if no steps and steps update
    67  		//  will be skipped.
    68  		steps := &model.BuildSteps{Build: datastore.KeyForObj(ctx, bld)}
    69  		if err := model.GetIgnoreMissing(ctx, bld, steps); err != nil {
    70  			return err
    71  		}
    72  
    73  		if !isTimeout(ctx, bld, heartbeatTimeout) {
    74  			if !protoutil.IsEnded(bld.Status) {
    75  				enqueueTask = true
    76  			}
    77  			return nil
    78  		}
    79  
    80  		oldStatus := bld.Proto.Status
    81  		now := clock.Now(ctx)
    82  		statusUpdater := buildstatus.Updater{
    83  			Build:       bld,
    84  			BuildStatus: &buildstatus.StatusWithDetails{Status: pb.Status_INFRA_FAILURE},
    85  			UpdateTime:  now,
    86  			PostProcess: SendOnBuildStatusChange,
    87  		}
    88  		bs, err := statusUpdater.Do(ctx)
    89  		if err != nil {
    90  			return errors.Annotate(err, "failed to update status").Err()
    91  		}
    92  		toSave := []any{bld, bs}
    93  		switch changed, err := steps.CancelIncomplete(ctx, timestamppb.New(now)); {
    94  		case err != nil:
    95  			return errors.Annotate(err, "failed to cancel steps").Err()
    96  		case changed:
    97  			toSave = append(toSave, steps)
    98  		}
    99  		logging.Infof(ctx, "Build %d timed out, updating status: %s -> %s", bld.ID, oldStatus, bld.Proto.Status)
   100  		return datastore.Put(ctx, toSave)
   101  	}, nil)
   102  	if txnErr != nil {
   103  		return transient.Tag.Apply(errors.Annotate(txnErr, "failed to fail the build %d", buildID).Err())
   104  	}
   105  
   106  	if enqueueTask {
   107  		// Enqueue a continuation Task
   108  		return transient.Tag.Apply(CheckBuildLiveness(ctx, buildID, heartbeatTimeout, time.Duration(heartbeatTimeout)*time.Second))
   109  	}
   110  	metrics.BuildCompleted(ctx, bld)
   111  	return nil
   112  }
   113  
   114  // isTimeout checks if the build exceeds the scheduling timeout, execution
   115  // timeout or heartbeat timeout after the last touch.
   116  func isTimeout(ctx context.Context, bld *model.Build, heartbeatTimeout uint32) bool {
   117  	now := clock.Now(ctx)
   118  	switch bld.Proto.Status {
   119  	case pb.Status_SCHEDULED:
   120  		if now.Sub(bld.Proto.CreateTime.AsTime()) >= bld.Proto.SchedulingTimeout.AsDuration() {
   121  			return true
   122  		}
   123  	case pb.Status_STARTED:
   124  		if now.Sub(bld.Proto.StartTime.AsTime()) >= bld.Proto.ExecutionTimeout.AsDuration() {
   125  			return true
   126  		}
   127  		if heartbeatTimeout != 0 && now.Sub(bld.Proto.UpdateTime.AsTime()) >= time.Duration(heartbeatTimeout)*time.Second {
   128  			return true
   129  		}
   130  	}
   131  	return false
   132  }