go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/tasks/check_build_liveness.go (about) 1 // Copyright 2023 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tasks 16 17 import ( 18 "context" 19 "time" 20 21 "google.golang.org/protobuf/types/known/timestamppb" 22 23 "go.chromium.org/luci/common/clock" 24 "go.chromium.org/luci/common/errors" 25 "go.chromium.org/luci/common/logging" 26 "go.chromium.org/luci/common/retry/transient" 27 "go.chromium.org/luci/gae/service/datastore" 28 29 "go.chromium.org/luci/buildbucket/appengine/common" 30 "go.chromium.org/luci/buildbucket/appengine/internal/buildstatus" 31 "go.chromium.org/luci/buildbucket/appengine/internal/metrics" 32 "go.chromium.org/luci/buildbucket/appengine/model" 33 pb "go.chromium.org/luci/buildbucket/proto" 34 "go.chromium.org/luci/buildbucket/protoutil" 35 ) 36 37 // CheckLiveness is to check if the given build has received any updates during 38 // the timeout period. 39 func CheckLiveness(ctx context.Context, buildID int64, heartbeatTimeout uint32) error { 40 bld, err := common.GetBuild(ctx, buildID) 41 if err != nil { 42 return errors.Annotate(err, "failed to get build %d", buildID).Err() 43 } 44 45 if protoutil.IsEnded(bld.Status) { 46 // No need to check for an ended build. 47 return nil 48 } 49 // Enqueue a continuation CheckBuildLiveness task 50 if !isTimeout(ctx, bld, heartbeatTimeout) { 51 // lefted excution timeout. 52 delay := bld.Proto.ExecutionTimeout.AsDuration() - (clock.Now(ctx).Sub(bld.Proto.StartTime.AsTime())) 53 if heartbeatTimeout > 0 && uint32(delay.Seconds()) > heartbeatTimeout { 54 delay = time.Duration(heartbeatTimeout) * time.Second 55 } 56 return transient.Tag.Apply(CheckBuildLiveness(ctx, buildID, heartbeatTimeout, delay)) 57 } 58 59 // Time out. Should mark the build as INFRA_FAILURE. 60 enqueueTask := false 61 txnErr := datastore.RunInTransaction(ctx, func(ctx context.Context) error { 62 // Fetch and check the build again before failing it. In case the build is 63 // changed during the short time window from the first check to now. 64 // 65 // Fetch build steps as well. Ignore any ErrNoSuchEntity error. 66 // `step.CancelIncomplete` will return false if no steps and steps update 67 // will be skipped. 68 steps := &model.BuildSteps{Build: datastore.KeyForObj(ctx, bld)} 69 if err := model.GetIgnoreMissing(ctx, bld, steps); err != nil { 70 return err 71 } 72 73 if !isTimeout(ctx, bld, heartbeatTimeout) { 74 if !protoutil.IsEnded(bld.Status) { 75 enqueueTask = true 76 } 77 return nil 78 } 79 80 oldStatus := bld.Proto.Status 81 now := clock.Now(ctx) 82 statusUpdater := buildstatus.Updater{ 83 Build: bld, 84 BuildStatus: &buildstatus.StatusWithDetails{Status: pb.Status_INFRA_FAILURE}, 85 UpdateTime: now, 86 PostProcess: SendOnBuildStatusChange, 87 } 88 bs, err := statusUpdater.Do(ctx) 89 if err != nil { 90 return errors.Annotate(err, "failed to update status").Err() 91 } 92 toSave := []any{bld, bs} 93 switch changed, err := steps.CancelIncomplete(ctx, timestamppb.New(now)); { 94 case err != nil: 95 return errors.Annotate(err, "failed to cancel steps").Err() 96 case changed: 97 toSave = append(toSave, steps) 98 } 99 logging.Infof(ctx, "Build %d timed out, updating status: %s -> %s", bld.ID, oldStatus, bld.Proto.Status) 100 return datastore.Put(ctx, toSave) 101 }, nil) 102 if txnErr != nil { 103 return transient.Tag.Apply(errors.Annotate(txnErr, "failed to fail the build %d", buildID).Err()) 104 } 105 106 if enqueueTask { 107 // Enqueue a continuation Task 108 return transient.Tag.Apply(CheckBuildLiveness(ctx, buildID, heartbeatTimeout, time.Duration(heartbeatTimeout)*time.Second)) 109 } 110 metrics.BuildCompleted(ctx, bld) 111 return nil 112 } 113 114 // isTimeout checks if the build exceeds the scheduling timeout, execution 115 // timeout or heartbeat timeout after the last touch. 116 func isTimeout(ctx context.Context, bld *model.Build, heartbeatTimeout uint32) bool { 117 now := clock.Now(ctx) 118 switch bld.Proto.Status { 119 case pb.Status_SCHEDULED: 120 if now.Sub(bld.Proto.CreateTime.AsTime()) >= bld.Proto.SchedulingTimeout.AsDuration() { 121 return true 122 } 123 case pb.Status_STARTED: 124 if now.Sub(bld.Proto.StartTime.AsTime()) >= bld.Proto.ExecutionTimeout.AsDuration() { 125 return true 126 } 127 if heartbeatTimeout != 0 && now.Sub(bld.Proto.UpdateTime.AsTime()) >= time.Duration(heartbeatTimeout)*time.Second { 128 return true 129 } 130 } 131 return false 132 }