go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/run/impl/handler/poke.go (about)

     1  // Copyright 2021 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package handler
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"time"
    21  
    22  	"go.chromium.org/luci/gae/service/datastore"
    23  
    24  	"go.chromium.org/luci/common/clock"
    25  	"go.chromium.org/luci/common/errors"
    26  	"go.chromium.org/luci/common/logging"
    27  	"go.chromium.org/luci/common/sync/parallel"
    28  
    29  	"go.chromium.org/luci/cv/internal/changelist"
    30  	"go.chromium.org/luci/cv/internal/common"
    31  	"go.chromium.org/luci/cv/internal/configs/prjcfg"
    32  	"go.chromium.org/luci/cv/internal/run"
    33  	"go.chromium.org/luci/cv/internal/run/impl/state"
    34  	"go.chromium.org/luci/cv/internal/tryjob"
    35  )
    36  
    37  const (
    38  	treeCheckInterval          = time.Minute
    39  	clRefreshInterval          = 10 * time.Minute
    40  	tryjobRefreshInterval      = 150 * time.Second
    41  	treeStatusFailureTimeLimit = 10 * time.Minute
    42  )
    43  
    44  // Poke implements Handler interface.
    45  func (impl *Impl) Poke(ctx context.Context, rs *state.RunState) (*Result, error) {
    46  	if !run.IsEnded(rs.Status) && !rs.StartTime.IsZero() && clock.Since(ctx, rs.CreateTime) > 2*24*time.Hour {
    47  		logging.Warningf(ctx, "FIXME - crbug/40946713: run has been active for 2 days. This is likely caused by the referenced bug.")
    48  	}
    49  	if !run.IsEnded(rs.Status) && clock.Since(ctx, rs.CreateTime) > common.MaxRunTotalDuration {
    50  		return impl.Cancel(ctx, rs, []string{
    51  			fmt.Sprintf("max run duration of %s has reached", common.MaxRunTotalDuration),
    52  		})
    53  	}
    54  
    55  	rs = rs.ShallowCopy()
    56  	if shouldCheckTree(ctx, rs.Status, rs.Submission) {
    57  		rs.CloneSubmission()
    58  		switch open, err := rs.CheckTree(ctx, impl.TreeClient); {
    59  		case err != nil && clock.Since(ctx, rs.Submission.TreeErrorSince.AsTime()) > treeStatusFailureTimeLimit:
    60  			// The tree has been erroring for too long. Reset the triggers and
    61  			// fail the run.
    62  			cg, err := prjcfg.GetConfigGroup(ctx, rs.ID.LUCIProject(), rs.ConfigGroupID)
    63  			if err != nil {
    64  				return nil, err
    65  			}
    66  			rims := make(map[common.CLID]reviewInputMeta, len(rs.CLs))
    67  			whoms := rs.Mode.GerritNotifyTargets()
    68  			meta := reviewInputMeta{
    69  				notify: whoms,
    70  				// Add the same set of group/people to the attention set.
    71  				addToAttention: whoms,
    72  				reason:         submissionFailureAttentionReason,
    73  				message:        fmt.Sprintf(persistentTreeStatusAppFailureTemplate, cg.Content.GetVerifiers().GetTreeStatus().GetUrl()),
    74  			}
    75  			for _, cl := range rs.CLs {
    76  				if !rs.HasRootCL() || rs.RootCL == cl {
    77  					rims[cl] = meta
    78  				}
    79  			}
    80  			scheduleTriggersReset(ctx, rs, rims, run.Status_FAILED)
    81  			return &Result{
    82  				State: rs,
    83  			}, nil
    84  		case err != nil:
    85  			logging.Warningf(ctx, "tree status check failed with error: %s", err)
    86  			fallthrough
    87  		case !open:
    88  			if err := impl.RM.PokeAfter(ctx, rs.ID, treeCheckInterval); err != nil {
    89  				return nil, err
    90  			}
    91  		default:
    92  			return impl.OnReadyForSubmission(ctx, rs)
    93  		}
    94  	}
    95  
    96  	// If it's scheduled to be cancelled, skip the refresh.
    97  	// The long op might have been expired, but it should be removed at the end
    98  	// of this call first, and then the next Poke() will run this check again.
    99  	if !isCurrentlyResettingTriggers(rs) && shouldRefreshCLs(ctx, rs) {
   100  		cg, runCLs, cls, err := loadCLsAndConfig(ctx, rs, rs.CLs)
   101  		if err != nil {
   102  			return nil, err
   103  		}
   104  		switch ok, err := checkRunCreate(ctx, rs, cg, runCLs, cls); {
   105  		case err != nil:
   106  			return nil, err
   107  		case ok:
   108  			if err := impl.CLUpdater.ScheduleBatch(
   109  				ctx, rs.ID.LUCIProject(), cls,
   110  				changelist.UpdateCLTask_RUN_POKE); err != nil {
   111  				return nil, err
   112  			}
   113  			rs.LatestCLsRefresh = datastore.RoundTime(clock.Now(ctx).UTC())
   114  		}
   115  	}
   116  
   117  	if shouldRefreshTryjobs(ctx, rs) {
   118  		executions := rs.Tryjobs.GetState().GetExecutions()
   119  		errs := errors.NewLazyMultiError(len(executions))
   120  		poolErr := parallel.WorkPool(min(8, len(executions)), func(workCh chan<- func() error) {
   121  			for i, execution := range executions {
   122  				// Only care about the latest attempt with the assumption that all
   123  				// earlier attempt should have been ended already.
   124  				switch latestAttempt := tryjob.LatestAttempt(execution); {
   125  				case latestAttempt == nil:
   126  				case latestAttempt.GetExternalId() == "":
   127  				case latestAttempt.GetStatus() == tryjob.Status_TRIGGERED:
   128  					// Only update Tryjob if it has been triggered at the Tryjob backend.
   129  					i := i
   130  					workCh <- func() error {
   131  						errs.Assign(i, impl.TN.ScheduleUpdate(ctx,
   132  							common.TryjobID(latestAttempt.GetTryjobId()),
   133  							tryjob.ExternalID(latestAttempt.GetExternalId())))
   134  						return nil
   135  					}
   136  				}
   137  
   138  			}
   139  		})
   140  		switch {
   141  		case poolErr != nil:
   142  			panic(poolErr)
   143  		case errs.Get() != nil:
   144  			return nil, common.MostSevereError(errs.Get())
   145  		default:
   146  			rs.LatestTryjobsRefresh = datastore.RoundTime(clock.Now(ctx).UTC())
   147  		}
   148  	}
   149  
   150  	return impl.processExpiredLongOps(ctx, rs)
   151  }
   152  
   153  func shouldCheckTree(ctx context.Context, st run.Status, sub *run.Submission) bool {
   154  	switch {
   155  	case st != run.Status_WAITING_FOR_SUBMISSION:
   156  	case sub.GetLastTreeCheckTime() == nil:
   157  		return true
   158  	case !sub.GetTreeOpen():
   159  		return clock.Now(ctx).Sub(sub.GetLastTreeCheckTime().AsTime()) >= treeCheckInterval
   160  	}
   161  	return false
   162  }
   163  
   164  func shouldRefreshCLs(ctx context.Context, rs *state.RunState) bool {
   165  	return shouldRefresh(ctx, rs, rs.LatestCLsRefresh, clRefreshInterval)
   166  }
   167  
   168  func shouldRefreshTryjobs(ctx context.Context, rs *state.RunState) bool {
   169  	return shouldRefresh(ctx, rs, rs.LatestTryjobsRefresh, tryjobRefreshInterval)
   170  }
   171  
   172  func shouldRefresh(ctx context.Context, rs *state.RunState, last time.Time, interval time.Duration) bool {
   173  	switch {
   174  	case run.IsEnded(rs.Status):
   175  		return false
   176  	case last.IsZero():
   177  		last = rs.CreateTime
   178  		fallthrough
   179  	default:
   180  		return clock.Since(ctx, last) > interval
   181  	}
   182  }