go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/internal/buildcron/expired.go (about)

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package buildcron
    16  
    17  import (
    18  	"context"
    19  	"strings"
    20  
    21  	"go.chromium.org/luci/common/clock"
    22  	"go.chromium.org/luci/common/errors"
    23  	"go.chromium.org/luci/common/logging"
    24  	"go.chromium.org/luci/common/sync/parallel"
    25  	"go.chromium.org/luci/gae/service/datastore"
    26  
    27  	"go.chromium.org/luci/buildbucket"
    28  	"go.chromium.org/luci/buildbucket/appengine/internal/metrics"
    29  	"go.chromium.org/luci/buildbucket/appengine/model"
    30  	"go.chromium.org/luci/buildbucket/appengine/tasks"
    31  	taskdefs "go.chromium.org/luci/buildbucket/appengine/tasks/defs"
    32  	pb "go.chromium.org/luci/buildbucket/proto"
    33  	"go.chromium.org/luci/buildbucket/protoutil"
    34  )
    35  
    36  func expireBuilds(ctx context.Context, bs []*model.Build, mr parallel.MultiRunner) error {
    37  	nOrig := len(bs)
    38  	if nOrig == 0 {
    39  		return nil
    40  	}
    41  
    42  	toUpdate := make([]*model.Build, 0, len(bs))
    43  	err := datastore.RunInTransaction(ctx, func(ctx context.Context) error {
    44  		// Clear the slice since it may contains value from the previous
    45  		// failed transaction.
    46  		toUpdate = toUpdate[:0]
    47  		if err := datastore.Get(ctx, bs); err != nil {
    48  			return err
    49  		}
    50  
    51  		now := clock.Now(ctx)
    52  		for _, b := range bs {
    53  			// skip updating, if it's no longer in a non-terminal status.
    54  			if protoutil.IsEnded(b.Proto.Status) {
    55  				continue
    56  			}
    57  
    58  			protoutil.SetStatus(now, b.Proto, pb.Status_INFRA_FAILURE)
    59  			if b.Proto.StatusDetails == nil {
    60  				b.Proto.StatusDetails = &pb.StatusDetails{}
    61  			}
    62  			b.Proto.StatusDetails.Timeout = &pb.StatusDetails_Timeout{}
    63  			b.ClearLease()
    64  			// TODO(crbug.com/1414540): A temporary code to mitigate the issue. Should
    65  			// delete it after the cron job is executed.
    66  			if b.Proto.Input.GetProperties() != nil {
    67  				b.Proto.Input.Properties = nil
    68  			}
    69  			toUpdate = append(toUpdate, b)
    70  		}
    71  
    72  		if len(toUpdate) == 0 {
    73  			return nil
    74  		}
    75  		return mr.RunMulti(func(workC chan<- func() error) {
    76  			for _, b := range toUpdate {
    77  				b := b
    78  				workC <- func() error { return tasks.NotifyPubSub(ctx, b) }
    79  				workC <- func() error {
    80  					return tasks.ExportBigQuery(ctx, b.ID, strings.Contains(b.ExperimentsString(), buildbucket.ExperimentBqExporterGo))
    81  				}
    82  				workC <- func() error {
    83  					return tasks.FinalizeResultDB(ctx, &taskdefs.FinalizeResultDBGo{BuildId: b.ID})
    84  				}
    85  			}
    86  			workC <- func() error { return datastore.Put(ctx, toUpdate) }
    87  			workC <- func() error {
    88  				return updateBuildStatuses(ctx, toUpdate, pb.Status_INFRA_FAILURE)
    89  			}
    90  		})
    91  	}, nil)
    92  
    93  	if err == nil {
    94  		logging.Infof(ctx, "Processed %d/%d expired builds", len(toUpdate), nOrig)
    95  		for _, b := range toUpdate {
    96  			logging.Infof(ctx, "Build %d: completed by cron(expire_builds) with status %q.",
    97  				b.ID, b.Status)
    98  			metrics.BuildCompleted(ctx, b)
    99  		}
   100  	}
   101  	return err
   102  }
   103  
   104  // TimeoutExpiredBuilds marks incomplete builds that were created longer than
   105  // model.BuildMaxCompletionTime w/ INFRA_FAILURE.
   106  func TimeoutExpiredBuilds(ctx context.Context) error {
   107  	// expireBuilds() updates 5 entities for each of the given builds within
   108  	// a single transaction, and a ds transaction can update at most
   109  	// 25 entities.
   110  	//
   111  	// Hence, this batchSize must be 5 or lower.
   112  	const batchSize = 25 / 5
   113  	// Processing each batch requires at most 5 goroutines.
   114  	// - 1 for ds.RunTransaction()
   115  	// - 4 for add tasks into TQ and ds.Put()
   116  	//
   117  	// Also, there is another goroutine for scanning expired builds.
   118  	// Hence, this can run at most 6 transactions in parallel.
   119  	const nWorkers = 32
   120  	q := datastore.NewQuery(model.BuildKind).
   121  		Gt("__key__", buildKeyByAge(ctx, model.BuildMaxCompletionTime)).
   122  		KeysOnly(true)
   123  
   124  	return parallel.RunMulti(ctx, nWorkers, func(mr parallel.MultiRunner) error {
   125  		return mr.RunMulti(func(workC chan<- func() error) {
   126  			ch := make(chan []*model.Build, nWorkers)
   127  			workC <- func() error {
   128  				defer close(ch)
   129  
   130  				// Queries within a transcation must include an Ancestor filter.
   131  				// Hence, this searches expired builds out of a transaction first,
   132  				// and then update them in a transaction.
   133  				for _, st := range []pb.Status{pb.Status_SCHEDULED, pb.Status_STARTED} {
   134  					bs := make([]*model.Build, 0, batchSize)
   135  					err := datastore.RunBatch(ctx, int32(batchSize), q.Eq("status_v2", st),
   136  						func(b *model.Build) error {
   137  							bs = append(bs, b)
   138  							if len(bs) == batchSize {
   139  								ch <- bs
   140  								bs = make([]*model.Build, 0, batchSize)
   141  							}
   142  							return nil
   143  						},
   144  					)
   145  					if len(bs) > 0 {
   146  						ch <- bs
   147  					}
   148  					if err != nil {
   149  						return errors.Annotate(err, "querying expired %s builds", st).Err()
   150  					}
   151  				}
   152  				return nil
   153  			}
   154  
   155  			for bs := range ch {
   156  				bs := bs
   157  				workC <- func() error {
   158  					return errors.Annotate(expireBuilds(ctx, bs, mr), "expireBuilds").Err()
   159  				}
   160  			}
   161  		})
   162  	})
   163  }
   164  
   165  func resetLeases(ctx context.Context, bs []*model.Build, mr parallel.MultiRunner) error {
   166  	nOrig := len(bs)
   167  	if nOrig == 0 {
   168  		return nil
   169  	}
   170  
   171  	toReset := make([]*model.Build, 0, len(bs))
   172  	err := datastore.RunInTransaction(ctx, func(ctx context.Context) error {
   173  		if err := datastore.Get(ctx, bs); err != nil {
   174  			return err
   175  		}
   176  
   177  		now := clock.Now(ctx)
   178  		for _, b := range bs {
   179  			if !b.IsLeased || b.LeaseExpirationDate.After(now) {
   180  				continue
   181  			}
   182  			// A terminated build cannot be leased.
   183  			// It must be that the data is corrupted or there is a bug.
   184  			if protoutil.IsEnded(b.Proto.Status) {
   185  				logging.Errorf(ctx, "Build %d is leased and terminated", b.ID)
   186  			} else {
   187  				protoutil.SetStatus(now, b.Proto, pb.Status_SCHEDULED)
   188  			}
   189  			b.ClearLease()
   190  			toReset = append(toReset, b)
   191  		}
   192  		if len(toReset) == 0 {
   193  			return nil
   194  		}
   195  		return mr.RunMulti(func(workC chan<- func() error) {
   196  			for _, b := range toReset {
   197  				b := b
   198  				workC <- func() error { return tasks.NotifyPubSub(ctx, b) }
   199  			}
   200  			workC <- func() error { return datastore.Put(ctx, toReset) }
   201  			workC <- func() error {
   202  				return updateBuildStatuses(ctx, toReset, pb.Status_SCHEDULED)
   203  			}
   204  		})
   205  	}, nil)
   206  
   207  	if err == nil {
   208  		logging.Infof(ctx, "Reset %d/%d expired leases.", len(toReset), nOrig)
   209  		for _, b := range toReset {
   210  			logging.Infof(ctx, "Build %d: expired lease was reset", b.ID)
   211  			metrics.ExpiredLeaseReset(ctx, b)
   212  		}
   213  	}
   214  	return err
   215  }
   216  
   217  // ResetExpiredLeases resets expired leases.
   218  func ResetExpiredLeases(ctx context.Context) error {
   219  	// resetLeases() updates 3 entities for each of the given builds within
   220  	// a single transaction, and a ds transaction can update at most
   221  	// 25 entities.
   222  	//
   223  	// Hence, this batchSize must be 8 or lower.
   224  	const batchSize = 25 / 3
   225  	const nWorkers = 12
   226  	q := datastore.NewQuery(model.BuildKind).
   227  		Eq("is_leased", true).
   228  		Lte("lease_expiration_date", clock.Now(ctx).UTC()).
   229  		KeysOnly(true)
   230  
   231  	return parallel.RunMulti(ctx, nWorkers, func(mr parallel.MultiRunner) error {
   232  		return mr.RunMulti(func(workC chan<- func() error) {
   233  			ch := make(chan []*model.Build, nWorkers)
   234  			workC <- func() error {
   235  				defer close(ch)
   236  				bs := make([]*model.Build, 0, batchSize)
   237  				err := datastore.RunBatch(ctx, int32(batchSize), q, func(b *model.Build) error {
   238  					bs = append(bs, b)
   239  					if len(bs) == batchSize {
   240  						ch <- bs
   241  						bs = make([]*model.Build, 0, batchSize)
   242  					}
   243  					return nil
   244  				})
   245  				if len(bs) > 0 {
   246  					ch <- bs
   247  				}
   248  				return errors.Annotate(err, "querying expired, leased builds").Err()
   249  			}
   250  
   251  			for bs := range ch {
   252  				bs := bs
   253  				workC <- func() error {
   254  					return errors.Annotate(resetLeases(ctx, bs, mr),
   255  						"resetting %d expired leases", len(bs)).Err()
   256  				}
   257  			}
   258  		})
   259  	})
   260  }
   261  
   262  func updateBuildStatuses(ctx context.Context, builds []*model.Build, status pb.Status) error {
   263  	buildStatuses := make([]*model.BuildStatus, 0, len(builds))
   264  	for _, b := range builds {
   265  		buildStatuses = append(buildStatuses, &model.BuildStatus{
   266  			Build: datastore.KeyForObj(ctx, b),
   267  		})
   268  	}
   269  	err := datastore.Get(ctx, buildStatuses)
   270  	if err == nil {
   271  		for _, s := range buildStatuses {
   272  			s.Status = status
   273  		}
   274  		return datastore.Put(ctx, buildStatuses)
   275  	}
   276  
   277  	merr, ok := err.(errors.MultiError)
   278  	if !ok {
   279  		return err
   280  	}
   281  	existBuildStatuses := make([]*model.BuildStatus, 0, len(buildStatuses))
   282  	for i, me := range merr {
   283  		if me == nil {
   284  			existBuildStatuses = append(existBuildStatuses, buildStatuses[i])
   285  		}
   286  		// It is allowed for build created before BuildStatus rollout to not have
   287  		// BuildStatus.
   288  		// TODO(crbug.com/1430324): also disallow ErrNoSuchEntity for BuildStatus.
   289  	}
   290  
   291  	for _, s := range existBuildStatuses {
   292  		s.Status = status
   293  	}
   294  	return datastore.Put(ctx, existBuildStatuses)
   295  }