go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/internal/metrics/builder.go (about)

     1  // Copyright 2021 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package metrics
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"strings"
    21  	"time"
    22  
    23  	"golang.org/x/sync/errgroup"
    24  
    25  	"go.chromium.org/luci/common/clock"
    26  	"go.chromium.org/luci/common/data/stringset"
    27  	"go.chromium.org/luci/common/errors"
    28  	"go.chromium.org/luci/common/sync/parallel"
    29  	"go.chromium.org/luci/common/tsmon"
    30  	"go.chromium.org/luci/gae/service/datastore"
    31  
    32  	"go.chromium.org/luci/buildbucket/appengine/model"
    33  	pb "go.chromium.org/luci/buildbucket/proto"
    34  	"go.chromium.org/luci/buildbucket/protoutil"
    35  )
    36  
    37  // ReportBuilderMetrics computes and reports Builder metrics.
    38  func ReportBuilderMetrics(ctx context.Context) error {
    39  	// Reset the metric to stop reporting no-longer-existing builders.
    40  	tsmon.GetState(ctx).Store().Reset(ctx, V2.BuilderPresence)
    41  	luciBuckets, err := fetchLUCIBuckets(ctx)
    42  	if err != nil {
    43  		return errors.Annotate(err, "fetching LUCI buckets w/ swarming config").Err()
    44  	}
    45  
    46  	return parallel.WorkPool(256, func(taskC chan<- func() error) {
    47  		q := datastore.NewQuery(model.BuilderStatKind)
    48  		err := datastore.RunBatch(ctx, 64, q, func(k *datastore.Key) error {
    49  			project, bucket, builder := mustParseBuilderStatID(k.StringID())
    50  			tctx := WithBuilder(ctx, project, bucket, builder)
    51  			legacyBucket := bucket
    52  			// V1 metrics format the bucket name in "luci.$project.$bucket"
    53  			// if the bucket config has a swarming config.
    54  			if luciBuckets.Has(protoutil.FormatBucketID(project, bucket)) {
    55  				legacyBucket = legacyBucketName(project, bucket)
    56  			}
    57  			V2.BuilderPresence.Set(tctx, true)
    58  
    59  			taskC <- func() error {
    60  				return errors.Annotate(
    61  					reportMaxAge(tctx, project, bucket, legacyBucket, builder),
    62  					"reportMaxAge",
    63  				).Err()
    64  			}
    65  			taskC <- func() error {
    66  				return errors.Annotate(
    67  					reportBuildCount(tctx, project, bucket, legacyBucket, builder),
    68  					"reportBuildCount",
    69  				).Err()
    70  			}
    71  			taskC <- func() error {
    72  				return errors.Annotate(
    73  					reportConsecutiveFailures(tctx, project, bucket, builder),
    74  					"reportConsecutiveFailures",
    75  				).Err()
    76  			}
    77  			return nil
    78  		})
    79  		if err != nil {
    80  			taskC <- func() error { return errors.Annotate(err, "datastore.RunBatch").Err() }
    81  		}
    82  	})
    83  }
    84  
    85  func mustParseBuilderStatID(id string) (project, bucket, builder string) {
    86  	parts := strings.Split(id, ":")
    87  	if len(parts) != 3 {
    88  		panic(fmt.Errorf("invalid BuilderStatID: %s", id))
    89  	}
    90  	project, bucket, builder = parts[0], parts[1], parts[2]
    91  	return
    92  }
    93  
    94  // fetchLUCIBuckets returns a stringset.Set with the ID of the buckets
    95  // w/ swarming config.
    96  func fetchLUCIBuckets(ctx context.Context) (stringset.Set, error) {
    97  	ret := stringset.Set{}
    98  	err := datastore.RunBatch(
    99  		ctx, 128, datastore.NewQuery(model.BucketKind),
   100  		func(bucket *model.Bucket) error {
   101  			if bucket.Proto.GetSwarming() != nil {
   102  				ret.Add(protoutil.FormatBucketID(bucket.Parent.StringID(), bucket.ID))
   103  			}
   104  			return nil
   105  		},
   106  	)
   107  	return ret, err
   108  }
   109  
   110  // reportMaxAge computes and reports the age of the oldest builds with SCHEDULED.
   111  func reportMaxAge(ctx context.Context, project, bucket, legacyBucket, builder string) error {
   112  	var leasedCT, neverLeasedCT time.Time
   113  	q := datastore.NewQuery(model.BuildKind).
   114  		Eq("bucket_id", protoutil.FormatBucketID(project, bucket)).
   115  		Eq("tags", "builder:"+builder).
   116  		Eq("status_v2", pb.Status_SCHEDULED).
   117  		Eq("experimental", false).
   118  		Order("create_time").
   119  		Limit(1)
   120  	eg, ctx := errgroup.WithContext(ctx)
   121  	eg.Go(func() error {
   122  		var b []*model.Build
   123  		if err := datastore.GetAll(ctx, q.Eq("never_leased", false), &b); err != nil {
   124  			return err
   125  		}
   126  		if len(b) > 0 {
   127  			leasedCT = b[0].CreateTime
   128  		}
   129  		return nil
   130  	})
   131  	eg.Go(func() error {
   132  		var b []*model.Build
   133  		if err := datastore.GetAll(ctx, q.Eq("never_leased", true), &b); err != nil {
   134  			return err
   135  		}
   136  		if len(b) > 0 {
   137  			neverLeasedCT = b[0].CreateTime
   138  		}
   139  		return nil
   140  	})
   141  	if err := eg.Wait(); err != nil {
   142  		return err
   143  	}
   144  
   145  	var max, neverLeasedMax float64
   146  	now := clock.Now(ctx)
   147  	if !neverLeasedCT.IsZero() {
   148  		neverLeasedMax = now.Sub(neverLeasedCT).Seconds()
   149  	}
   150  
   151  	// In V1, the metric value of a stream with "must_be_never_leased == false"
   152  	// is the age of the oldest build w/ "must_be_never_leased == true|false".
   153  	//
   154  	// That is, it's the age of the oldest build regardless of the value
   155  	// in must_be_never_leased.
   156  	if !leasedCT.IsZero() {
   157  		max = now.Sub(leasedCT).Seconds()
   158  	}
   159  	if max < neverLeasedMax {
   160  		max = neverLeasedMax
   161  	}
   162  	V1.MaxAgeScheduled.Set(ctx, max, legacyBucket, builder, false /*must_be_never_leased*/)
   163  	V1.MaxAgeScheduled.Set(ctx, neverLeasedMax, legacyBucket, builder, true)
   164  	V2.MaxAgeScheduled.Set(ctx, max)
   165  	return nil
   166  }
   167  
   168  // reportBuildCount computes and reports # of builds with SCHEDULED and STARTED.
   169  func reportBuildCount(ctx context.Context, project, bucket, legacyBucket, builder string) error {
   170  	var nScheduled, nStarted int64
   171  	q := datastore.NewQuery(model.BuildKind).
   172  		Eq("bucket_id", protoutil.FormatBucketID(project, bucket)).
   173  		Eq("experimental", false).
   174  		Eq("tags", "builder:"+builder)
   175  	eg, ctx := errgroup.WithContext(ctx)
   176  	eg.Go(func() (err error) {
   177  		nScheduled, err = datastore.Count(ctx, q.Eq("status_v2", pb.Status_SCHEDULED))
   178  		return
   179  	})
   180  	eg.Go(func() (err error) {
   181  		nStarted, err = datastore.Count(ctx, q.Eq("status_v2", pb.Status_STARTED))
   182  		return
   183  	})
   184  	if err := eg.Wait(); err != nil {
   185  		return err
   186  	}
   187  
   188  	V1.BuildCount.Set(ctx, nScheduled, legacyBucket, builder, pb.Status_name[int32(pb.Status_SCHEDULED)])
   189  	V1.BuildCount.Set(ctx, nStarted, legacyBucket, builder, pb.Status_name[int32(pb.Status_STARTED)])
   190  	V2.BuildCount.Set(ctx, nScheduled, pb.Status_name[int32(pb.Status_SCHEDULED)])
   191  	V2.BuildCount.Set(ctx, nStarted, pb.Status_name[int32(pb.Status_STARTED)])
   192  	return nil
   193  }
   194  
   195  func reportConsecutiveFailures(ctx context.Context, project, bucket, builder string) error {
   196  	var b []*model.Build
   197  	q := datastore.NewQuery(model.BuildKind).
   198  		Eq("bucket_id", protoutil.FormatBucketID(project, bucket)).
   199  		Eq("tags", "builder:"+builder).
   200  		Order("-status_changed_time")
   201  	if err := datastore.GetAll(ctx, q.Eq("status_v2", pb.Status_SUCCESS).Limit(1), &b); err != nil {
   202  		return err
   203  	}
   204  
   205  	// if there was at least one successful build, add Ge()
   206  	// to narrow the scope of the index scan.
   207  	if len(b) > 0 {
   208  		q = q.Gt("status_changed_time", b[0].StatusChangedTime.UTC())
   209  	}
   210  
   211  	var nCancels, nFailures, nInfraFailures int64
   212  	eg, ctx := errgroup.WithContext(ctx)
   213  	eg.Go(func() (err error) {
   214  		nCancels, err = datastore.Count(ctx, q.Eq("status_v2", pb.Status_CANCELED))
   215  		return
   216  	})
   217  	eg.Go(func() (err error) {
   218  		nFailures, err = datastore.Count(ctx, q.Eq("status_v2", pb.Status_FAILURE))
   219  		return
   220  	})
   221  	eg.Go(func() (err error) {
   222  		nInfraFailures, err = datastore.Count(ctx, q.Eq("status_v2", pb.Status_INFRA_FAILURE))
   223  		return
   224  	})
   225  	if err := eg.Wait(); err != nil {
   226  		return err
   227  	}
   228  
   229  	// These counts can be inaccurate a bit, but should be accurate enough.
   230  	V2.ConsecutiveFailureCount.Set(ctx, nCancels, pb.Status_name[int32(pb.Status_CANCELED)])
   231  	V2.ConsecutiveFailureCount.Set(ctx, nFailures, pb.Status_name[int32(pb.Status_FAILURE)])
   232  	V2.ConsecutiveFailureCount.Set(ctx, nInfraFailures, pb.Status_name[int32(pb.Status_INFRA_FAILURE)])
   233  	return nil
   234  }