go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/internal/metrics/builder.go (about) 1 // Copyright 2021 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package metrics 16 17 import ( 18 "context" 19 "fmt" 20 "strings" 21 "time" 22 23 "golang.org/x/sync/errgroup" 24 25 "go.chromium.org/luci/common/clock" 26 "go.chromium.org/luci/common/data/stringset" 27 "go.chromium.org/luci/common/errors" 28 "go.chromium.org/luci/common/sync/parallel" 29 "go.chromium.org/luci/common/tsmon" 30 "go.chromium.org/luci/gae/service/datastore" 31 32 "go.chromium.org/luci/buildbucket/appengine/model" 33 pb "go.chromium.org/luci/buildbucket/proto" 34 "go.chromium.org/luci/buildbucket/protoutil" 35 ) 36 37 // ReportBuilderMetrics computes and reports Builder metrics. 38 func ReportBuilderMetrics(ctx context.Context) error { 39 // Reset the metric to stop reporting no-longer-existing builders. 40 tsmon.GetState(ctx).Store().Reset(ctx, V2.BuilderPresence) 41 luciBuckets, err := fetchLUCIBuckets(ctx) 42 if err != nil { 43 return errors.Annotate(err, "fetching LUCI buckets w/ swarming config").Err() 44 } 45 46 return parallel.WorkPool(256, func(taskC chan<- func() error) { 47 q := datastore.NewQuery(model.BuilderStatKind) 48 err := datastore.RunBatch(ctx, 64, q, func(k *datastore.Key) error { 49 project, bucket, builder := mustParseBuilderStatID(k.StringID()) 50 tctx := WithBuilder(ctx, project, bucket, builder) 51 legacyBucket := bucket 52 // V1 metrics format the bucket name in "luci.$project.$bucket" 53 // if the bucket config has a swarming config. 54 if luciBuckets.Has(protoutil.FormatBucketID(project, bucket)) { 55 legacyBucket = legacyBucketName(project, bucket) 56 } 57 V2.BuilderPresence.Set(tctx, true) 58 59 taskC <- func() error { 60 return errors.Annotate( 61 reportMaxAge(tctx, project, bucket, legacyBucket, builder), 62 "reportMaxAge", 63 ).Err() 64 } 65 taskC <- func() error { 66 return errors.Annotate( 67 reportBuildCount(tctx, project, bucket, legacyBucket, builder), 68 "reportBuildCount", 69 ).Err() 70 } 71 taskC <- func() error { 72 return errors.Annotate( 73 reportConsecutiveFailures(tctx, project, bucket, builder), 74 "reportConsecutiveFailures", 75 ).Err() 76 } 77 return nil 78 }) 79 if err != nil { 80 taskC <- func() error { return errors.Annotate(err, "datastore.RunBatch").Err() } 81 } 82 }) 83 } 84 85 func mustParseBuilderStatID(id string) (project, bucket, builder string) { 86 parts := strings.Split(id, ":") 87 if len(parts) != 3 { 88 panic(fmt.Errorf("invalid BuilderStatID: %s", id)) 89 } 90 project, bucket, builder = parts[0], parts[1], parts[2] 91 return 92 } 93 94 // fetchLUCIBuckets returns a stringset.Set with the ID of the buckets 95 // w/ swarming config. 96 func fetchLUCIBuckets(ctx context.Context) (stringset.Set, error) { 97 ret := stringset.Set{} 98 err := datastore.RunBatch( 99 ctx, 128, datastore.NewQuery(model.BucketKind), 100 func(bucket *model.Bucket) error { 101 if bucket.Proto.GetSwarming() != nil { 102 ret.Add(protoutil.FormatBucketID(bucket.Parent.StringID(), bucket.ID)) 103 } 104 return nil 105 }, 106 ) 107 return ret, err 108 } 109 110 // reportMaxAge computes and reports the age of the oldest builds with SCHEDULED. 111 func reportMaxAge(ctx context.Context, project, bucket, legacyBucket, builder string) error { 112 var leasedCT, neverLeasedCT time.Time 113 q := datastore.NewQuery(model.BuildKind). 114 Eq("bucket_id", protoutil.FormatBucketID(project, bucket)). 115 Eq("tags", "builder:"+builder). 116 Eq("status_v2", pb.Status_SCHEDULED). 117 Eq("experimental", false). 118 Order("create_time"). 119 Limit(1) 120 eg, ctx := errgroup.WithContext(ctx) 121 eg.Go(func() error { 122 var b []*model.Build 123 if err := datastore.GetAll(ctx, q.Eq("never_leased", false), &b); err != nil { 124 return err 125 } 126 if len(b) > 0 { 127 leasedCT = b[0].CreateTime 128 } 129 return nil 130 }) 131 eg.Go(func() error { 132 var b []*model.Build 133 if err := datastore.GetAll(ctx, q.Eq("never_leased", true), &b); err != nil { 134 return err 135 } 136 if len(b) > 0 { 137 neverLeasedCT = b[0].CreateTime 138 } 139 return nil 140 }) 141 if err := eg.Wait(); err != nil { 142 return err 143 } 144 145 var max, neverLeasedMax float64 146 now := clock.Now(ctx) 147 if !neverLeasedCT.IsZero() { 148 neverLeasedMax = now.Sub(neverLeasedCT).Seconds() 149 } 150 151 // In V1, the metric value of a stream with "must_be_never_leased == false" 152 // is the age of the oldest build w/ "must_be_never_leased == true|false". 153 // 154 // That is, it's the age of the oldest build regardless of the value 155 // in must_be_never_leased. 156 if !leasedCT.IsZero() { 157 max = now.Sub(leasedCT).Seconds() 158 } 159 if max < neverLeasedMax { 160 max = neverLeasedMax 161 } 162 V1.MaxAgeScheduled.Set(ctx, max, legacyBucket, builder, false /*must_be_never_leased*/) 163 V1.MaxAgeScheduled.Set(ctx, neverLeasedMax, legacyBucket, builder, true) 164 V2.MaxAgeScheduled.Set(ctx, max) 165 return nil 166 } 167 168 // reportBuildCount computes and reports # of builds with SCHEDULED and STARTED. 169 func reportBuildCount(ctx context.Context, project, bucket, legacyBucket, builder string) error { 170 var nScheduled, nStarted int64 171 q := datastore.NewQuery(model.BuildKind). 172 Eq("bucket_id", protoutil.FormatBucketID(project, bucket)). 173 Eq("experimental", false). 174 Eq("tags", "builder:"+builder) 175 eg, ctx := errgroup.WithContext(ctx) 176 eg.Go(func() (err error) { 177 nScheduled, err = datastore.Count(ctx, q.Eq("status_v2", pb.Status_SCHEDULED)) 178 return 179 }) 180 eg.Go(func() (err error) { 181 nStarted, err = datastore.Count(ctx, q.Eq("status_v2", pb.Status_STARTED)) 182 return 183 }) 184 if err := eg.Wait(); err != nil { 185 return err 186 } 187 188 V1.BuildCount.Set(ctx, nScheduled, legacyBucket, builder, pb.Status_name[int32(pb.Status_SCHEDULED)]) 189 V1.BuildCount.Set(ctx, nStarted, legacyBucket, builder, pb.Status_name[int32(pb.Status_STARTED)]) 190 V2.BuildCount.Set(ctx, nScheduled, pb.Status_name[int32(pb.Status_SCHEDULED)]) 191 V2.BuildCount.Set(ctx, nStarted, pb.Status_name[int32(pb.Status_STARTED)]) 192 return nil 193 } 194 195 func reportConsecutiveFailures(ctx context.Context, project, bucket, builder string) error { 196 var b []*model.Build 197 q := datastore.NewQuery(model.BuildKind). 198 Eq("bucket_id", protoutil.FormatBucketID(project, bucket)). 199 Eq("tags", "builder:"+builder). 200 Order("-status_changed_time") 201 if err := datastore.GetAll(ctx, q.Eq("status_v2", pb.Status_SUCCESS).Limit(1), &b); err != nil { 202 return err 203 } 204 205 // if there was at least one successful build, add Ge() 206 // to narrow the scope of the index scan. 207 if len(b) > 0 { 208 q = q.Gt("status_changed_time", b[0].StatusChangedTime.UTC()) 209 } 210 211 var nCancels, nFailures, nInfraFailures int64 212 eg, ctx := errgroup.WithContext(ctx) 213 eg.Go(func() (err error) { 214 nCancels, err = datastore.Count(ctx, q.Eq("status_v2", pb.Status_CANCELED)) 215 return 216 }) 217 eg.Go(func() (err error) { 218 nFailures, err = datastore.Count(ctx, q.Eq("status_v2", pb.Status_FAILURE)) 219 return 220 }) 221 eg.Go(func() (err error) { 222 nInfraFailures, err = datastore.Count(ctx, q.Eq("status_v2", pb.Status_INFRA_FAILURE)) 223 return 224 }) 225 if err := eg.Wait(); err != nil { 226 return err 227 } 228 229 // These counts can be inaccurate a bit, but should be accurate enough. 230 V2.ConsecutiveFailureCount.Set(ctx, nCancels, pb.Status_name[int32(pb.Status_CANCELED)]) 231 V2.ConsecutiveFailureCount.Set(ctx, nFailures, pb.Status_name[int32(pb.Status_FAILURE)]) 232 V2.ConsecutiveFailureCount.Set(ctx, nInfraFailures, pb.Status_name[int32(pb.Status_INFRA_FAILURE)]) 233 return nil 234 }