go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/internal/buildcron/expired.go (about) 1 // Copyright 2022 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package buildcron 16 17 import ( 18 "context" 19 "strings" 20 21 "go.chromium.org/luci/common/clock" 22 "go.chromium.org/luci/common/errors" 23 "go.chromium.org/luci/common/logging" 24 "go.chromium.org/luci/common/sync/parallel" 25 "go.chromium.org/luci/gae/service/datastore" 26 27 "go.chromium.org/luci/buildbucket" 28 "go.chromium.org/luci/buildbucket/appengine/internal/metrics" 29 "go.chromium.org/luci/buildbucket/appengine/model" 30 "go.chromium.org/luci/buildbucket/appengine/tasks" 31 taskdefs "go.chromium.org/luci/buildbucket/appengine/tasks/defs" 32 pb "go.chromium.org/luci/buildbucket/proto" 33 "go.chromium.org/luci/buildbucket/protoutil" 34 ) 35 36 func expireBuilds(ctx context.Context, bs []*model.Build, mr parallel.MultiRunner) error { 37 nOrig := len(bs) 38 if nOrig == 0 { 39 return nil 40 } 41 42 toUpdate := make([]*model.Build, 0, len(bs)) 43 err := datastore.RunInTransaction(ctx, func(ctx context.Context) error { 44 // Clear the slice since it may contains value from the previous 45 // failed transaction. 46 toUpdate = toUpdate[:0] 47 if err := datastore.Get(ctx, bs); err != nil { 48 return err 49 } 50 51 now := clock.Now(ctx) 52 for _, b := range bs { 53 // skip updating, if it's no longer in a non-terminal status. 54 if protoutil.IsEnded(b.Proto.Status) { 55 continue 56 } 57 58 protoutil.SetStatus(now, b.Proto, pb.Status_INFRA_FAILURE) 59 if b.Proto.StatusDetails == nil { 60 b.Proto.StatusDetails = &pb.StatusDetails{} 61 } 62 b.Proto.StatusDetails.Timeout = &pb.StatusDetails_Timeout{} 63 b.ClearLease() 64 // TODO(crbug.com/1414540): A temporary code to mitigate the issue. Should 65 // delete it after the cron job is executed. 66 if b.Proto.Input.GetProperties() != nil { 67 b.Proto.Input.Properties = nil 68 } 69 toUpdate = append(toUpdate, b) 70 } 71 72 if len(toUpdate) == 0 { 73 return nil 74 } 75 return mr.RunMulti(func(workC chan<- func() error) { 76 for _, b := range toUpdate { 77 b := b 78 workC <- func() error { return tasks.NotifyPubSub(ctx, b) } 79 workC <- func() error { 80 return tasks.ExportBigQuery(ctx, b.ID, strings.Contains(b.ExperimentsString(), buildbucket.ExperimentBqExporterGo)) 81 } 82 workC <- func() error { 83 return tasks.FinalizeResultDB(ctx, &taskdefs.FinalizeResultDBGo{BuildId: b.ID}) 84 } 85 } 86 workC <- func() error { return datastore.Put(ctx, toUpdate) } 87 workC <- func() error { 88 return updateBuildStatuses(ctx, toUpdate, pb.Status_INFRA_FAILURE) 89 } 90 }) 91 }, nil) 92 93 if err == nil { 94 logging.Infof(ctx, "Processed %d/%d expired builds", len(toUpdate), nOrig) 95 for _, b := range toUpdate { 96 logging.Infof(ctx, "Build %d: completed by cron(expire_builds) with status %q.", 97 b.ID, b.Status) 98 metrics.BuildCompleted(ctx, b) 99 } 100 } 101 return err 102 } 103 104 // TimeoutExpiredBuilds marks incomplete builds that were created longer than 105 // model.BuildMaxCompletionTime w/ INFRA_FAILURE. 106 func TimeoutExpiredBuilds(ctx context.Context) error { 107 // expireBuilds() updates 5 entities for each of the given builds within 108 // a single transaction, and a ds transaction can update at most 109 // 25 entities. 110 // 111 // Hence, this batchSize must be 5 or lower. 112 const batchSize = 25 / 5 113 // Processing each batch requires at most 5 goroutines. 114 // - 1 for ds.RunTransaction() 115 // - 4 for add tasks into TQ and ds.Put() 116 // 117 // Also, there is another goroutine for scanning expired builds. 118 // Hence, this can run at most 6 transactions in parallel. 119 const nWorkers = 32 120 q := datastore.NewQuery(model.BuildKind). 121 Gt("__key__", buildKeyByAge(ctx, model.BuildMaxCompletionTime)). 122 KeysOnly(true) 123 124 return parallel.RunMulti(ctx, nWorkers, func(mr parallel.MultiRunner) error { 125 return mr.RunMulti(func(workC chan<- func() error) { 126 ch := make(chan []*model.Build, nWorkers) 127 workC <- func() error { 128 defer close(ch) 129 130 // Queries within a transcation must include an Ancestor filter. 131 // Hence, this searches expired builds out of a transaction first, 132 // and then update them in a transaction. 133 for _, st := range []pb.Status{pb.Status_SCHEDULED, pb.Status_STARTED} { 134 bs := make([]*model.Build, 0, batchSize) 135 err := datastore.RunBatch(ctx, int32(batchSize), q.Eq("status_v2", st), 136 func(b *model.Build) error { 137 bs = append(bs, b) 138 if len(bs) == batchSize { 139 ch <- bs 140 bs = make([]*model.Build, 0, batchSize) 141 } 142 return nil 143 }, 144 ) 145 if len(bs) > 0 { 146 ch <- bs 147 } 148 if err != nil { 149 return errors.Annotate(err, "querying expired %s builds", st).Err() 150 } 151 } 152 return nil 153 } 154 155 for bs := range ch { 156 bs := bs 157 workC <- func() error { 158 return errors.Annotate(expireBuilds(ctx, bs, mr), "expireBuilds").Err() 159 } 160 } 161 }) 162 }) 163 } 164 165 func resetLeases(ctx context.Context, bs []*model.Build, mr parallel.MultiRunner) error { 166 nOrig := len(bs) 167 if nOrig == 0 { 168 return nil 169 } 170 171 toReset := make([]*model.Build, 0, len(bs)) 172 err := datastore.RunInTransaction(ctx, func(ctx context.Context) error { 173 if err := datastore.Get(ctx, bs); err != nil { 174 return err 175 } 176 177 now := clock.Now(ctx) 178 for _, b := range bs { 179 if !b.IsLeased || b.LeaseExpirationDate.After(now) { 180 continue 181 } 182 // A terminated build cannot be leased. 183 // It must be that the data is corrupted or there is a bug. 184 if protoutil.IsEnded(b.Proto.Status) { 185 logging.Errorf(ctx, "Build %d is leased and terminated", b.ID) 186 } else { 187 protoutil.SetStatus(now, b.Proto, pb.Status_SCHEDULED) 188 } 189 b.ClearLease() 190 toReset = append(toReset, b) 191 } 192 if len(toReset) == 0 { 193 return nil 194 } 195 return mr.RunMulti(func(workC chan<- func() error) { 196 for _, b := range toReset { 197 b := b 198 workC <- func() error { return tasks.NotifyPubSub(ctx, b) } 199 } 200 workC <- func() error { return datastore.Put(ctx, toReset) } 201 workC <- func() error { 202 return updateBuildStatuses(ctx, toReset, pb.Status_SCHEDULED) 203 } 204 }) 205 }, nil) 206 207 if err == nil { 208 logging.Infof(ctx, "Reset %d/%d expired leases.", len(toReset), nOrig) 209 for _, b := range toReset { 210 logging.Infof(ctx, "Build %d: expired lease was reset", b.ID) 211 metrics.ExpiredLeaseReset(ctx, b) 212 } 213 } 214 return err 215 } 216 217 // ResetExpiredLeases resets expired leases. 218 func ResetExpiredLeases(ctx context.Context) error { 219 // resetLeases() updates 3 entities for each of the given builds within 220 // a single transaction, and a ds transaction can update at most 221 // 25 entities. 222 // 223 // Hence, this batchSize must be 8 or lower. 224 const batchSize = 25 / 3 225 const nWorkers = 12 226 q := datastore.NewQuery(model.BuildKind). 227 Eq("is_leased", true). 228 Lte("lease_expiration_date", clock.Now(ctx).UTC()). 229 KeysOnly(true) 230 231 return parallel.RunMulti(ctx, nWorkers, func(mr parallel.MultiRunner) error { 232 return mr.RunMulti(func(workC chan<- func() error) { 233 ch := make(chan []*model.Build, nWorkers) 234 workC <- func() error { 235 defer close(ch) 236 bs := make([]*model.Build, 0, batchSize) 237 err := datastore.RunBatch(ctx, int32(batchSize), q, func(b *model.Build) error { 238 bs = append(bs, b) 239 if len(bs) == batchSize { 240 ch <- bs 241 bs = make([]*model.Build, 0, batchSize) 242 } 243 return nil 244 }) 245 if len(bs) > 0 { 246 ch <- bs 247 } 248 return errors.Annotate(err, "querying expired, leased builds").Err() 249 } 250 251 for bs := range ch { 252 bs := bs 253 workC <- func() error { 254 return errors.Annotate(resetLeases(ctx, bs, mr), 255 "resetting %d expired leases", len(bs)).Err() 256 } 257 } 258 }) 259 }) 260 } 261 262 func updateBuildStatuses(ctx context.Context, builds []*model.Build, status pb.Status) error { 263 buildStatuses := make([]*model.BuildStatus, 0, len(builds)) 264 for _, b := range builds { 265 buildStatuses = append(buildStatuses, &model.BuildStatus{ 266 Build: datastore.KeyForObj(ctx, b), 267 }) 268 } 269 err := datastore.Get(ctx, buildStatuses) 270 if err == nil { 271 for _, s := range buildStatuses { 272 s.Status = status 273 } 274 return datastore.Put(ctx, buildStatuses) 275 } 276 277 merr, ok := err.(errors.MultiError) 278 if !ok { 279 return err 280 } 281 existBuildStatuses := make([]*model.BuildStatus, 0, len(buildStatuses)) 282 for i, me := range merr { 283 if me == nil { 284 existBuildStatuses = append(existBuildStatuses, buildStatuses[i]) 285 } 286 // It is allowed for build created before BuildStatus rollout to not have 287 // BuildStatus. 288 // TODO(crbug.com/1430324): also disallow ErrNoSuchEntity for BuildStatus. 289 } 290 291 for _, s := range existBuildStatuses { 292 s.Status = status 293 } 294 return datastore.Put(ctx, existBuildStatuses) 295 }