go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/milo/internal/buildsource/buildbucket/build_sync.go

// Copyright 2017 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package buildbucket

import (
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"time"

	"github.com/gomodule/redigo/redis"
	"google.golang.org/genproto/protobuf/field_mask"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
	"google.golang.org/protobuf/encoding/protojson"

	buildbucketpb "go.chromium.org/luci/buildbucket/proto"
	bbv1 "go.chromium.org/luci/common/api/buildbucket/buildbucket/v1"
	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/common/sync/parallel"
	"go.chromium.org/luci/common/tsmon/field"
	"go.chromium.org/luci/common/tsmon/metric"
	"go.chromium.org/luci/gae/service/datastore"
	"go.chromium.org/luci/server/auth"
	"go.chromium.org/luci/server/redisconn"
	"go.chromium.org/luci/server/router"

	"go.chromium.org/luci/milo/internal/model"
	"go.chromium.org/luci/milo/internal/model/milostatus"
	"go.chromium.org/luci/milo/internal/utils"
)

// BuildSummaryStorageDuration is the maximum lifetime of a BuildSummary.
//
// Lifetime is the time elapsed since the Build creation time.
// Cron runs periodically to scan and remove all Builds whose lifetime has
// exceeded this duration.
//
// BuildSummaries are kept alive longer than builds in buildbucket, so we can
// compute blamelists for builds that are at the end of their lifetime.
//
// TODO(weiweilin): expose BuildStorageDuration from buildbucket and compute
// BuildSummaryStorageDuration based on that (e.g. add two months). So we can
// ensure BuildSummaries are kept alive longer than builds.
const BuildSummaryStorageDuration = time.Hour * 24 * 30 * 20 // ~20 months

// BuildSummarySyncThreshold is the maximum duration before Milo attempts to
// sync non-terminal builds with buildbucket.
//
// Cron runs periodically to scan and sync all the non-terminal Builds that
// were last updated more than BuildSummarySyncThreshold ago.
const BuildSummarySyncThreshold = time.Hour * 24 * 2 // 2 days

var (
	deletedBuildsCounter = metric.NewCounter(
		"luci/milo/cron/delete-builds/delete-count",
		"The number of BuildSummaries deleted by Milo delete-builds cron job",
		nil,
	)

	updatedBuildsCounter = metric.NewCounter(
		"luci/milo/cron/sync-builds/updated-count",
		"The number of BuildSummaries that are out of sync and updated by Milo sync-builds cron job",
		nil,
		field.String("project"),
		field.String("bucket"),
		field.String("builder"),
		field.Int("number"),
		field.Int("ID"),
	)
)
// DeleteOldBuilds is a cron job that deletes BuildSummaries that are older
// than BuildSummaryStorageDuration.
func DeleteOldBuilds(c context.Context) error {
	const (
		batchSize = 1024
		nWorkers  = 8
	)

	buildPurgeCutoffTime := clock.Now(c).Add(-BuildSummaryStorageDuration)
	q := datastore.NewQuery("BuildSummary").
		Lt("Created", buildPurgeCutoffTime).
		Order("Created").
		// Apply a limit so the call won't time out.
		Limit(batchSize * nWorkers * 32).
		KeysOnly(true)

	return parallel.FanOutIn(func(taskC chan<- func() error) {
		buildsC := make(chan []*datastore.Key, nWorkers)
		statsC := make(chan int, nWorkers)

		// Collect stats.
		taskC <- func() error {
			for deletedCount := range statsC {
				deletedBuildsCounter.Add(c, int64(deletedCount))
			}

			return nil
		}

		// Find builds to delete.
		taskC <- func() error {
			defer close(buildsC)

			bsKeys := make([]*datastore.Key, 0, batchSize)
			err := datastore.RunBatch(c, batchSize, q, func(key *datastore.Key) error {
				bsKeys = append(bsKeys, key)
				if len(bsKeys) == batchSize {
					buildsC <- bsKeys
					bsKeys = make([]*datastore.Key, 0, batchSize)
				}
				return nil
			})
			if err != nil {
				return err
			}

			if len(bsKeys) > 0 {
				buildsC <- bsKeys
			}
			return nil
		}

		// Spawn workers to delete builds.
		taskC <- func() error {
			defer close(statsC)

			return parallel.WorkPool(nWorkers, func(workC chan<- func() error) {
				for bks := range buildsC {
					// Bind to a local variable so each worker can have its own copy.
					bks := bks
					workC <- func() error {
						// Flatten first w/o filtering to calculate how many builds were
						// actually removed.
						err := errors.Flatten(datastore.Delete(c, bks))

						allErrs := 0
						if err != nil {
							allErrs = len(err.(errors.MultiError))
						}

						statsC <- len(bks) - allErrs

						return err
					}
				}
			})
		}
	})
}

var summaryBuildMask = &field_mask.FieldMask{
	Paths: []string{
		"id",
		"builder",
		"number",
		"create_time",
		"start_time",
		"end_time",
		"update_time",
		"status",
		"summary_markdown",
		"tags",
		"infra.buildbucket.hostname",
		"infra.swarming",
		"input.experimental",
		"input.gitiles_commit",
		"output.properties",
		"critical",
	},
}
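// For illustration, the mask above limits which fields buildbucket returns
// from GetBuild; a request using it looks like the sketch below (the build ID
// is hypothetical, and this mirrors the actual calls made later in this file):
//
//	client.GetBuild(ctx, &buildbucketpb.GetBuildRequest{
//		Id:     87654321, // hypothetical build ID
//		Fields: summaryBuildMask,
//	})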
// V2PubSubHandler is a webhook that stores the builds coming in from pubsub.
func V2PubSubHandler(ctx *router.Context) {
	err := v2PubSubHandlerImpl(ctx.Request.Context(), ctx.Request)
	if err != nil {
		logging.Errorf(ctx.Request.Context(), "error while handling pubsub event")
		errors.Log(ctx.Request.Context(), err)
	}
	if transient.Tag.In(err) {
		// Transient errors are 4xx so that PubSub retries them.
		// TODO(crbug.com/1099036): Address high-traffic builders causing errors.
		ctx.Writer.WriteHeader(http.StatusTooEarly)
		return
	}
	// No errors or non-transient errors are 200s so that PubSub does not retry
	// them.
	ctx.Writer.WriteHeader(http.StatusOK)
}

// v2PubSubHandlerImpl takes the http.Request, expects to find
// a utils.PubSubSubscription JSON object in the Body, containing a
// BuildsV2PubSub, and handles the contents with processBuild.
func v2PubSubHandlerImpl(c context.Context, r *http.Request) error {
	msg := utils.PubSubSubscription{}
	if err := json.NewDecoder(r.Body).Decode(&msg); err != nil {
		// This might be a transient error, e.g. when the json format changes
		// and Milo isn't updated yet.
		return errors.Annotate(err, "could not decode message").Tag(transient.Tag).Err()
	}
	bData, err := msg.GetData()
	if err != nil {
		return errors.Annotate(err, "could not parse pubsub message string").Err()
	}
	buildsV2Msg := &buildbucketpb.BuildsV2PubSub{}
	opts := protojson.UnmarshalOptions{AllowPartial: true, DiscardUnknown: true}
	if err := opts.Unmarshal(bData, buildsV2Msg); err != nil {
		return err
	}

	return processBuild(c, buildsV2Msg.Build)
}

// PubSubHandler is a webhook that stores the builds coming in from pubsub.
func PubSubHandler(ctx *router.Context) {
	err := pubSubHandlerImpl(ctx.Request.Context(), ctx.Request)
	if err != nil {
		logging.Errorf(ctx.Request.Context(), "error while handling pubsub event")
		errors.Log(ctx.Request.Context(), err)
	}
	if transient.Tag.In(err) {
		// Transient errors are 4xx so that PubSub retries them.
		// TODO(crbug.com/1099036): Address high-traffic builders causing errors.
		ctx.Writer.WriteHeader(http.StatusTooEarly)
		return
	}
	// No errors or non-transient errors are 200s so that PubSub does not retry
	// them.
	ctx.Writer.WriteHeader(http.StatusOK)
}

// pubSubHandlerImpl takes the http.Request, expects to find
// a utils.PubSubSubscription JSON object in the Body, containing a bbPSEvent,
// and handles the contents with processBuild.
func pubSubHandlerImpl(c context.Context, r *http.Request) error {
	msg := utils.PubSubSubscription{}
	if err := json.NewDecoder(r.Body).Decode(&msg); err != nil {
		// This might be a transient error, e.g. when the json format changes
		// and Milo isn't updated yet.
		return errors.Annotate(err, "could not decode message").Tag(transient.Tag).Err()
	}
	if v, ok := msg.Message.Attributes["version"].(string); ok && v != "v1" {
		// TODO(nodir): switch to v2, crbug.com/826006
		logging.Debugf(c, "unsupported pubsub message version %q. Ignoring", v)
		return nil
	}
	bData, err := msg.GetData()
	if err != nil {
		return errors.Annotate(err, "could not parse pubsub message string").Err()
	}

	event := struct {
		Build    bbv1.LegacyApiCommonBuildMessage `json:"build"`
		Hostname string                           `json:"hostname"`
	}{}
	if err := json.Unmarshal(bData, &event); err != nil {
		return errors.Annotate(err, "could not parse pubsub message data").Err()
	}

	client, err := BuildsClient(c, event.Hostname, auth.AsSelf)
	if err != nil {
		return err
	}
	build, err := client.GetBuild(c, &buildbucketpb.GetBuildRequest{
		Id:     event.Build.Id,
		Fields: summaryBuildMask,
	})
	if err != nil {
		return err
	}
	return processBuild(c, build)
}
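// For illustration, a minimal sketch of the v1 payload that pubSubHandlerImpl
// expects after the PubSub envelope is unwrapped by msg.GetData(). The field
// names follow the anonymous struct above; the concrete values are invented:
//
//	{
//	  "build": {"id": 8945511866915568640},
//	  "hostname": "cr-buildbucket.appspot.com"
//	}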
func processBuild(c context.Context, build *buildbucketpb.Build) error {
	// TODO(iannucci,nodir): get the bot context too
	// TODO(iannucci,nodir): support manifests/got_revision
	bs, err := model.BuildSummaryFromBuild(c, build.Infra.Buildbucket.Hostname, build)
	if err != nil {
		return err
	}
	if err := bs.AddManifestKeysFromBuildSets(c); err != nil {
		return err
	}

	logging.Debugf(c, "Received from %s: build %s/%s (%s)\n%v",
		build.Infra.Buildbucket.Hostname, bs.ProjectID, bs.BuildID, bs.Summary.Status, bs)

	updateBuilderSummary, err := shouldUpdateBuilderSummary(c, bs)
	if err != nil {
		updateBuilderSummary = true
		logging.WithError(err).Warningf(c, "failed to determine whether the builder summary from %s should be updated. Falling back to always updating.", bs.BuilderID)
	}
	err = transient.Tag.Apply(datastore.RunInTransaction(c, func(c context.Context) error {
		curBS := &model.BuildSummary{BuildKey: bs.BuildKey}
		if err := datastore.Get(c, curBS); err != nil && err != datastore.ErrNoSuchEntity {
			return errors.Annotate(err, "reading current BuildSummary").Err()
		}

		if bs.Version <= curBS.Version {
			logging.Warningf(c, "current BuildSummary is newer: %d <= %d",
				bs.Version, curBS.Version)
			return nil
		}

		if err := datastore.Put(c, bs); err != nil {
			return err
		}

		if !updateBuilderSummary {
			return nil
		}

		return model.UpdateBuilderForBuild(c, bs)
	}, nil))

	return err
}

var (
	// Sustained datastore updates should not be higher than once per second per
	// entity.
	// See https://cloud.google.com/datastore/docs/concepts/limits.
	entityUpdateIntervalInS int64 = 2

	// A Redis Lua script that updates the key with the new integer value if and
	// only if the provided value is greater than the recorded value (0 if none
	// was recorded). It returns 1 if the value is updated and 0 otherwise.
	updateIfLargerScript = redis.NewScript(1, `
		local newValueStr = ARGV[1]
		local newValue = tonumber(newValueStr)
		local existingValueStr = redis.call('GET', KEYS[1])
		local existingValue = tonumber(existingValueStr) or 0
		if newValue < existingValue then
			return 0
		elseif newValue == existingValue then
			-- large u64/i64 (>2^53) may lose precision after being converted to f64.
			-- Compare the last 5 digits if the f64 representations of the integers
			-- are the same.
			local newValue = tonumber(string.sub(newValueStr, -5)) or 0
			local existingValue = tonumber(string.sub(existingValueStr, -5)) or 0
			if newValue <= existingValue then
				return 0
			end
		end

		redis.call('SET', KEYS[1], newValueStr, 'EX', ARGV[2])
		return 1
	`)
)
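// To illustrate the precision issue the script above works around: float64
// can represent integers exactly only up to 2^53, while the value stored here
// is a creation time in nanoseconds, which exceeds 2^53. A minimal sketch
// (illustrative only; not used by the handlers in this file):
func float64PrecisionExample() {
	a := int64(1) << 53 // 9007199254740992
	b := a + 1          // 9007199254740993
	// Distinct int64 values, yet equal once converted to float64, which is why
	// the script falls back to comparing the last 5 decimal digits as strings.
	fmt.Println(float64(a) == float64(b)) // prints: true
}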
// shouldUpdateBuilderSummary determines whether the builder summary should be
// updated with the provided build summary.
//
// If the function is called with builds from the same builder multiple times
// within a time bucket, it blocks until the start of the next time bucket,
// returns true only for the call with the latest created build with a
// terminal status, and returns false for the other calls that occurred within
// the same time bucket.
func shouldUpdateBuilderSummary(c context.Context, buildSummary *model.BuildSummary) (bool, error) {
	if !buildSummary.Summary.Status.Terminal() {
		return false, nil
	}

	conn, err := redisconn.Get(c)
	if err != nil {
		return true, err
	}
	defer conn.Close()

	createdAt := buildSummary.Created.UnixNano()

	now := clock.Now(c).Unix()
	thisTimeBucket := time.Unix(now-now%entityUpdateIntervalInS, 0)
	thisTimeBucketKey := fmt.Sprintf("%s:%v", buildSummary.BuilderID, thisTimeBucket.Unix())

	// Check whether a builder summary update has already occurred in this time
	// bucket.
	_, err = redis.String(conn.Do("SET", thisTimeBucketKey, createdAt, "NX", "EX", entityUpdateIntervalInS+1))
	switch err {
	case redis.ErrNil:
		// continue
	case nil:
		// No builder summary update has occurred in this time bucket yet; we
		// should run the update.
		return true, nil
	default:
		return true, err
	}

	// A builder summary update has already occurred in this time bucket. Try
	// scheduling the update for the next time bucket instead.
	nextTimeBucket := thisTimeBucket.Add(time.Duration(entityUpdateIntervalInS * int64(time.Second)))
	nextTimeBucketKey := fmt.Sprintf("%s:%v", buildSummary.BuilderID, nextTimeBucket.Unix())
	replaced, err := redis.Int(updateIfLargerScript.Do(conn, nextTimeBucketKey, createdAt, entityUpdateIntervalInS+1))
	if err != nil {
		return true, err
	}

	if replaced == 0 {
		// There's already an update with a newer build scheduled for the next
		// time bucket; skip the update.
		return false, nil
	}

	// Wait until the start of the next time bucket.
	clock.Sleep(c, clock.Until(c, nextTimeBucket))
	newCreatedAt, err := redis.Int64(conn.Do("GET", nextTimeBucketKey))
	if err != nil {
		return true, err
	}

	// If the update event was not replaced by one from a newer build, we should
	// run the update with this build.
	if newCreatedAt == createdAt {
		return true, nil
	}

	// Otherwise, skip the update.
	logging.Infof(c, "skipping BuilderSummary update from builder %s because a newer build was found", buildSummary.BuilderID)
	return false, nil
}
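// A minimal sketch of the bucketing arithmetic above, with assumed values:
// with entityUpdateIntervalInS = 2, a call at unix time 1003 falls into the
// bucket starting at 1002, and the next bucket starts at 1004 (illustrative
// only; not used by the code in this file):
func timeBucketExample() {
	now := int64(1003)
	thisBucket := time.Unix(now-now%entityUpdateIntervalInS, 0) // bucket start: 1002
	nextBucket := thisBucket.Add(time.Duration(entityUpdateIntervalInS * int64(time.Second)))
	fmt.Println(thisBucket.Unix(), nextBucket.Unix()) // prints: 1002 1004
}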
// SyncBuilds is a cron job that syncs BuildSummaries that are not in a
// terminal state and haven't been updated recently.
func SyncBuilds(c context.Context) error {
	c = WithBuildsClientFactory(c, ProdBuildsClientFactory)
	return syncBuildsImpl(c)
}

func syncBuildsImpl(c context.Context) error {
	const (
		batchSize = 16
		nWorkers  = 64
	)

	buildSyncCutoffTime := clock.Now(c).Add(-BuildSummarySyncThreshold)

	return parallel.FanOutIn(func(taskC chan<- func() error) {
		buildsC := make(chan []*model.BuildSummary, nWorkers)
		updatedBuildC := make(chan *buildbucketpb.Build, nWorkers)

		// Collect stats.
		taskC <- func() error {
			for b := range updatedBuildC {
				updatedBuildsCounter.Add(c, 1, b.Builder.Project, b.Builder.Bucket, b.Builder.Builder, b.Number, b.Id)
			}
			return nil
		}

		// Find builds to update.
		taskC <- func() error {
			defer close(buildsC)

			batch := make([]*model.BuildSummary, 0, batchSize)
			findBuilds := func(q *datastore.Query) error {
				return datastore.RunBatch(c, batchSize, q, func(bs *model.BuildSummary) error {
					batch = append(batch, bs)
					if len(batch) == batchSize {
						buildsC <- batch
						batch = make([]*model.BuildSummary, 0, batchSize)
					}
					return nil
				})
			}

			pendingBuildsQuery := datastore.NewQuery("BuildSummary").
				Eq("Summary.Status", milostatus.NotRun).
				Lt("Version", buildSyncCutoffTime.UnixNano()).
				// Apply a limit so the call won't time out.
				Limit(batchSize * nWorkers * 16)
			err := findBuilds(pendingBuildsQuery)
			if err != nil {
				return err
			}

			runningBuildsQuery := datastore.NewQuery("BuildSummary").
				Eq("Summary.Status", milostatus.Running).
				Lt("Version", buildSyncCutoffTime.UnixNano()).
				// Apply a limit so the call won't time out.
				Limit(batchSize * nWorkers * 16)
			err = findBuilds(runningBuildsQuery)
			if err != nil {
				return err
			}

			if len(batch) > 0 {
				buildsC <- batch
			}
			return nil
		}

		// Spawn workers to update builds.
		taskC <- func() error {
			defer close(updatedBuildC)

			return parallel.WorkPool(nWorkers, func(workC chan<- func() error) {
				for builds := range buildsC {
					// Bind to a local variable so each worker can have its own copy.
					builds := builds
					workC <- func() error {
						for _, bs := range builds {
							host, err := bs.GetHost()
							if err != nil {
								logging.WithError(err).Errorf(c, "failed to get host for build summary %v, skipping", bs.BuildKey)
								continue
							}

							client, err := BuildsClient(c, host, auth.AsSelf)
							if err != nil {
								return err
							}

							var buildID int64 = 0
							builder, buildNum, err := utils.ParseLegacyBuildbucketBuildID(bs.BuildID)
							if err != nil {
								// If the BuildID is not a legacy build ID, try parsing it as
								// a new build ID.
								buildID, err = utils.ParseBuildbucketBuildID(bs.BuildID)
								if err != nil {
									return err
								}
							}

							req := &buildbucketpb.GetBuildRequest{
								Id:          buildID,
								BuildNumber: buildNum,
								Builder:     builder,
								Fields:      summaryBuildMask,
							}
							newBuild, err := client.GetBuild(c, req)
							if status.Code(err) == codes.NotFound {
								logging.Warningf(c, "build %v not found on buildbucket. deleting it.\nrequest: %v", bs.BuildKey, req)
								err := datastore.Delete(c, bs)
								if err != nil {
									err = errors.Annotate(err, "failed to delete build %v", bs.BuildKey).Err()
									return err
								}
								continue
							} else if err != nil {
								err = errors.Annotate(err, "could not fetch build %v from buildbucket.\nrequest: %v", bs.BuildKey, req).Err()
								return err
							}

							newBS, err := model.BuildSummaryFromBuild(c, host, newBuild)
							if err != nil {
								return err
							}
							if err := newBS.AddManifestKeysFromBuildSets(c); err != nil {
								return err
							}

							// Ensure the new BuildSummary has the same key as the old
							// BuildSummary, so we won't end up with two copies of the same
							// BuildSummary in the datastore.
							newBS.BuildKey = bs.BuildKey
							newBS.BuildID = bs.BuildID

							if !newBS.Summary.Status.Terminal() {
								continue
							}

							if err := datastore.Put(c, newBS); err != nil {
								return err
							}

							updatedBuildC <- newBuild
						}

						return nil
					}
				}
			})
		}
	})
}
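// For context, a minimal sketch (assumed wiring, not part of this file) of
// how the cron entry points above could be registered with the LUCI server's
// cron module (go.chromium.org/luci/server/cron); the handler IDs are
// invented for illustration:
//
//	cron.RegisterHandler("delete-builds", buildbucket.DeleteOldBuilds)
//	cron.RegisterHandler("sync-builds", buildbucket.SyncBuilds)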