go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/tasks/bq.go

// Copyright 2022 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tasks

import (
	"context"
	"encoding/json"
	"fmt"
	"regexp"
	"strconv"
	"strings"
	"time"

	"cloud.google.com/go/bigquery"
	"google.golang.org/protobuf/encoding/protojson"
	"google.golang.org/protobuf/types/known/structpb"

	lucibq "go.chromium.org/luci/common/bq"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/gae/service/datastore"
	"go.chromium.org/luci/server/tq"

	"go.chromium.org/luci/buildbucket/appengine/internal/clients"
	"go.chromium.org/luci/buildbucket/appengine/model"
	pb "go.chromium.org/luci/buildbucket/proto"
	"go.chromium.org/luci/buildbucket/protoutil"
)

// maxBuildSizeInBQ is 10MB-5KB: the maximum allowed request size in both the
// streaming API and the storage write API is 10MB, and we leave 5KB of buffer
// room for message conversion.
// Note: this is a var (not a const) so that unit tests can lower it without
// having to build overly large test builds.
var maxBuildSizeInBQ = 10*1000*1000 - 5*1000

// ExportBuild saves the build into BigQuery.
// The returned error is tagged with transient.Tag or tq.Fatal to tell tq
// whether to retry or drop the task.
func ExportBuild(ctx context.Context, buildID int64) error {
	b := &model.Build{ID: buildID}
	switch err := datastore.Get(ctx, b); {
	case err == datastore.ErrNoSuchEntity:
		return errors.Annotate(err, "build %d not found when exporting into BQ", buildID).Tag(tq.Fatal).Err()
	case err != nil:
		return errors.Annotate(err, "error fetching builds").Tag(transient.Tag).Err()
	}
	p, err := b.ToProto(ctx, model.NoopBuildMask, nil)
	if err != nil {
		return errors.Annotate(err, "failed to convert build to proto").Err()
	}

	if p.Infra.Swarming == nil {
		// Backfill Infra.Swarming for builds running on Swarming-implemented backends.
		// TODO(crbug.com/1508416) Stop backfill after Buildbucket taskbackend
		// migration completes and all BQ queries are migrated away from Infra.Swarming.
		err = tryBackfillSwarming(p)
		if err != nil {
			// The backfill is best effort, so only log the error instead of
			// failing this bq export task.
			logging.Warningf(ctx, "failed to backfill swarming data for build %d: %s", buildID, err)
		}
	} else if p.Infra.Backend == nil {
		// Backfill Infra.Backend for builds running on Swarming directly.
		// TODO(crbug.com/1508416) Stop backfill after Buildbucket taskbackend
		// migration completes - by then there will be no more builds running on
		// Swarming directly, so this flow can be removed.
		err = tryBackfillBackend(p)
		if err != nil {
			// The backfill is best effort, so only log the error instead of
			// failing this bq export task.
			logging.Warningf(ctx, "failed to backfill backend data for build %d: %s", buildID, err)
		}
	}

	// Clear fields that we don't want in BigQuery.
	p.Infra.Buildbucket.Hostname = ""
	if p.Infra.Backend.GetTask() != nil {
		p.Infra.Backend.Task.UpdateId = 0
	}
	for _, step := range p.GetSteps() {
		step.SummaryMarkdown = ""
		step.MergeBuild = nil
		for _, log := range step.Logs {
			name := log.Name
			log.Reset()
			log.Name = name
		}
	}

	// Check if the cleaned Build is too large.
	pj, err := protojson.Marshal(p)
	if err != nil {
		logging.Errorf(ctx, "failed to calculate Build size for %d: %s, continue to try to insert the build...", buildID, err)
	}
	// We only strip out the outputProperties here.
	// Usually a large build is caused by large outputProperties, since
	// BuildOutputProperties is the only field for which we expanded the 1MB
	// per-field Datastore limit.
	// If any other field causes the failure, we'd like this job to keep
	// retrying so that we can be alerted.
	if len(pj) > maxBuildSizeInBQ && p.Output.GetProperties() != nil {
		logging.Warningf(ctx, "striping out outputProperties for build %d in BQ exporting", buildID)
		p.Output.Properties = &structpb.Struct{
			Fields: map[string]*structpb.Value{
				"strip_reason": {
					Kind: &structpb.Value_StringValue{
						StringValue: "output properties is stripped because it's too large which makes the whole build larger than BQ limit(10MB)",
					},
				},
			},
		}
	}
	// Set a timeout to avoid a hanging call.
	ctx, cancel := context.WithTimeout(ctx, 10*time.Minute)
	defer cancel()

	row := &lucibq.Row{
		InsertID: strconv.FormatInt(p.Id, 10),
		Message:  p,
	}
	if err := clients.GetBqClient(ctx).Insert(ctx, "raw", "completed_builds", row); err != nil {
		if pme, _ := err.(bigquery.PutMultiError); len(pme) != 0 {
			return errors.Annotate(err, "bad row for build %d", buildID).Tag(tq.Fatal).Err()
		}
		return errors.Annotate(err, "transient error when inserting BQ for build %d", buildID).Tag(transient.Tag).Err()
	}
	return nil
}
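
// exportedBuildFitsBQ is an illustrative, hypothetical sketch and is not part
// of the production flow above (ExportBuild inlines this check). It only shows
// the size guard in isolation: a build fits a single BQ insert request if its
// protojson encoding stays under maxBuildSizeInBQ (~10MB minus a 5KB buffer).
func exportedBuildFitsBQ(b *pb.Build) (bool, error) {
	pj, err := protojson.Marshal(b)
	if err != nil {
		return false, err
	}
	return len(pj) <= maxBuildSizeInBQ, nil
}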

// tryBackfillSwarming does a best effort backfill on Infra.Swarming for builds
// running on Swarming-implemented backends.
// TODO(crbug.com/1508416) Stop backfill after Buildbucket taskbackend
// migration completes and all BQ queries are migrated away from Infra.Swarming.
func tryBackfillSwarming(b *pb.Build) (err error) {
	backend := b.Infra.Backend
	if backend.GetTask().GetId().GetId() == "" {
		// No backend task associated with the build, bail out.
		return
	}
	if !strings.HasPrefix(backend.Task.Id.Target, "swarming://") {
		// The build doesn't run on a Swarming-implemented backend, bail out.
		return
	}

	sw := &pb.BuildInfra_Swarming{
		Hostname:       backend.Hostname,
		TaskId:         backend.Task.Id.Id,
		Caches:         commonCacheToSwarmingCache(backend.Caches),
		TaskDimensions: backend.TaskDimensions,
		// ParentRunId is not set because Buildbucket (instead of the backend)
		// manages builds' parent/child relationships.
	}
	b.Infra.Swarming = sw

	if backend.Config != nil {
		for k, v := range backend.Config.AsMap() {
			if k == "priority" {
				if p, ok := v.(float64); ok {
					sw.Priority = int32(p)
				}
			}
			if k == "service_account" {
				if s, ok := v.(string); ok {
					sw.TaskServiceAccount = s
				}
			}
		}
	}
	sw.BotDimensions, err = protoutil.BotDimensionsFromBackend(b)
	return
}

// commonCacheToSwarmingCache returns the equivalent
// []*pb.BuildInfra_Swarming_CacheEntry for the given []*pb.CacheEntry.
func commonCacheToSwarmingCache(cache []*pb.CacheEntry) []*pb.BuildInfra_Swarming_CacheEntry {
	var swarmingCache []*pb.BuildInfra_Swarming_CacheEntry
	for _, c := range cache {
		cacheEntry := &pb.BuildInfra_Swarming_CacheEntry{
			EnvVar:           c.GetEnvVar(),
			Name:             c.GetName(),
			Path:             c.GetPath(),
			WaitForWarmCache: c.GetWaitForWarmCache(),
		}
		swarmingCache = append(swarmingCache, cacheEntry)
	}
	return swarmingCache
}
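
// exampleSwarmingBackfill is an illustrative, hypothetical sketch (not called
// anywhere in this package): it shows the Infra.Backend -> Infra.Swarming
// mapping performed by tryBackfillSwarming for a build on a
// Swarming-implemented backend. The hostname, task ID and config values below
// are made up for the example.
func exampleSwarmingBackfill() (*pb.Build, error) {
	b := &pb.Build{
		Infra: &pb.BuildInfra{
			Backend: &pb.BuildInfra_Backend{
				Hostname: "chromium-swarm.appspot.com",
				Task: &pb.Task{
					Id: &pb.TaskID{
						Target: "swarming://chromium-swarm",
						Id:     "deadbeef1",
					},
				},
				Config: &structpb.Struct{
					Fields: map[string]*structpb.Value{
						"priority":        structpb.NewNumberValue(30),
						"service_account": structpb.NewStringValue("builder@example.iam.gserviceaccount.com"),
					},
				},
			},
		},
	}
	if err := tryBackfillSwarming(b); err != nil {
		return nil, err
	}
	// On success, b.Infra.Swarming carries the equivalent data: TaskId
	// "deadbeef1", Priority 30 and the backend's service account.
	return b, nil
}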

// tryBackfillBackend does a best effort backfill on Infra.Backend for builds
// running on Swarming directly.
// TODO(crbug.com/1508416) Stop backfill after Buildbucket taskbackend
// migration completes.
func tryBackfillBackend(b *pb.Build) (err error) {
	sw := b.Infra.Swarming
	if sw.GetTaskId() == "" {
		// No swarming task associated with the build, bail out.
		return
	}

	backend := &pb.BuildInfra_Backend{
		Hostname: sw.Hostname,
		Task: &pb.Task{
			Id: &pb.TaskID{
				Target: computeBackendTarget(sw.Hostname),
				Id:     sw.TaskId,
			},
			// Status, Link, StatusDetails, SummaryMarkdown and UpdateId are not
			// populated in this backfill.
		},
		Caches:         swarmingCacheToCommonCache(sw.Caches),
		TaskDimensions: sw.TaskDimensions,
		Config: &structpb.Struct{
			Fields: map[string]*structpb.Value{
				"priority":        structpb.NewNumberValue(float64(sw.Priority)),
				"service_account": structpb.NewStringValue(sw.TaskServiceAccount),
			},
		},
	}
	b.Infra.Backend = backend

	// Set backend.Task.Details from the bot dimensions.
	botDimensions := make(map[string][]string)
	for _, dim := range sw.BotDimensions {
		if _, ok := botDimensions[dim.Key]; !ok {
			botDimensions[dim.Key] = make([]string, 0)
		}
		botDimensions[dim.Key] = append(botDimensions[dim.Key], dim.Value)
	}
	// Use json as an intermediate format for the conversion.
	j, err := json.Marshal(map[string]any{
		"bot_dimensions": botDimensions,
	})
	if err != nil {
		return err
	}
	var m map[string]any
	if err = json.Unmarshal(j, &m); err != nil {
		return err
	}
	backend.Task.Details, err = structpb.NewStruct(m)
	return err
}

// computeBackendTarget returns the backend target based on the swarming hostname.
// It's essentially a hack: the accurate way would be to find the backend in the
// global config by matching the hostname, but that's a bit heavy for a temporary
// solution like this.
func computeBackendTarget(swHost string) string {
	swHostRe := regexp.MustCompile(`(.*).appspot.com`)
	var swInstance string
	if m := swHostRe.FindStringSubmatch(swHost); m != nil {
		swInstance = m[1]
	}
	return fmt.Sprintf("swarming://%s", swInstance)
}
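
// exampleComputeBackendTarget is an illustrative, hypothetical sketch (not
// called anywhere in this package) of the hostname-to-target mapping above:
// "chromium-swarm.appspot.com" becomes "swarming://chromium-swarm".
func exampleComputeBackendTarget() string {
	return computeBackendTarget("chromium-swarm.appspot.com") // "swarming://chromium-swarm"
}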

// swarmingCacheToCommonCache returns the equivalent []*pb.CacheEntry
// for the given []*pb.BuildInfra_Swarming_CacheEntry.
func swarmingCacheToCommonCache(swCache []*pb.BuildInfra_Swarming_CacheEntry) []*pb.CacheEntry {
	var cache []*pb.CacheEntry
	for _, c := range swCache {
		cacheEntry := &pb.CacheEntry{
			EnvVar:           c.GetEnvVar(),
			Name:             c.GetName(),
			Path:             c.GetPath(),
			WaitForWarmCache: c.GetWaitForWarmCache(),
		}
		cache = append(cache, cacheEntry)
	}
	return cache
}
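
// exampleBackendBackfill is an illustrative, hypothetical sketch (not called
// anywhere in this package) of the Infra.Swarming -> Infra.Backend backfill:
// a build that ran on Swarming directly gets an equivalent Infra.Backend, with
// priority and service account folded into the backend config and the bot
// dimensions folded into Task.Details. All values below are made up.
func exampleBackendBackfill() (*pb.Build, error) {
	b := &pb.Build{
		Infra: &pb.BuildInfra{
			Swarming: &pb.BuildInfra_Swarming{
				Hostname:           "chromium-swarm.appspot.com",
				TaskId:             "deadbeef1",
				Priority:           30,
				TaskServiceAccount: "builder@example.iam.gserviceaccount.com",
				BotDimensions: []*pb.StringPair{
					{Key: "os", Value: "Linux"},
					{Key: "os", Value: "Ubuntu-22.04"},
				},
			},
		},
	}
	if err := tryBackfillBackend(b); err != nil {
		return nil, err
	}
	// b.Infra.Backend.Task.Id is now {Target: "swarming://chromium-swarm", Id: "deadbeef1"},
	// and Task.Details contains {"bot_dimensions": {"os": ["Linux", "Ubuntu-22.04"]}}.
	return b, nil
}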