go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/tasks/bq.go

// Copyright 2022 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tasks

import (
	"context"
	"encoding/json"
	"fmt"
	"regexp"
	"strconv"
	"strings"
	"time"

	"cloud.google.com/go/bigquery"
	"google.golang.org/protobuf/encoding/protojson"
	"google.golang.org/protobuf/types/known/structpb"

	lucibq "go.chromium.org/luci/common/bq"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/gae/service/datastore"
	"go.chromium.org/luci/server/tq"

	"go.chromium.org/luci/buildbucket/appengine/internal/clients"
	"go.chromium.org/luci/buildbucket/appengine/model"
	pb "go.chromium.org/luci/buildbucket/proto"
	"go.chromium.org/luci/buildbucket/protoutil"
)

// maxBuildSizeInBQ is (10MB - 5KB): the maximum allowed request size in both
// the streaming API and the storage write API is 10MB, and we want to leave
// 5KB of buffer room for message conversion.
// Note: this is a var so that unit tests can shrink it instead of allocating
// very large test builds.
var maxBuildSizeInBQ = 10*1000*1000 - 5*1000
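// For example, a test might override it like this (a sketch, not an actual
// test in this package):
//
//	old := maxBuildSizeInBQ
//	maxBuildSizeInBQ = 100
//	defer func() { maxBuildSizeInBQ = old }()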

// ExportBuild saves the build into BigQuery.
// The returned error is tagged with transient.Tag or tq.Fatal to tell tq
// whether to retry or drop the task.
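//
// This is typically invoked from a tq task handler. A minimal sketch of such
// a registration (the task class ID, queue, and payload proto below are
// illustrative assumptions, not the actual buildbucket wiring):
//
//	tq.RegisterTaskClass(tq.TaskClass{
//		ID:        "export-bq",                // hypothetical ID
//		Prototype: &taskdefs.ExportBigQuery{}, // hypothetical payload proto
//		Queue:     "bq-export",                // hypothetical queue
//		Handler: func(ctx context.Context, msg proto.Message) error {
//			return ExportBuild(ctx, msg.(*taskdefs.ExportBigQuery).BuildId)
//		},
//	})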
func ExportBuild(ctx context.Context, buildID int64) error {
	b := &model.Build{ID: buildID}
	switch err := datastore.Get(ctx, b); {
	case err == datastore.ErrNoSuchEntity:
		return errors.Annotate(err, "build %d not found when exporting into BQ", buildID).Tag(tq.Fatal).Err()
	case err != nil:
		return errors.Annotate(err, "error fetching builds").Tag(transient.Tag).Err()
	}
	p, err := b.ToProto(ctx, model.NoopBuildMask, nil)
	if err != nil {
		return errors.Annotate(err, "failed to convert build to proto").Err()
	}

	if p.Infra.Swarming == nil {
		// Backfill Infra.Swarming for builds running on Swarming-implemented
		// backends.
		// TODO(crbug.com/1508416) Stop the backfill after the Buildbucket
		// taskbackend migration completes and all BQ queries are migrated away
		// from Infra.Swarming.
		err = tryBackfillSwarming(p)
		if err != nil {
			// The backfill is best effort, so only log the error rather than
			// failing this bq export task.
			logging.Warningf(ctx, "failed to backfill swarming data for build %d: %s", buildID, err)
		}
	} else if p.Infra.Backend == nil {
		// Backfill Infra.Backend for builds running on Swarming directly.
		// TODO(crbug.com/1508416) Stop the backfill after the Buildbucket
		// taskbackend migration completes - by then there will be no more
		// builds running on Swarming directly, so this flow can be removed.
		err = tryBackfillBackend(p)
		if err != nil {
			// The backfill is best effort, so only log the error rather than
			// failing this bq export task.
			logging.Warningf(ctx, "failed to backfill backend data for build %d: %s", buildID, err)
		}
	}

	// Clear fields that we don't want in BigQuery.
	p.Infra.Buildbucket.Hostname = ""
	if p.Infra.Backend.GetTask() != nil {
		p.Infra.Backend.Task.UpdateId = 0
	}
	for _, step := range p.GetSteps() {
		step.SummaryMarkdown = ""
		step.MergeBuild = nil
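		// For each log, keep only the name: Reset() clears everything else
		// (e.g. the url and view_url fields), which we don't want in BQ.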
		for _, log := range step.Logs {
			name := log.Name
			log.Reset()
			log.Name = name
		}
	}

	// Check if the cleaned Build is too large.
	pj, err := protojson.Marshal(p)
	if err != nil {
		logging.Errorf(ctx, "failed to calculate Build size for %d: %s, continue to try to insert the build...", buildID, err)
	}
	// We only strip out the output properties here.
	// Usually a large build size is caused by large output properties, since
	// BuildOutputProperties is the only field for which we expanded the 1MB
	// Datastore limit per field. If any other field makes the build too large,
	// we'd like this job to keep trying (and failing) so that we get alerted.
	if len(pj) > maxBuildSizeInBQ && p.Output.GetProperties() != nil {
		logging.Warningf(ctx, "stripping out outputProperties for build %d in BQ exporting", buildID)
		p.Output.Properties = &structpb.Struct{
			Fields: map[string]*structpb.Value{
				"strip_reason": {
					Kind: &structpb.Value_StringValue{
						StringValue: "output properties is stripped because it's too large which makes the whole build larger than BQ limit(10MB)",
					},
				},
			},
		}
	}
	// Set timeout to avoid a hanging call.
	ctx, cancel := context.WithTimeout(ctx, 10*time.Minute)
	defer cancel()

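	// Use the build ID as the insert ID: BigQuery streaming inserts use it to
	// deduplicate retried rows on a best-effort basis.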
	row := &lucibq.Row{
		InsertID: strconv.FormatInt(p.Id, 10),
		Message:  p,
	}
	if err := clients.GetBqClient(ctx).Insert(ctx, "raw", "completed_builds", row); err != nil {
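		// A PutMultiError means BigQuery rejected the row itself; a retry
		// would fail the same way, so drop the task.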
		if pme, _ := err.(bigquery.PutMultiError); len(pme) != 0 {
			return errors.Annotate(err, "bad row for build %d", buildID).Tag(tq.Fatal).Err()
		}
		return errors.Annotate(err, "transient error when inserting BQ for build %d", buildID).Tag(transient.Tag).Err()
	}
	return nil
}

// tryBackfillSwarming does a best-effort backfill of Infra.Swarming for builds
// running on Swarming-implemented backends.
// TODO(crbug.com/1508416) Stop the backfill after the Buildbucket taskbackend
// migration completes and all BQ queries are migrated away from Infra.Swarming.
func tryBackfillSwarming(b *pb.Build) (err error) {
	backend := b.Infra.Backend
	if backend.GetTask().GetId().GetId() == "" {
		// No backend task associated with the build, bail out.
		return
	}
	if !strings.HasPrefix(backend.Task.Id.Target, "swarming://") {
		// The build doesn't run on a Swarming-implemented backend, bail out.
		return
	}

	sw := &pb.BuildInfra_Swarming{
		Hostname:       backend.Hostname,
		TaskId:         backend.Task.Id.Id,
		Caches:         commonCacheToSwarmingCache(backend.Caches),
		TaskDimensions: backend.TaskDimensions,
		// ParentRunId is not set because Buildbucket (instead of backend) manages
		// builds' parent/child relationships.
	}
	b.Infra.Swarming = sw

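	// Swarming-specific settings are carried in the backend config struct;
	// copy the priority and service account back into the Swarming proto.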
	if backend.Config != nil {
		for k, v := range backend.Config.AsMap() {
			if k == "priority" {
				if p, ok := v.(float64); ok {
					sw.Priority = int32(p)
				}
			}
			if k == "service_account" {
				if s, ok := v.(string); ok {
					sw.TaskServiceAccount = s
				}
			}
		}
	}
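	// Bot dimensions are reconstructed from the backend task's details.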
	sw.BotDimensions, err = protoutil.BotDimensionsFromBackend(b)
	return
}

// commonCacheToSwarmingCache returns the equivalent
// []*pb.BuildInfra_Swarming_CacheEntry for the given []*pb.CacheEntry.
func commonCacheToSwarmingCache(cache []*pb.CacheEntry) []*pb.BuildInfra_Swarming_CacheEntry {
	var swarmingCache []*pb.BuildInfra_Swarming_CacheEntry
	for _, c := range cache {
		cacheEntry := &pb.BuildInfra_Swarming_CacheEntry{
			EnvVar:           c.GetEnvVar(),
			Name:             c.GetName(),
			Path:             c.GetPath(),
			WaitForWarmCache: c.GetWaitForWarmCache(),
		}
		swarmingCache = append(swarmingCache, cacheEntry)
	}
	return swarmingCache
}

// tryBackfillBackend does a best-effort backfill of Infra.Backend for builds
// running on Swarming directly.
// TODO(crbug.com/1508416) Stop the backfill after the Buildbucket taskbackend
// migration completes.
func tryBackfillBackend(b *pb.Build) (err error) {
	sw := b.Infra.Swarming
	if sw.GetTaskId() == "" {
		// No swarming task associated with the build, bail out.
		return
	}

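	// Mirror the Swarming fields into the backend proto; this is roughly the
	// inverse of tryBackfillSwarming above.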
	backend := &pb.BuildInfra_Backend{
		Hostname: sw.Hostname,
		Task: &pb.Task{
			Id: &pb.TaskID{
				Target: computeBackendTarget(sw.Hostname),
				Id:     sw.TaskId,
			},
			// Status, Link, StatusDetails, SummaryMarkdown and UpdateId are not
			// populated in this backfill.
		},
		Caches:         swarmingCacheToCommonCache(sw.Caches),
		TaskDimensions: sw.TaskDimensions,
		Config: &structpb.Struct{
			Fields: map[string]*structpb.Value{
				"priority":        structpb.NewNumberValue(float64(sw.Priority)),
				"service_account": structpb.NewStringValue(sw.TaskServiceAccount),
			},
		},
	}
	b.Infra.Backend = backend

	// Set backend.Task.Details from the bot dimensions.
	botDimensions := make(map[string][]string)
	for _, dim := range sw.BotDimensions {
		if _, ok := botDimensions[dim.Key]; !ok {
			botDimensions[dim.Key] = make([]string, 0)
		}
		botDimensions[dim.Key] = append(botDimensions[dim.Key], dim.Value)
	}
	// Use JSON as an intermediate format for the conversion: structpb.NewStruct
	// only accepts JSON-compatible value types (e.g. []any rather than
	// []string), and a JSON round trip produces exactly that.
	j, err := json.Marshal(map[string]any{
		"bot_dimensions": botDimensions,
	})
	if err != nil {
		return err
	}
	var m map[string]any
	if err = json.Unmarshal(j, &m); err != nil {
		return err
	}
	backend.Task.Details, err = structpb.NewStruct(m)
	return err
}

// computeBackendTarget returns the backend target based on the swarming
// hostname, e.g. "foo.appspot.com" becomes "swarming://foo".
// It's essentially a hack. The accurate way is to find the backend in the
// global config by matching the hostname, but that's a bit heavy for a
// temporary solution like this.
func computeBackendTarget(swHost string) string {
	swHostRe := regexp.MustCompile(`(.*)\.appspot\.com`)
	var swInstance string
	if m := swHostRe.FindStringSubmatch(swHost); m != nil {
		swInstance = m[1]
	}
	return fmt.Sprintf("swarming://%s", swInstance)
}

// swarmingCacheToCommonCache returns the equivalent []*pb.CacheEntry
// for the given []*pb.BuildInfra_Swarming_CacheEntry.
func swarmingCacheToCommonCache(swCache []*pb.BuildInfra_Swarming_CacheEntry) []*pb.CacheEntry {
	var cache []*pb.CacheEntry
	for _, c := range swCache {
		cacheEntry := &pb.CacheEntry{
			EnvVar:           c.GetEnvVar(),
			Name:             c.GetName(),
			Path:             c.GetPath(),
			WaitForWarmCache: c.GetWaitForWarmCache(),
		}
		cache = append(cache, cacheEntry)
	}
	return cache
}