go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/rerun/rerun.go (about)

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package rerun handles rerun for a build.
    16  package rerun
    17  
    18  import (
    19  	"context"
    20  	"encoding/json"
    21  	"fmt"
    22  	"strconv"
    23  
    24  	buildbucketpb "go.chromium.org/luci/buildbucket/proto"
    25  	"go.chromium.org/luci/common/errors"
    26  	"go.chromium.org/luci/common/logging"
    27  	"go.chromium.org/luci/gae/service/datastore"
    28  	"google.golang.org/protobuf/types/known/fieldmaskpb"
    29  	"google.golang.org/protobuf/types/known/structpb"
    30  
    31  	"go.chromium.org/luci/bisection/internal/buildbucket"
    32  	"go.chromium.org/luci/bisection/model"
    33  	pb "go.chromium.org/luci/bisection/proto/v1"
    34  	"go.chromium.org/luci/bisection/testfailureanalysis"
    35  	"go.chromium.org/luci/bisection/util"
    36  	"go.chromium.org/luci/bisection/util/datastoreutil"
    37  )
    38  
    39  // TriggerOptions contains information how the rerun should be triggered.
    40  type TriggerOptions struct {
    41  	// The builder we should trigger the rerun on. Required.
    42  	Builder *buildbucketpb.BuilderID
    43  	// The gitiles commit for the revision that we want to run the rerun build. Required.
    44  	GitilesCommit *buildbucketpb.GitilesCommit
    45  	// The buildbucket ID that the rerun build should copy the properties
    46  	// and dimension from. Required.
    47  	SampleBuildID int64
    48  	// Extra properties we want the rerun build to have.
    49  	ExtraProperties map[string]any
    50  	// Extra dimensions we want the rerun build to have.
    51  	ExtraDimensions map[string]string
    52  	// Priority of the rerun build.
    53  	Priority int32
    54  }
    55  
    56  // TriggerRerun triggers a rerun build given the options.
    57  func TriggerRerun(c context.Context, options *TriggerOptions) (*buildbucketpb.Build, error) {
    58  	err := validateOptions(options)
    59  	if err != nil {
    60  		return nil, errors.Annotate(err, "validate rerun options").Err()
    61  	}
    62  	logging.Infof(c, "triggerRerun with commit %s", options.GitilesCommit.Id)
    63  	properties, dimensions, err := getRerunPropertiesAndDimensions(c, options.SampleBuildID, options.ExtraProperties, options.ExtraDimensions)
    64  	if err != nil {
    65  		logging.Errorf(c, "Failed getRerunPropertiesAndDimension for build %d", options.SampleBuildID)
    66  		return nil, err
    67  	}
    68  	req := &buildbucketpb.ScheduleBuildRequest{
    69  		Builder:       options.Builder,
    70  		Properties:    properties,
    71  		Dimensions:    dimensions,
    72  		Tags:          getRerunTags(c, options.SampleBuildID),
    73  		GitilesCommit: options.GitilesCommit,
    74  		Priority:      options.Priority,
    75  	}
    76  	build, err := buildbucket.ScheduleBuild(c, req)
    77  	if err != nil {
    78  		logging.Errorf(c, "Failed trigger rerun for build %d: %w", options.SampleBuildID, err)
    79  		return nil, err
    80  	}
    81  	logging.Infof(c, "Rerun build %d triggered for build: %d", build.GetId(), options.SampleBuildID)
    82  	return build, nil
    83  }
    84  
    85  func validateOptions(options *TriggerOptions) error {
    86  	if options == nil {
    87  		return errors.New("option must not be nil")
    88  	}
    89  	if options.Builder == nil {
    90  		return errors.New("builder must not be nil")
    91  	}
    92  	if options.GitilesCommit == nil {
    93  		return errors.New("gitiles commit must not be nil")
    94  	}
    95  	if options.SampleBuildID == 0 {
    96  		return errors.New("sample build id must be specified")
    97  	}
    98  	return nil
    99  }
   100  
   101  // getRerunTags returns the build bucket tags for the rerun build
   102  func getRerunTags(c context.Context, bbid int64) []*buildbucketpb.StringPair {
   103  	return []*buildbucketpb.StringPair{
   104  		{
   105  			// analyzed_build_id is the buildbucket ID of the build which we want to rerun.
   106  			Key:   "analyzed_build_id",
   107  			Value: strconv.FormatInt(bbid, 10),
   108  		},
   109  	}
   110  }
   111  
   112  // getRerunPropertiesAndDimensions returns the properties and dimensions for a rerun of a buildID.
   113  // If the builder is a tester, the dimension will be derived from its parent build.
   114  func getRerunPropertiesAndDimensions(c context.Context, bbid int64, props map[string]any, dims map[string]string) (*structpb.Struct, []*buildbucketpb.RequestedDimension, error) {
   115  	mask := &buildbucketpb.BuildMask{
   116  		Fields: &fieldmaskpb.FieldMask{
   117  			Paths: []string{"input.properties", "builder", "infra.swarming.task_dimensions", "infra.backend.task_dimensions"},
   118  		},
   119  	}
   120  	build, err := buildbucket.GetBuild(c, bbid, mask)
   121  	if err != nil {
   122  		return nil, nil, errors.Annotate(err, "failed to get properties for build %d", bbid).Err()
   123  	}
   124  	properties, err := getRerunProperties(c, build, props)
   125  	if err != nil {
   126  		return nil, nil, err
   127  	}
   128  	parentBuildIDStr, found := build.GetInput().GetProperties().GetFields()["parent_build_id"]
   129  
   130  	// If builder is not a tester, return the dimension derived by this build.
   131  	if !found {
   132  		dimens := getRerunDimensions(c, build, dims)
   133  		return properties, dimens, nil
   134  	}
   135  	parentBuildID, err := strconv.Atoi(parentBuildIDStr.GetStringValue())
   136  	if err != nil {
   137  		return nil, nil, errors.Annotate(err, "parse parent_build_id %s", parentBuildIDStr).Err()
   138  	}
   139  	// If builder is a tester, return the dimension derived by the parent build.
   140  	parentBuild, err := buildbucket.GetBuild(c, int64(parentBuildID), mask)
   141  	if err != nil {
   142  		return nil, nil, errors.Annotate(err, "failed to get properties for parent build %d", int64(parentBuildID)).Err()
   143  	}
   144  	dimens := getRerunDimensions(c, parentBuild, dims)
   145  	return properties, dimens, nil
   146  }
   147  
   148  func getRerunProperties(c context.Context, build *buildbucketpb.Build, props map[string]any) (*structpb.Struct, error) {
   149  	fields := map[string]any{}
   150  	properties := build.GetInput().GetProperties()
   151  	if properties != nil {
   152  		m := properties.GetFields()
   153  		if builderGroup, ok := m["builder_group"]; ok {
   154  			fields["builder_group"] = builderGroup
   155  			fields["target_builder"] = map[string]string{
   156  				"builder": build.Builder.Builder,
   157  				"group":   builderGroup.GetStringValue(),
   158  			}
   159  		}
   160  		if bootstrapProperties, ok := m["$bootstrap/properties"]; ok {
   161  			fields["$bootstrap/properties"] = bootstrapProperties
   162  		}
   163  	}
   164  
   165  	for k, v := range props {
   166  		fields[k] = v
   167  	}
   168  
   169  	spb, err := toStructPB(fields)
   170  	if err != nil {
   171  		return nil, fmt.Errorf("cannot convert %v to structpb: %w", fields, err)
   172  	}
   173  	return spb, nil
   174  }
   175  
   176  func getRerunDimensions(c context.Context, build *buildbucketpb.Build, dims map[string]string) []*buildbucketpb.RequestedDimension {
   177  	result := []*buildbucketpb.RequestedDimension{}
   178  
   179  	// Only copy these dimensions from the analyzed builder to the rerun job request.
   180  	allowedDimensions := map[string]bool{"os": true, "gpu": true}
   181  
   182  	if dimens := util.GetTaskDimensions(build); dimens != nil {
   183  		dimens := util.GetTaskDimensions(build)
   184  		for _, d := range dimens {
   185  			if _, ok := allowedDimensions[d.Key]; ok {
   186  				result = append(result, &buildbucketpb.RequestedDimension{
   187  					Key:   d.Key,
   188  					Value: d.Value,
   189  				})
   190  			}
   191  		}
   192  	}
   193  
   194  	// Add extra dimension from dims
   195  	for k, v := range dims {
   196  		result = append(result, &buildbucketpb.RequestedDimension{
   197  			Key:   k,
   198  			Value: v,
   199  		})
   200  	}
   201  
   202  	return result
   203  }
   204  
   205  // CreateRerunBuildModel creates a CompileRerunBuild (and SingleRerun) in datastore
   206  func CreateRerunBuildModel(c context.Context, build *buildbucketpb.Build, rerunType model.RerunBuildType, suspect *model.Suspect, nsa *model.CompileNthSectionAnalysis, priority int32) (*model.CompileRerunBuild, error) {
   207  	if rerunType == model.RerunBuildType_CulpritVerification && suspect == nil {
   208  		return nil, fmt.Errorf("CreateRerunBuildModel requires suspect when type is CulpritVerification")
   209  	}
   210  	if rerunType == model.RerunBuildType_NthSection && nsa == nil {
   211  		return nil, fmt.Errorf("CreateRerunBuildModel requires nth section analysis when type is NthSection")
   212  	}
   213  
   214  	gitilesCommit := *build.GetInput().GetGitilesCommit()
   215  	startTime := build.StartTime.AsTime()
   216  	createTime := build.CreateTime.AsTime()
   217  	rerunBuild := &model.CompileRerunBuild{
   218  		Id: build.GetId(),
   219  		LuciBuild: model.LuciBuild{
   220  			BuildId:    build.GetId(),
   221  			Project:    build.Builder.Project,
   222  			Bucket:     build.Builder.Bucket,
   223  			Builder:    build.Builder.Builder,
   224  			CreateTime: createTime,
   225  			StartTime:  startTime,
   226  			Status:     build.GetStatus(),
   227  			GitilesCommit: buildbucketpb.GitilesCommit{
   228  				Host:    gitilesCommit.Host,
   229  				Project: gitilesCommit.Project,
   230  				Ref:     gitilesCommit.Ref,
   231  				Id:      gitilesCommit.Id,
   232  			},
   233  		},
   234  	}
   235  	err := datastore.Put(c, rerunBuild)
   236  	if err != nil {
   237  		logging.Errorf(c, "Error in creating CompileRerunBuild model for build %d", build.GetId())
   238  		return nil, err
   239  	}
   240  	dimensions, err := buildbucket.GetBuildTaskDimension(c, build.GetId())
   241  	if err != nil {
   242  		return nil, errors.Annotate(err, "get build task dimension bbid %v", build.GetId()).Err()
   243  	}
   244  	// Create the first SingleRerun for CompileRerunBuild
   245  	// It will be updated when we receive updates from recipe
   246  	singleRerun := &model.SingleRerun{
   247  		RerunBuild: datastore.KeyForObj(c, rerunBuild),
   248  		Status:     pb.RerunStatus_RERUN_STATUS_IN_PROGRESS,
   249  		GitilesCommit: buildbucketpb.GitilesCommit{
   250  			Host:    gitilesCommit.Host,
   251  			Project: gitilesCommit.Project,
   252  			Ref:     gitilesCommit.Ref,
   253  			Id:      gitilesCommit.Id,
   254  		},
   255  		CreateTime: createTime,
   256  		StartTime:  startTime,
   257  		Type:       rerunType,
   258  		Priority:   priority,
   259  		Dimensions: dimensions,
   260  	}
   261  
   262  	if rerunType == model.RerunBuildType_CulpritVerification {
   263  		singleRerun.Analysis = suspect.ParentAnalysis.Parent()
   264  		singleRerun.Suspect = datastore.KeyForObj(c, suspect)
   265  	}
   266  	if rerunType == model.RerunBuildType_NthSection {
   267  		singleRerun.Analysis = nsa.ParentAnalysis
   268  		singleRerun.NthSectionAnalysis = datastore.KeyForObj(c, nsa)
   269  	}
   270  
   271  	err = datastore.Put(c, singleRerun)
   272  	if err != nil {
   273  		logging.Errorf(c, "Error in creating SingleRerun model for build %d", build.GetId())
   274  		return nil, err
   275  	}
   276  
   277  	return rerunBuild, nil
   278  }
   279  
   280  // UpdateCompileRerunStatus updates the start/end time and status of rerun builds and single rerun (when we received buildbucket pubsub messages)
   281  func UpdateCompileRerunStatus(c context.Context, bbid int64) error {
   282  	logging.Infof(c, "UpdateCompileRerunStatus for build %d", bbid)
   283  	rerunModel := &model.CompileRerunBuild{
   284  		Id: bbid,
   285  	}
   286  
   287  	err := datastore.Get(c, rerunModel)
   288  	if err == datastore.ErrNoSuchEntity {
   289  		// There are cases where we cannot find datastore entries, like
   290  		// luci-bisection-dev receives pubsub message for a prod run
   291  		// In this case, just log and return nil
   292  		logging.Warningf(c, "Couldn't find compile rerun to update status: %d", bbid)
   293  		return nil
   294  	}
   295  	if err != nil {
   296  		return errors.Annotate(err, "couldn't get rerun model %d", bbid).Err()
   297  	}
   298  
   299  	lastRerun, err := datastoreutil.GetLastRerunForRerunBuild(c, rerunModel)
   300  	if err != nil {
   301  		return errors.Annotate(err, "failed getting last rerun for build %d", rerunModel.Id).Err()
   302  	}
   303  
   304  	build, err := buildbucket.GetBuild(c, bbid, &buildbucketpb.BuildMask{
   305  		Fields: &fieldmaskpb.FieldMask{
   306  			Paths: []string{"id", "builder", "end_time", "start_time", "status"},
   307  		},
   308  	})
   309  	if err != nil {
   310  		return errors.Annotate(err, "couldn't get build %d", bbid).Err()
   311  	}
   312  
   313  	startTime := build.StartTime.AsTime()
   314  	endTime := build.EndTime.AsTime()
   315  
   316  	err = datastore.RunInTransaction(c, func(ctx context.Context) error {
   317  		e := datastore.Get(c, rerunModel)
   318  		if e != nil {
   319  			return e
   320  		}
   321  		rerunModel.StartTime = startTime
   322  		rerunModel.EndTime = endTime
   323  		rerunModel.Status = build.Status
   324  		return datastore.Put(c, rerunModel)
   325  	}, nil)
   326  
   327  	if err != nil {
   328  		return errors.Annotate(err, "couldn't save rerun model %d", bbid).Err()
   329  	}
   330  
   331  	err = datastore.RunInTransaction(c, func(ctx context.Context) error {
   332  		e := datastore.Get(c, lastRerun)
   333  		if e != nil {
   334  			return e
   335  		}
   336  		buildEnded := build.Status&buildbucketpb.Status_ENDED_MASK == buildbucketpb.Status_ENDED_MASK
   337  		if buildEnded && !lastRerun.HasEnded() {
   338  			// Edge case: when the build ends but the rerun isn't ended,
   339  			// this suggests that there is a infra failure in the rerun build
   340  			// which prevent it from sending back the update via the UpdateAnalysisProgress RPC.
   341  			// TODO (nqmtuan): Perhaps we need to update Analysis and NthSection analysis status too?
   342  			lastRerun.Status = pb.RerunStatus_RERUN_STATUS_INFRA_FAILED
   343  			lastRerun.EndTime = endTime
   344  		}
   345  		lastRerun.StartTime = startTime
   346  		return datastore.Put(c, lastRerun)
   347  	}, nil)
   348  
   349  	if err != nil {
   350  		return errors.Annotate(err, "failed saving last rerun for build %d", rerunModel.Id).Err()
   351  	}
   352  	return nil
   353  }
   354  
   355  // UpdateTestRerunStatus is called when we receive updates from buildbucket
   356  // for test rerun build.
   357  func UpdateTestRerunStatus(ctx context.Context, build *buildbucketpb.Build) error {
   358  	bbid := build.Id
   359  	logging.Infof(ctx, "UpdateTestRerunStatus for build %d", bbid)
   360  	rerunFailed := false
   361  	singleRerun := &model.TestSingleRerun{
   362  		ID: bbid,
   363  	}
   364  
   365  	err := datastore.RunInTransaction(ctx, func(ctx context.Context) error {
   366  		err := datastore.Get(ctx, singleRerun)
   367  		if err == datastore.ErrNoSuchEntity {
   368  			// There are cases where we cannot find datastore entries, like
   369  			// luci-bisection-dev receives pubsub message for a prod run.
   370  			// In this case, just log and return nil.
   371  			logging.Warningf(ctx, "Couldn't find test rerun to update status : %d", bbid)
   372  			return nil
   373  		}
   374  		if err != nil {
   375  			return errors.Annotate(err, "couldn't get TestSingleRerun %d", bbid).Err()
   376  		}
   377  
   378  		singleRerun.LUCIBuild.StartTime = build.StartTime.AsTime()
   379  		singleRerun.LUCIBuild.EndTime = build.EndTime.AsTime()
   380  		singleRerun.LUCIBuild.Status = build.Status
   381  		buildEnded := build.Status&buildbucketpb.Status_ENDED_MASK == buildbucketpb.Status_ENDED_MASK
   382  
   383  		if buildEnded && !singleRerun.HasEnded() {
   384  			// Edge case: when the build ends but the rerun isn't ended,
   385  			// this suggests that there is a infra failure in the rerun build
   386  			// which prevent it from sending back the update via the UpdateTestAnalysisProgress RPC.
   387  			singleRerun.Status = pb.RerunStatus_RERUN_STATUS_INFRA_FAILED
   388  			rerunFailed = true
   389  		}
   390  
   391  		err = datastore.Put(ctx, singleRerun)
   392  		if err != nil {
   393  			return errors.Annotate(err, "couldn't save single rerun %d", bbid).Err()
   394  		}
   395  		return nil
   396  	}, nil)
   397  
   398  	if err != nil {
   399  		return errors.Annotate(err, "saving test single rerun").Err()
   400  	}
   401  
   402  	if rerunFailed {
   403  		tfa, err := datastoreutil.GetTestFailureAnalysis(ctx, singleRerun.AnalysisKey.IntID())
   404  		if err != nil {
   405  			return errors.Annotate(err, "get test failure analysis").Err()
   406  		}
   407  		// Update analysis and nthsection analysis if applicable.
   408  		// The reason why we put it here instead of the above transaction
   409  		// was because read-after-write within a transaction does not work.
   410  		// (https://cloud.google.com/datastore/docs/concepts/transactions#isolation_and_consistency)
   411  		err = testfailureanalysis.UpdateAnalysisStatusWhenError(ctx, tfa)
   412  		if err != nil {
   413  			return errors.Annotate(err, "update analysis status when error").Err()
   414  		}
   415  	}
   416  	return nil
   417  }
   418  
   419  // TODO (nqmtuan): Move this into a helper class if it turns out we need to use
   420  // it for more than one place
   421  // toStructPB convert an any s to structpb.Struct, as long as s is marshallable.
   422  // s can be a general Go type, structpb.Struct type, or mixed.
   423  // For example, s can be a map of mixed type, like
   424  // {"key1": "val1", "key2": structpb.NewStringValue("val2")}
   425  func toStructPB(s any) (*structpb.Struct, error) {
   426  	// We used json as an intermediate format to convert
   427  	j, err := json.Marshal(s)
   428  	if err != nil {
   429  		return nil, err
   430  	}
   431  	var m map[string]any
   432  	if err := json.Unmarshal(j, &m); err != nil {
   433  		return nil, err
   434  	}
   435  	return structpb.NewStruct(m)
   436  }