go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/tasks/sync_builds_with_backend_tasks.go (about)

     1  // Copyright 2023 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tasks
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"time"
    21  
    22  	"google.golang.org/protobuf/types/known/timestamppb"
    23  
    24  	"go.chromium.org/luci/common/clock"
    25  	"go.chromium.org/luci/common/errors"
    26  	"go.chromium.org/luci/common/logging"
    27  	"go.chromium.org/luci/common/retry/transient"
    28  	"go.chromium.org/luci/common/sync/parallel"
    29  	"go.chromium.org/luci/gae/service/datastore"
    30  	"go.chromium.org/luci/server/tq"
    31  
    32  	"go.chromium.org/luci/buildbucket/appengine/internal/clients"
    33  	"go.chromium.org/luci/buildbucket/appengine/internal/config"
    34  	"go.chromium.org/luci/buildbucket/appengine/internal/metrics"
    35  	"go.chromium.org/luci/buildbucket/appengine/model"
    36  	pb "go.chromium.org/luci/buildbucket/proto"
    37  	"go.chromium.org/luci/buildbucket/protoutil"
    38  )
    39  
    40  // Batch size to fetch tasks from backend.
    41  var fetchBatchSize = 1000
    42  
    43  // Batch size to update builds and sub entities in on transaction.
    44  // Transactions are limited to 25 entity groups.
    45  var updateBatchSize = 25
    46  
    47  // queryBuildsToSync runs queries to get incomplete builds from the project running
    48  // on the backend that have reached/exceeded their next sync time.
    49  //
    50  // It will run n parallel queries where n is the number of shards for the backend.
    51  // The queries pass the results to bkC for post process.
    52  func queryBuildsToSync(ctx context.Context, mr parallel.MultiRunner, backend, project string, shards int32, now time.Time, bkC chan []*datastore.Key) error {
    53  	baseQ := datastore.NewQuery(model.BuildKind).Eq("incomplete", true).Eq("backend_target", backend).Eq("project", project)
    54  
    55  	return mr.RunMulti(func(work chan<- func() error) {
    56  		for i := 0; i < int(shards); i++ {
    57  			i := i
    58  			work <- func() error {
    59  				bks := make([]*datastore.Key, 0, fetchBatchSize)
    60  				left := model.ConstructNextSyncTime(backend, project, i, time.Time{})
    61  				right := model.ConstructNextSyncTime(backend, project, i, now)
    62  				q := baseQ.Lt("next_backend_sync_time", right).Gt("next_backend_sync_time", left)
    63  				err := datastore.RunBatch(ctx, int32(fetchBatchSize), q.KeysOnly(true),
    64  					func(bk *datastore.Key) error {
    65  						bks = append(bks, bk)
    66  						if len(bks) == fetchBatchSize {
    67  							bkC <- bks
    68  							bks = make([]*datastore.Key, 0, fetchBatchSize)
    69  						}
    70  						return nil
    71  					},
    72  				)
    73  				if len(bks) > 0 {
    74  					bkC <- bks
    75  				}
    76  				return err
    77  			}
    78  		}
    79  	})
    80  }
    81  
// buildAndInfra pairs a Build entity with its BuildInfra entity.
type buildAndInfra struct {
	// build is the Build entity.
	build *model.Build
	// infra is the BuildInfra entity loaded alongside build.
	infra *model.BuildInfra
}
    86  
    87  func buildHasBeenUpdated(b *model.Build, now time.Time) bool {
    88  	_, _, _, nextSync := b.MustParseNextBackendSyncTime()
    89  	nowUnix := fmt.Sprint(now.Truncate(time.Minute).Unix())
    90  	return nextSync > nowUnix
    91  }
    92  func getEntities(ctx context.Context, bks []*datastore.Key, now time.Time) ([]*buildAndInfra, error) {
    93  	var blds []*model.Build
    94  	var infs []*model.BuildInfra
    95  	var toGet []any
    96  	for _, k := range bks {
    97  		b := &model.Build{}
    98  		populated := datastore.PopulateKey(b, k)
    99  		if !populated {
   100  			continue
   101  		}
   102  		inf := &model.BuildInfra{Build: k}
   103  		blds = append(blds, b)
   104  		infs = append(infs, inf)
   105  		toGet = append(toGet, b, inf)
   106  	}
   107  	if err := datastore.Get(ctx, toGet...); err != nil {
   108  		return nil, errors.Annotate(err, "error fetching builds %q", bks).Err()
   109  	}
   110  
   111  	var entitiesToSync []*buildAndInfra
   112  	for i, bld := range blds {
   113  		inf := infs[i]
   114  		switch {
   115  		case bld == nil || inf == nil:
   116  			continue
   117  		case protoutil.IsEnded(bld.Status):
   118  			continue
   119  		case inf.Proto.GetBackend().GetTask().GetId().GetId() == "":
   120  			// No task is associated to the build, log the error but move on.
   121  			logging.Errorf(ctx, "build %d does not have backend task associated", bld.ID)
   122  			continue
   123  		case buildHasBeenUpdated(bld, now):
   124  			// Build has been updated, skip.
   125  			continue
   126  		}
   127  		entitiesToSync = append(entitiesToSync, &buildAndInfra{build: bld, infra: inf})
   128  	}
   129  	return entitiesToSync, nil
   130  }
   131  
   132  func updateEntities(ctx context.Context, bks []*datastore.Key, now time.Time, taskMap map[string]*pb.Task) ([]*model.Build, error) {
   133  	var endedBld []*model.Build
   134  	err := datastore.RunInTransaction(ctx, func(ctx context.Context) error {
   135  		entities, err := getEntities(ctx, bks, now)
   136  		switch {
   137  		case err != nil:
   138  			return err
   139  		case len(entities) == 0:
   140  			// Nothing to sync.
   141  			return nil
   142  		}
   143  		logging.Infof(ctx, "updating %d builds with their backend tasks", len(bks))
   144  
   145  		var toPut []any
   146  		for _, ent := range entities {
   147  			bld := ent.build
   148  			inf := ent.infra
   149  			t := inf.Proto.Backend.GetTask()
   150  			taskID := t.GetId().GetId()
   151  			if taskID == "" {
   152  				// impossible.
   153  				logging.Errorf(ctx, "failed to get backend task id for build %d", bld.ID)
   154  				continue
   155  			}
   156  			fetchedTask := taskMap[taskID]
   157  			switch {
   158  			case fetchedTask == nil:
   159  				logging.Errorf(ctx, "backend task %s:%s is not in valid fetched tasks", t.GetId().GetTarget(), taskID)
   160  				continue
   161  			case fetchedTask.UpdateId < t.UpdateId:
   162  				logging.Errorf(ctx, "FetchTasks returns stale task for %s:%s with update_id %d, which task in datastore has update_id %d", t.GetId().GetTarget(), taskID, fetchedTask.UpdateId, t.UpdateId)
   163  				continue
   164  			case fetchedTask.UpdateId == t.UpdateId:
   165  				// No update from the task, so it's still running.
   166  				// Update build's UpdateTime (so that NextBackendSyncTime is
   167  				// recalculated when save) and we're done.
   168  				bld.Proto.UpdateTime = timestamppb.New(clock.Now(ctx))
   169  				toPut = append(toPut, bld)
   170  				continue
   171  			}
   172  			toSave, err := prepareUpdate(ctx, bld, inf, fetchedTask)
   173  			if err != nil {
   174  				logging.Errorf(ctx, "failed to update task for build %d: %s", bld.ID, err)
   175  				continue
   176  			}
   177  			toPut = append(toPut, toSave...)
   178  
   179  			if protoutil.IsEnded(fetchedTask.Status) {
   180  				endedBld = append(endedBld, bld)
   181  			}
   182  		}
   183  		return datastore.Put(ctx, toPut)
   184  	}, nil)
   185  	return endedBld, err
   186  }
   187  
   188  // validateResponses iterates through FetchTaskResponse.Responses and logs the
   189  // taskIDs that returned with errors and returns a map of taskIDs to valid tasks.
   190  func validateResponses(ctx context.Context, responses []*pb.FetchTasksResponse_Response, numTaskIDsRequsted int) (map[string]*pb.Task, errors.MultiError) {
   191  	if len(responses) != numTaskIDsRequsted {
   192  		return nil, errors.NewMultiError(errors.New(fmt.Sprintf("FetchTasksResponse returned with %d responses when %d were requested", len(responses), numTaskIDsRequsted)))
   193  	}
   194  	var err errors.MultiError
   195  	taskMap := map[string]*pb.Task{}
   196  	var validTaskIDs []string
   197  	fetchedCount := 0
   198  	for idx, resp := range responses {
   199  		switch r := resp.Response.(type) {
   200  		case *pb.FetchTasksResponse_Response_Task:
   201  			fetchedCount += 1
   202  			if e := validateTask(r.Task, true); e != nil {
   203  				err.MaybeAdd(e)
   204  				continue
   205  			}
   206  			taskMap[resp.GetTask().Id.GetId()] = resp.GetTask()
   207  			validTaskIDs = append(validTaskIDs, resp.GetTask().Id.GetId())
   208  		case *pb.FetchTasksResponse_Response_Error:
   209  			status := resp.GetError()
   210  			err.MaybeAdd(errors.New(fmt.Sprintf("Error at index %d: %d-%s", idx, status.Code, status.Message)))
   211  		}
   212  	}
   213  	// TODO(crbug.com/1472896): Remove the log after confirming the build task
   214  	// sync cron WAI.
   215  	if len(validTaskIDs) > 0 {
   216  		logging.Infof(ctx, "requested %d tasks, fetched %d, valid %d: %q", len(responses), fetchedCount, len(validTaskIDs), validTaskIDs)
   217  	}
   218  	return taskMap, err
   219  }
   220  
   221  // syncBuildsWithBackendTasks fetches backend tasks for the builds of a project,
   222  // then updates the builds.
   223  //
   224  // The task only retries if there's top level errors. In the case that a single
   225  // build is failed to update, we'll wait for the next task to update it again.
   226  func syncBuildsWithBackendTasks(ctx context.Context, mr parallel.MultiRunner, bc *clients.BackendClient, bks []*datastore.Key, now time.Time) error {
   227  	if len(bks) == 0 {
   228  		return nil
   229  	}
   230  
   231  	entities, err := getEntities(ctx, bks, now)
   232  	switch {
   233  	case err != nil:
   234  		return err
   235  	case len(entities) == 0:
   236  		// Nothing to sync.
   237  		return nil
   238  	}
   239  
   240  	// Fetch backend tasks.
   241  	var taskIDs []*pb.TaskID
   242  	for _, ent := range entities {
   243  		taskIDs = append(taskIDs, ent.infra.Proto.Backend.Task.Id)
   244  	}
   245  	if len(taskIDs) == 0 {
   246  		return nil
   247  	}
   248  	// TODO(crbug.com/1472896): Simplify the log after confirming the build task
   249  	// sync cron WAI.
   250  	logging.Infof(ctx, "Fetching %d backend tasks %q", len(taskIDs), taskIDs)
   251  	resp, err := bc.FetchTasks(ctx, &pb.FetchTasksRequest{TaskIds: taskIDs})
   252  	if err != nil {
   253  		return errors.Annotate(err, "failed to fetch backend tasks").Err()
   254  	}
   255  
   256  	// Validate fetched tasks and create a task map with validated tasks.
   257  	taskMap, errs := validateResponses(ctx, resp.Responses, len(taskIDs))
   258  	if errs.First() != nil {
   259  		logging.Errorf(ctx, errs.AsError().Error())
   260  		// Return early since taskMap must be empty.
   261  		if len(errs) == len(taskIDs) {
   262  			return errs
   263  		}
   264  	}
   265  
   266  	// Update entities for the builds that need to sync.
   267  	curBatch := make([]*datastore.Key, 0, updateBatchSize)
   268  	var bksBatchesToSync [][]*datastore.Key
   269  	for _, ent := range entities {
   270  		curBatch = append(curBatch, datastore.KeyForObj(ctx, ent.build))
   271  		if len(curBatch) == updateBatchSize {
   272  			bksBatchesToSync = append(bksBatchesToSync, curBatch)
   273  			curBatch = make([]*datastore.Key, 0, updateBatchSize)
   274  		}
   275  	}
   276  	if len(curBatch) > 0 {
   277  		bksBatchesToSync = append(bksBatchesToSync, curBatch)
   278  	}
   279  	var endedBld []*model.Build
   280  	for _, batch := range bksBatchesToSync {
   281  		batch := batch
   282  		err := mr.RunMulti(func(work chan<- func() error) {
   283  			work <- func() error {
   284  				endedBldInBatch, txErr := updateEntities(ctx, batch, now, taskMap)
   285  				if txErr != nil {
   286  					return transient.Tag.Apply(errors.Annotate(err, "failed to sync backend tasks").Err())
   287  				}
   288  				endedBld = append(endedBld, endedBldInBatch...)
   289  				return nil
   290  			}
   291  		})
   292  		if err != nil {
   293  			return err
   294  		}
   295  	}
   296  
   297  	for _, b := range endedBld {
   298  		metrics.BuildCompleted(ctx, b)
   299  	}
   300  
   301  	return nil
   302  }
   303  
   304  // SyncBuildsWithBackendTasks syncs all the builds belongs to `project` running
   305  // on `backend` with their backend tasks if their next sync time have been
   306  // exceeded.
   307  func SyncBuildsWithBackendTasks(ctx context.Context, backend, project string) error {
   308  	globalCfg, err := config.GetSettingsCfg(ctx)
   309  	if err != nil {
   310  		return errors.Annotate(err, "could not get global settings config").Err()
   311  	}
   312  
   313  	var shards int32
   314  	backendFound := false
   315  	for _, config := range globalCfg.Backends {
   316  		if config.Target == backend {
   317  			if config.GetFullMode() == nil {
   318  				// No need to sync tasks if it's not in a full mode.
   319  				return nil
   320  			}
   321  			backendFound = true
   322  			shards = config.GetFullMode().GetBuildSyncSetting().GetShards()
   323  		}
   324  	}
   325  	if !backendFound {
   326  		return tq.Fatal.Apply(errors.Reason("failed to find backend %s from global config", backend).Err())
   327  	}
   328  
   329  	bc, err := clients.NewBackendClient(ctx, project, backend, globalCfg)
   330  	if err != nil {
   331  		return tq.Fatal.Apply(errors.Annotate(err, "failed to connect to backend service %s as project %s", backend, project).Err())
   332  	}
   333  
   334  	now := clock.Now(ctx)
   335  	if shards == 0 {
   336  		shards = 1
   337  	}
   338  	nWorkers := int(shards)
   339  	return parallel.RunMulti(ctx, nWorkers, func(mr parallel.MultiRunner) error {
   340  		return mr.RunMulti(func(work chan<- func() error) {
   341  			bkC := make(chan []*datastore.Key)
   342  
   343  			work <- func() error {
   344  				defer close(bkC)
   345  				return queryBuildsToSync(ctx, mr, backend, project, shards, now, bkC)
   346  			}
   347  
   348  			for bks := range bkC {
   349  				bks := bks
   350  				work <- func() error {
   351  					return syncBuildsWithBackendTasks(ctx, mr, bc, bks, now)
   352  				}
   353  			}
   354  		})
   355  	})
   356  }