go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/buildbucket/appengine/tasks/sync_builds_with_backend_tasks.go (about) 1 // Copyright 2023 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tasks 16 17 import ( 18 "context" 19 "fmt" 20 "time" 21 22 "google.golang.org/protobuf/types/known/timestamppb" 23 24 "go.chromium.org/luci/common/clock" 25 "go.chromium.org/luci/common/errors" 26 "go.chromium.org/luci/common/logging" 27 "go.chromium.org/luci/common/retry/transient" 28 "go.chromium.org/luci/common/sync/parallel" 29 "go.chromium.org/luci/gae/service/datastore" 30 "go.chromium.org/luci/server/tq" 31 32 "go.chromium.org/luci/buildbucket/appengine/internal/clients" 33 "go.chromium.org/luci/buildbucket/appengine/internal/config" 34 "go.chromium.org/luci/buildbucket/appengine/internal/metrics" 35 "go.chromium.org/luci/buildbucket/appengine/model" 36 pb "go.chromium.org/luci/buildbucket/proto" 37 "go.chromium.org/luci/buildbucket/protoutil" 38 ) 39 40 // Batch size to fetch tasks from backend. 41 var fetchBatchSize = 1000 42 43 // Batch size to update builds and sub entities in on transaction. 44 // Transactions are limited to 25 entity groups. 45 var updateBatchSize = 25 46 47 // queryBuildsToSync runs queries to get incomplete builds from the project running 48 // on the backend that have reached/exceeded their next sync time. 49 // 50 // It will run n parallel queries where n is the number of shards for the backend. 51 // The queries pass the results to bkC for post process. 52 func queryBuildsToSync(ctx context.Context, mr parallel.MultiRunner, backend, project string, shards int32, now time.Time, bkC chan []*datastore.Key) error { 53 baseQ := datastore.NewQuery(model.BuildKind).Eq("incomplete", true).Eq("backend_target", backend).Eq("project", project) 54 55 return mr.RunMulti(func(work chan<- func() error) { 56 for i := 0; i < int(shards); i++ { 57 i := i 58 work <- func() error { 59 bks := make([]*datastore.Key, 0, fetchBatchSize) 60 left := model.ConstructNextSyncTime(backend, project, i, time.Time{}) 61 right := model.ConstructNextSyncTime(backend, project, i, now) 62 q := baseQ.Lt("next_backend_sync_time", right).Gt("next_backend_sync_time", left) 63 err := datastore.RunBatch(ctx, int32(fetchBatchSize), q.KeysOnly(true), 64 func(bk *datastore.Key) error { 65 bks = append(bks, bk) 66 if len(bks) == fetchBatchSize { 67 bkC <- bks 68 bks = make([]*datastore.Key, 0, fetchBatchSize) 69 } 70 return nil 71 }, 72 ) 73 if len(bks) > 0 { 74 bkC <- bks 75 } 76 return err 77 } 78 } 79 }) 80 } 81 82 type buildAndInfra struct { 83 build *model.Build 84 infra *model.BuildInfra 85 } 86 87 func buildHasBeenUpdated(b *model.Build, now time.Time) bool { 88 _, _, _, nextSync := b.MustParseNextBackendSyncTime() 89 nowUnix := fmt.Sprint(now.Truncate(time.Minute).Unix()) 90 return nextSync > nowUnix 91 } 92 func getEntities(ctx context.Context, bks []*datastore.Key, now time.Time) ([]*buildAndInfra, error) { 93 var blds []*model.Build 94 var infs []*model.BuildInfra 95 var toGet []any 96 for _, k := range bks { 97 b := &model.Build{} 98 populated := datastore.PopulateKey(b, k) 99 if !populated { 100 continue 101 } 102 inf := &model.BuildInfra{Build: k} 103 blds = append(blds, b) 104 infs = append(infs, inf) 105 toGet = append(toGet, b, inf) 106 } 107 if err := datastore.Get(ctx, toGet...); err != nil { 108 return nil, errors.Annotate(err, "error fetching builds %q", bks).Err() 109 } 110 111 var entitiesToSync []*buildAndInfra 112 for i, bld := range blds { 113 inf := infs[i] 114 switch { 115 case bld == nil || inf == nil: 116 continue 117 case protoutil.IsEnded(bld.Status): 118 continue 119 case inf.Proto.GetBackend().GetTask().GetId().GetId() == "": 120 // No task is associated to the build, log the error but move on. 121 logging.Errorf(ctx, "build %d does not have backend task associated", bld.ID) 122 continue 123 case buildHasBeenUpdated(bld, now): 124 // Build has been updated, skip. 125 continue 126 } 127 entitiesToSync = append(entitiesToSync, &buildAndInfra{build: bld, infra: inf}) 128 } 129 return entitiesToSync, nil 130 } 131 132 func updateEntities(ctx context.Context, bks []*datastore.Key, now time.Time, taskMap map[string]*pb.Task) ([]*model.Build, error) { 133 var endedBld []*model.Build 134 err := datastore.RunInTransaction(ctx, func(ctx context.Context) error { 135 entities, err := getEntities(ctx, bks, now) 136 switch { 137 case err != nil: 138 return err 139 case len(entities) == 0: 140 // Nothing to sync. 141 return nil 142 } 143 logging.Infof(ctx, "updating %d builds with their backend tasks", len(bks)) 144 145 var toPut []any 146 for _, ent := range entities { 147 bld := ent.build 148 inf := ent.infra 149 t := inf.Proto.Backend.GetTask() 150 taskID := t.GetId().GetId() 151 if taskID == "" { 152 // impossible. 153 logging.Errorf(ctx, "failed to get backend task id for build %d", bld.ID) 154 continue 155 } 156 fetchedTask := taskMap[taskID] 157 switch { 158 case fetchedTask == nil: 159 logging.Errorf(ctx, "backend task %s:%s is not in valid fetched tasks", t.GetId().GetTarget(), taskID) 160 continue 161 case fetchedTask.UpdateId < t.UpdateId: 162 logging.Errorf(ctx, "FetchTasks returns stale task for %s:%s with update_id %d, which task in datastore has update_id %d", t.GetId().GetTarget(), taskID, fetchedTask.UpdateId, t.UpdateId) 163 continue 164 case fetchedTask.UpdateId == t.UpdateId: 165 // No update from the task, so it's still running. 166 // Update build's UpdateTime (so that NextBackendSyncTime is 167 // recalculated when save) and we're done. 168 bld.Proto.UpdateTime = timestamppb.New(clock.Now(ctx)) 169 toPut = append(toPut, bld) 170 continue 171 } 172 toSave, err := prepareUpdate(ctx, bld, inf, fetchedTask) 173 if err != nil { 174 logging.Errorf(ctx, "failed to update task for build %d: %s", bld.ID, err) 175 continue 176 } 177 toPut = append(toPut, toSave...) 178 179 if protoutil.IsEnded(fetchedTask.Status) { 180 endedBld = append(endedBld, bld) 181 } 182 } 183 return datastore.Put(ctx, toPut) 184 }, nil) 185 return endedBld, err 186 } 187 188 // validateResponses iterates through FetchTaskResponse.Responses and logs the 189 // taskIDs that returned with errors and returns a map of taskIDs to valid tasks. 190 func validateResponses(ctx context.Context, responses []*pb.FetchTasksResponse_Response, numTaskIDsRequsted int) (map[string]*pb.Task, errors.MultiError) { 191 if len(responses) != numTaskIDsRequsted { 192 return nil, errors.NewMultiError(errors.New(fmt.Sprintf("FetchTasksResponse returned with %d responses when %d were requested", len(responses), numTaskIDsRequsted))) 193 } 194 var err errors.MultiError 195 taskMap := map[string]*pb.Task{} 196 var validTaskIDs []string 197 fetchedCount := 0 198 for idx, resp := range responses { 199 switch r := resp.Response.(type) { 200 case *pb.FetchTasksResponse_Response_Task: 201 fetchedCount += 1 202 if e := validateTask(r.Task, true); e != nil { 203 err.MaybeAdd(e) 204 continue 205 } 206 taskMap[resp.GetTask().Id.GetId()] = resp.GetTask() 207 validTaskIDs = append(validTaskIDs, resp.GetTask().Id.GetId()) 208 case *pb.FetchTasksResponse_Response_Error: 209 status := resp.GetError() 210 err.MaybeAdd(errors.New(fmt.Sprintf("Error at index %d: %d-%s", idx, status.Code, status.Message))) 211 } 212 } 213 // TODO(crbug.com/1472896): Remove the log after confirming the build task 214 // sync cron WAI. 215 if len(validTaskIDs) > 0 { 216 logging.Infof(ctx, "requested %d tasks, fetched %d, valid %d: %q", len(responses), fetchedCount, len(validTaskIDs), validTaskIDs) 217 } 218 return taskMap, err 219 } 220 221 // syncBuildsWithBackendTasks fetches backend tasks for the builds of a project, 222 // then updates the builds. 223 // 224 // The task only retries if there's top level errors. In the case that a single 225 // build is failed to update, we'll wait for the next task to update it again. 226 func syncBuildsWithBackendTasks(ctx context.Context, mr parallel.MultiRunner, bc *clients.BackendClient, bks []*datastore.Key, now time.Time) error { 227 if len(bks) == 0 { 228 return nil 229 } 230 231 entities, err := getEntities(ctx, bks, now) 232 switch { 233 case err != nil: 234 return err 235 case len(entities) == 0: 236 // Nothing to sync. 237 return nil 238 } 239 240 // Fetch backend tasks. 241 var taskIDs []*pb.TaskID 242 for _, ent := range entities { 243 taskIDs = append(taskIDs, ent.infra.Proto.Backend.Task.Id) 244 } 245 if len(taskIDs) == 0 { 246 return nil 247 } 248 // TODO(crbug.com/1472896): Simplify the log after confirming the build task 249 // sync cron WAI. 250 logging.Infof(ctx, "Fetching %d backend tasks %q", len(taskIDs), taskIDs) 251 resp, err := bc.FetchTasks(ctx, &pb.FetchTasksRequest{TaskIds: taskIDs}) 252 if err != nil { 253 return errors.Annotate(err, "failed to fetch backend tasks").Err() 254 } 255 256 // Validate fetched tasks and create a task map with validated tasks. 257 taskMap, errs := validateResponses(ctx, resp.Responses, len(taskIDs)) 258 if errs.First() != nil { 259 logging.Errorf(ctx, errs.AsError().Error()) 260 // Return early since taskMap must be empty. 261 if len(errs) == len(taskIDs) { 262 return errs 263 } 264 } 265 266 // Update entities for the builds that need to sync. 267 curBatch := make([]*datastore.Key, 0, updateBatchSize) 268 var bksBatchesToSync [][]*datastore.Key 269 for _, ent := range entities { 270 curBatch = append(curBatch, datastore.KeyForObj(ctx, ent.build)) 271 if len(curBatch) == updateBatchSize { 272 bksBatchesToSync = append(bksBatchesToSync, curBatch) 273 curBatch = make([]*datastore.Key, 0, updateBatchSize) 274 } 275 } 276 if len(curBatch) > 0 { 277 bksBatchesToSync = append(bksBatchesToSync, curBatch) 278 } 279 var endedBld []*model.Build 280 for _, batch := range bksBatchesToSync { 281 batch := batch 282 err := mr.RunMulti(func(work chan<- func() error) { 283 work <- func() error { 284 endedBldInBatch, txErr := updateEntities(ctx, batch, now, taskMap) 285 if txErr != nil { 286 return transient.Tag.Apply(errors.Annotate(err, "failed to sync backend tasks").Err()) 287 } 288 endedBld = append(endedBld, endedBldInBatch...) 289 return nil 290 } 291 }) 292 if err != nil { 293 return err 294 } 295 } 296 297 for _, b := range endedBld { 298 metrics.BuildCompleted(ctx, b) 299 } 300 301 return nil 302 } 303 304 // SyncBuildsWithBackendTasks syncs all the builds belongs to `project` running 305 // on `backend` with their backend tasks if their next sync time have been 306 // exceeded. 307 func SyncBuildsWithBackendTasks(ctx context.Context, backend, project string) error { 308 globalCfg, err := config.GetSettingsCfg(ctx) 309 if err != nil { 310 return errors.Annotate(err, "could not get global settings config").Err() 311 } 312 313 var shards int32 314 backendFound := false 315 for _, config := range globalCfg.Backends { 316 if config.Target == backend { 317 if config.GetFullMode() == nil { 318 // No need to sync tasks if it's not in a full mode. 319 return nil 320 } 321 backendFound = true 322 shards = config.GetFullMode().GetBuildSyncSetting().GetShards() 323 } 324 } 325 if !backendFound { 326 return tq.Fatal.Apply(errors.Reason("failed to find backend %s from global config", backend).Err()) 327 } 328 329 bc, err := clients.NewBackendClient(ctx, project, backend, globalCfg) 330 if err != nil { 331 return tq.Fatal.Apply(errors.Annotate(err, "failed to connect to backend service %s as project %s", backend, project).Err()) 332 } 333 334 now := clock.Now(ctx) 335 if shards == 0 { 336 shards = 1 337 } 338 nWorkers := int(shards) 339 return parallel.RunMulti(ctx, nWorkers, func(mr parallel.MultiRunner) error { 340 return mr.RunMulti(func(work chan<- func() error) { 341 bkC := make(chan []*datastore.Key) 342 343 work <- func() error { 344 defer close(bkC) 345 return queryBuildsToSync(ctx, mr, backend, project, shards, now, bkC) 346 } 347 348 for bks := range bkC { 349 bks := bks 350 work <- func() error { 351 return syncBuildsWithBackendTasks(ctx, mr, bc, bks, now) 352 } 353 } 354 }) 355 }) 356 }