go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/gce/appengine/backend/queues.go (about) 1 // Copyright 2018 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package backend 16 17 import ( 18 "context" 19 "fmt" 20 "math/rand" 21 "time" 22 23 "github.com/golang/protobuf/proto" 24 25 "google.golang.org/api/googleapi" 26 "google.golang.org/protobuf/types/known/timestamppb" 27 28 "go.chromium.org/luci/appengine/tq" 29 "go.chromium.org/luci/common/clock" 30 "go.chromium.org/luci/common/data/rand/mathrand" 31 "go.chromium.org/luci/common/data/stringset" 32 "go.chromium.org/luci/common/errors" 33 "go.chromium.org/luci/common/logging" 34 "go.chromium.org/luci/gae/service/datastore" 35 36 "go.chromium.org/luci/gce/api/tasks/v1" 37 "go.chromium.org/luci/gce/appengine/backend/internal/metrics" 38 "go.chromium.org/luci/gce/appengine/model" 39 ) 40 41 // countVMsQueue is the name of the count VMs task handler queue. 42 const countVMsQueue = "count-vms" 43 44 // countVMs counts the VMs for a given config. 45 func countVMs(c context.Context, payload proto.Message) error { 46 task, ok := payload.(*tasks.CountVMs) 47 switch { 48 case !ok: 49 return errors.Reason("unexpected payload type %T", payload).Err() 50 case task.GetId() == "": 51 return errors.Reason("ID is required").Err() 52 } 53 // Count VMs per project, server and zone. 54 // VMs created from the same config eventually have the same project, server, 55 // and zone but may currently exist for a previous version of the config. 56 vms := &metrics.InstanceCount{} 57 58 // Get the configured count. 59 cfg := &model.Config{ 60 ID: task.Id, 61 } 62 switch err := datastore.Get(c, cfg); { 63 case err == datastore.ErrNoSuchEntity: 64 case err != nil: 65 return errors.Annotate(err, "failed to fetch config").Err() 66 default: 67 vms.AddConfigured(int(cfg.Config.CurrentAmount), cfg.Config.Attributes.Project) 68 } 69 70 // Get the actual (connected, created) counts. 71 vm := &model.VM{} 72 q := datastore.NewQuery(model.VMKind).Eq("config", task.Id) 73 if err := datastore.Run(c, q, func(k *datastore.Key) error { 74 id := k.StringID() 75 vm.ID = id 76 switch err := datastore.Get(c, vm); { 77 case err == datastore.ErrNoSuchEntity: 78 return nil 79 case err != nil: 80 return errors.Annotate(err, "failed to fetch VM").Err() 81 default: 82 if vm.Created > 0 { 83 vms.AddCreated(1, vm.Attributes.Project, vm.Attributes.Zone) 84 } 85 if vm.Connected > 0 { 86 vms.AddConnected(1, vm.Attributes.Project, vm.Swarming, vm.Attributes.Zone) 87 } 88 return nil 89 } 90 }); err != nil { 91 return errors.Annotate(err, "failed to fetch VMs").Err() 92 } 93 if err := vms.Update(c, task.Id); err != nil { 94 return errors.Annotate(err, "failed to update count").Err() 95 } 96 return nil 97 } 98 99 // drainVMQueue is the name of the drain VM task handler queue. 100 const drainVMQueue = "drain-vm" 101 102 func drainVMQueueHandler(c context.Context, payload proto.Message) error { 103 task, ok := payload.(*tasks.DrainVM) 104 switch { 105 case !ok: 106 return errors.Reason("unexpected payload %q", payload).Err() 107 case task.GetId() == "": 108 return errors.Reason("ID is required").Err() 109 } 110 vm := &model.VM{ 111 ID: task.Id, 112 } 113 switch err := datastore.Get(c, vm); { 114 case errors.Is(err, datastore.ErrNoSuchEntity): 115 return nil 116 case err != nil: 117 return errors.Annotate(err, "failed to fetch VM").Err() 118 case vm.URL == "": 119 logging.Debugf(c, "instance %q does not exist", vm.Hostname) 120 return nil 121 } 122 return drainVM(c, vm) 123 } 124 125 // drainVM drains a given VM if necessary. 126 func drainVM(c context.Context, vm *model.VM) error { 127 if vm.Drained { 128 return nil 129 } 130 cfg := &model.Config{ 131 ID: vm.Config, 132 } 133 switch err := datastore.Get(c, cfg); { 134 case err == datastore.ErrNoSuchEntity: 135 logging.Debugf(c, "config %q does not exist", cfg.ID) 136 case err != nil: 137 return errors.Annotate(err, "failed to fetch config").Err() 138 } 139 if vm.DUT != "" { 140 // DUT is still present in config. 141 // Index is not available for VM mapped to DUT due to different sequences of creation. 142 duts := cfg.Config.GetDuts() 143 if _, ok := duts[vm.DUT]; ok { 144 return nil 145 } 146 logging.Debugf(c, "config %q only specifies %d VMs", cfg.ID, cfg.Config.GetCurrentAmount()) 147 } else { 148 // This VM is below the currentAmount threshold and should not be drained. 149 if cfg.Config.GetCurrentAmount() > vm.Index { 150 return nil 151 } 152 logging.Debugf(c, "config %q only specifies %d VMs", cfg.ID, cfg.Config.GetCurrentAmount()) 153 } 154 return datastore.RunInTransaction(c, func(c context.Context) error { 155 switch err := datastore.Get(c, vm); { 156 case err == datastore.ErrNoSuchEntity: 157 vm.Drained = true 158 return nil 159 case err != nil: 160 return errors.Annotate(err, "failed to fetch VM").Err() 161 case vm.Drained: 162 return nil 163 } 164 vm.Drained = true 165 logging.Debugf(c, "set VM %s as drained in db", vm.Hostname) 166 if err := datastore.Put(c, vm); err != nil { 167 return errors.Annotate(err, "failed to store VM").Err() 168 } 169 return nil 170 }, nil) 171 } 172 173 // getSuffix returns a random suffix to use when naming a GCE instance. 174 func getSuffix(c context.Context) string { 175 const allowed = "abcdefghijklmnopqrstuvwxyz0123456789" 176 suf := make([]byte, 4) 177 for i := range suf { 178 suf[i] = allowed[mathrand.Intn(c, len(allowed))] 179 } 180 return string(suf) 181 } 182 183 // createVMQueue is the name of the create VM task handler queue. 184 const createVMQueue = "create-vm" 185 186 // createVM creates a VM if it doesn't already exist. 187 func createVM(c context.Context, payload proto.Message) error { 188 task, ok := payload.(*tasks.CreateVM) 189 switch { 190 case !ok: 191 return errors.Reason("unexpected payload type %T", payload).Err() 192 case task.GetId() == "": 193 return errors.Reason("ID is required").Err() 194 case task.GetConfig() == "": 195 return errors.Reason("config is required").Err() 196 } 197 198 // VMs paired with DUTs cannot rely on index for hostname uniqueness. 199 // Instead, we rely on timestamp see getUniqueID. 200 var hostname string 201 if task.DUT != "" { 202 hostname = task.Id 203 } else { 204 hostname = fmt.Sprintf("%s-%d-%s", task.Prefix, task.Index, getSuffix(c)) 205 } 206 vm := &model.VM{ 207 ID: task.Id, 208 Config: task.Config, 209 Configured: clock.Now(c).Unix(), 210 DUT: task.DUT, 211 Hostname: hostname, 212 Index: task.Index, 213 Lifetime: task.Lifetime, 214 Prefix: task.Prefix, 215 Revision: task.Revision, 216 Swarming: task.Swarming, 217 Timeout: task.Timeout, 218 } 219 if task.Attributes != nil { 220 vm.Attributes = *task.Attributes 221 // TODO(crbug/942301): Auto-select zone if zone is unspecified. 222 vm.Attributes.SetZone(vm.Attributes.GetZone()) 223 vm.IndexAttributes() 224 } 225 // createVM is called repeatedly, so do a fast check outside the transaction. 226 // In most cases, this will skip the more expensive transactional check. 227 switch err := datastore.Get(c, vm); { 228 case err == datastore.ErrNoSuchEntity: 229 case err != nil: 230 return errors.Annotate(err, "failed to fetch VM").Err() 231 default: 232 return nil 233 } 234 return datastore.RunInTransaction(c, func(c context.Context) error { 235 switch err := datastore.Get(c, vm); { 236 case err == datastore.ErrNoSuchEntity: 237 case err != nil: 238 return errors.Annotate(err, "failed to fetch VM").Err() 239 default: 240 return nil 241 } 242 if err := datastore.Put(c, vm); err != nil { 243 return errors.Annotate(err, "failed to store VM").Err() 244 } 245 return nil 246 }, nil) 247 } 248 249 // updateCurrentAmount updates CurrentAmount if necessary. 250 // Returns up-to-date config entity and the reference timestamp. 251 func updateCurrentAmount(c context.Context, id string) (cfg *model.Config, now time.Time, err error) { 252 cfg = &model.Config{ 253 ID: id, 254 } 255 // Avoid transaction if possible. 256 if err = datastore.Get(c, cfg); err != nil { 257 err = errors.Annotate(err, "failed to fetch config").Err() 258 return 259 } 260 261 now = clock.Now(c) 262 var amt int32 263 switch amt, err = cfg.Config.ComputeAmount(cfg.Config.CurrentAmount, now); { 264 case err != nil: 265 err = errors.Annotate(err, "failed to parse amount").Err() 266 return 267 case cfg.Config.CurrentAmount == amt: 268 return 269 } 270 271 err = datastore.RunInTransaction(c, func(c context.Context) error { 272 var err error 273 if err = datastore.Get(c, cfg); err != nil { 274 return errors.Annotate(err, "failed to fetch config").Err() 275 } 276 277 now = clock.Now(c) 278 switch amt, err = cfg.Config.ComputeAmount(cfg.Config.CurrentAmount, now); { 279 case err != nil: 280 return errors.Annotate(err, "failed to parse amount").Err() 281 case cfg.Config.CurrentAmount == amt: 282 return nil 283 } 284 cfg.Config.CurrentAmount = amt 285 logging.Debugf(c, "set config %q to allow %d VMs", cfg.ID, cfg.Config.CurrentAmount) 286 if err = datastore.Put(c, cfg); err != nil { 287 return errors.Annotate(err, "failed to store config").Err() 288 } 289 return nil 290 }, nil) 291 return 292 } 293 294 // getCurrentVMsByPrefix returns all the VMs in the datastore by prefix 295 func getCurrentVMsByPrefix(ctx context.Context, prefix string) ([]*model.VM, error) { 296 q := datastore.NewQuery(model.VMKind).Eq("prefix", prefix) 297 vms := make([]*model.VM, 0) 298 if err := datastore.Run(ctx, q, func(vm *model.VM) { 299 vms = append(vms, vm) 300 }); err != nil { 301 return nil, errors.Annotate(err, "failed to fetch vms for %s", prefix).Err() 302 } 303 return vms, nil 304 } 305 306 // expandConfigQueue is the name of the expand config task handler queue. 307 const expandConfigQueue = "expand-config" 308 309 // expandConfig creates task queue tasks to create each VM in the given config. 310 func expandConfig(c context.Context, payload proto.Message) error { 311 task, ok := payload.(*tasks.ExpandConfig) 312 switch { 313 case !ok: 314 return errors.Reason("unexpected payload type %T", payload).Err() 315 case task.GetId() == "": 316 return errors.Reason("ID is required").Err() 317 } 318 cfg, now, err := updateCurrentAmount(c, task.Id) 319 if err != nil { 320 return err 321 } 322 // Measure the time taken for this query, For debugging purposes 323 start := time.Now() 324 vms, err := getCurrentVMsByPrefix(c, cfg.Config.Prefix) 325 rt := time.Since(start) 326 logging.Debugf(c, "getCurrentVMsByPrefix[%s]: error - %v #VMs - %d", rt, err, len(vms)) 327 if err != nil { 328 return err 329 } 330 331 var t []*tq.Task 332 // DUTs take priority. 333 if len(cfg.Config.GetDuts()) > 0 { 334 t, err = createTasksPerDUT(c, vms, cfg, now) 335 } else { 336 t, err = createTasksPerAmount(c, vms, cfg, now) 337 } 338 if err != nil { 339 return err 340 } 341 342 logging.Debugf(c, "for config %s, creating %d VMs", cfg.Config.Prefix, len(t)) 343 if err := getDispatcher(c).AddTask(c, t...); err != nil { 344 return errors.Annotate(err, "failed to schedule tasks").Err() 345 } 346 return nil 347 } 348 349 // getUniqueID returns a unique ID based on Unix time in milliseconds. 350 func getUniqueID(c context.Context, prefix string) string { 351 ms := clock.Now(c).UnixMilli() 352 return fmt.Sprintf("%s-%d-%s", prefix, ms, getSuffix(c)) 353 } 354 355 // createTasksPerDUT returns a slice of CreateVM tasks based on config.Duts. 356 func createTasksPerDUT(c context.Context, vms []*model.VM, cfg *model.Config, now time.Time) ([]*tq.Task, error) { 357 logging.Debugf(c, "CloudBots flow entered for config %s", cfg.Config.Prefix) 358 if len(cfg.Config.Duts) == 0 { 359 return nil, errors.Reason("config.DUTs cannot be empty").Err() 360 } 361 existingVMs := make(map[string]string, len(vms)) 362 for _, vm := range vms { 363 existingVMs[vm.DUT] = vm.Hostname 364 } 365 var t []*tq.Task 366 var i int32 = 0 367 for dut := range cfg.Config.Duts { 368 if vm, ok := existingVMs[dut]; ok { 369 logging.Debugf(c, "the DUT %s is already assigned to an existing VM %s, skipping", dut, vm) 370 continue 371 } 372 t = append(t, &tq.Task{ 373 Payload: &tasks.CreateVM{ 374 Id: getUniqueID(c, cfg.Config.Prefix), 375 Attributes: cfg.Config.Attributes, 376 Config: cfg.ID, 377 Created: ×tamppb.Timestamp{ 378 Seconds: now.Unix(), 379 }, 380 // Index is not needed here. 381 // CloudBots flow does not rely on Index for VM hostname uniqueness. 382 DUT: dut, 383 Lifetime: randomizeLifetime(cfg.Config.Lifetime.GetSeconds()), 384 Prefix: cfg.Config.Prefix, 385 Revision: cfg.Config.Revision, 386 Swarming: cfg.Config.Swarming, 387 Timeout: cfg.Config.Timeout.GetSeconds(), 388 }, 389 }) 390 i++ 391 } 392 return t, nil 393 } 394 395 // createTasksPerAmount returns a slice of CreateVM tasks based on config.CurrentAmount. 396 func createTasksPerAmount(c context.Context, vms []*model.VM, cfg *model.Config, now time.Time) ([]*tq.Task, error) { 397 logging.Debugf(c, "default flow entered for config %s", cfg.Config.Prefix) 398 if len(cfg.Config.Duts) > 0 { 399 return nil, errors.Reason("config.Duts should be empty").Err() 400 } 401 existingVMs := stringset.New(len(vms)) 402 for _, vm := range vms { 403 existingVMs.Add(vm.ID) 404 } 405 var t []*tq.Task 406 for i := int32(0); i < cfg.Config.CurrentAmount; i++ { 407 id := fmt.Sprintf("%s-%d", cfg.Config.Prefix, i) 408 if !existingVMs.Has(id) { 409 t = append(t, &tq.Task{ 410 Payload: &tasks.CreateVM{ 411 Id: id, 412 Attributes: cfg.Config.Attributes, 413 Config: cfg.ID, 414 Created: ×tamppb.Timestamp{ 415 Seconds: now.Unix(), 416 }, 417 Index: i, 418 Lifetime: randomizeLifetime(cfg.Config.Lifetime.GetSeconds()), 419 Prefix: cfg.Config.Prefix, 420 Revision: cfg.Config.Revision, 421 Swarming: cfg.Config.Swarming, 422 Timeout: cfg.Config.Timeout.GetSeconds(), 423 }, 424 }) 425 } 426 } 427 return t, nil 428 } 429 430 // randomizeLifetime randomizes the specified lifetime within an interval. 431 // 432 // Randomized lifetime of VMs spreads the load of terminated/respawn VMs. 433 func randomizeLifetime(lifetime int64) int64 { 434 interval := lifetime / 10 435 if interval <= 0 { // The lifetime is too short or invalid, so do nothing. 436 return lifetime 437 } 438 return lifetime + rand.Int63n(interval) 439 } 440 441 // reportQuotaQueue is the name of the report quota task handler queue. 442 const reportQuotaQueue = "report-quota" 443 444 // reportQuota reports GCE quota utilization. 445 func reportQuota(c context.Context, payload proto.Message) error { 446 task, ok := payload.(*tasks.ReportQuota) 447 switch { 448 case !ok: 449 return errors.Reason("unexpected payload type %T", payload).Err() 450 case task.GetId() == "": 451 return errors.Reason("ID is required").Err() 452 } 453 p := &model.Project{ 454 ID: task.Id, 455 } 456 if err := datastore.Get(c, p); err != nil { 457 return errors.Annotate(err, "failed to fetch project").Err() 458 } 459 mets := stringset.NewFromSlice(p.Config.Metric...) 460 regs := stringset.NewFromSlice(p.Config.Region...) 461 rsp, err := getCompute(c).Stable.Regions.List(p.Config.Project).Context(c).Do() 462 if err != nil { 463 if gerr, ok := err.(*googleapi.Error); ok { 464 logErrors(c, task.Id, gerr) 465 } 466 return errors.Annotate(err, "failed to fetch quota").Err() 467 } 468 for _, r := range rsp.Items { 469 if regs.Has(r.Name) { 470 for _, q := range r.Quotas { 471 if mets.Has(q.Metric) { 472 metrics.UpdateQuota(c, q.Limit, q.Usage, q.Metric, p.Config.Project, r.Name) 473 } 474 } 475 } 476 } 477 return nil 478 }