go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/gce/appengine/backend/queues.go (about)

     1  // Copyright 2018 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package backend
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"math/rand"
    21  	"time"
    22  
    23  	"github.com/golang/protobuf/proto"
    24  
    25  	"google.golang.org/api/googleapi"
    26  	"google.golang.org/protobuf/types/known/timestamppb"
    27  
    28  	"go.chromium.org/luci/appengine/tq"
    29  	"go.chromium.org/luci/common/clock"
    30  	"go.chromium.org/luci/common/data/rand/mathrand"
    31  	"go.chromium.org/luci/common/data/stringset"
    32  	"go.chromium.org/luci/common/errors"
    33  	"go.chromium.org/luci/common/logging"
    34  	"go.chromium.org/luci/gae/service/datastore"
    35  
    36  	"go.chromium.org/luci/gce/api/tasks/v1"
    37  	"go.chromium.org/luci/gce/appengine/backend/internal/metrics"
    38  	"go.chromium.org/luci/gce/appengine/model"
    39  )
    40  
// countVMsQueue is the name of the task queue for the count VMs task handler.
const countVMsQueue = "count-vms"
    43  
    44  // countVMs counts the VMs for a given config.
    45  func countVMs(c context.Context, payload proto.Message) error {
    46  	task, ok := payload.(*tasks.CountVMs)
    47  	switch {
    48  	case !ok:
    49  		return errors.Reason("unexpected payload type %T", payload).Err()
    50  	case task.GetId() == "":
    51  		return errors.Reason("ID is required").Err()
    52  	}
    53  	// Count VMs per project, server and zone.
    54  	// VMs created from the same config eventually have the same project, server,
    55  	// and zone but may currently exist for a previous version of the config.
    56  	vms := &metrics.InstanceCount{}
    57  
    58  	// Get the configured count.
    59  	cfg := &model.Config{
    60  		ID: task.Id,
    61  	}
    62  	switch err := datastore.Get(c, cfg); {
    63  	case err == datastore.ErrNoSuchEntity:
    64  	case err != nil:
    65  		return errors.Annotate(err, "failed to fetch config").Err()
    66  	default:
    67  		vms.AddConfigured(int(cfg.Config.CurrentAmount), cfg.Config.Attributes.Project)
    68  	}
    69  
    70  	// Get the actual (connected, created) counts.
    71  	vm := &model.VM{}
    72  	q := datastore.NewQuery(model.VMKind).Eq("config", task.Id)
    73  	if err := datastore.Run(c, q, func(k *datastore.Key) error {
    74  		id := k.StringID()
    75  		vm.ID = id
    76  		switch err := datastore.Get(c, vm); {
    77  		case err == datastore.ErrNoSuchEntity:
    78  			return nil
    79  		case err != nil:
    80  			return errors.Annotate(err, "failed to fetch VM").Err()
    81  		default:
    82  			if vm.Created > 0 {
    83  				vms.AddCreated(1, vm.Attributes.Project, vm.Attributes.Zone)
    84  			}
    85  			if vm.Connected > 0 {
    86  				vms.AddConnected(1, vm.Attributes.Project, vm.Swarming, vm.Attributes.Zone)
    87  			}
    88  			return nil
    89  		}
    90  	}); err != nil {
    91  		return errors.Annotate(err, "failed to fetch VMs").Err()
    92  	}
    93  	if err := vms.Update(c, task.Id); err != nil {
    94  		return errors.Annotate(err, "failed to update count").Err()
    95  	}
    96  	return nil
    97  }
    98  
// drainVMQueue is the name of the task queue for the drain VM task handler.
const drainVMQueue = "drain-vm"
   101  
   102  func drainVMQueueHandler(c context.Context, payload proto.Message) error {
   103  	task, ok := payload.(*tasks.DrainVM)
   104  	switch {
   105  	case !ok:
   106  		return errors.Reason("unexpected payload %q", payload).Err()
   107  	case task.GetId() == "":
   108  		return errors.Reason("ID is required").Err()
   109  	}
   110  	vm := &model.VM{
   111  		ID: task.Id,
   112  	}
   113  	switch err := datastore.Get(c, vm); {
   114  	case errors.Is(err, datastore.ErrNoSuchEntity):
   115  		return nil
   116  	case err != nil:
   117  		return errors.Annotate(err, "failed to fetch VM").Err()
   118  	case vm.URL == "":
   119  		logging.Debugf(c, "instance %q does not exist", vm.Hostname)
   120  		return nil
   121  	}
   122  	return drainVM(c, vm)
   123  }
   124  
   125  // drainVM drains a given VM if necessary.
   126  func drainVM(c context.Context, vm *model.VM) error {
   127  	if vm.Drained {
   128  		return nil
   129  	}
   130  	cfg := &model.Config{
   131  		ID: vm.Config,
   132  	}
   133  	switch err := datastore.Get(c, cfg); {
   134  	case err == datastore.ErrNoSuchEntity:
   135  		logging.Debugf(c, "config %q does not exist", cfg.ID)
   136  	case err != nil:
   137  		return errors.Annotate(err, "failed to fetch config").Err()
   138  	}
   139  	if vm.DUT != "" {
   140  		// DUT is still present in config.
   141  		// Index is not available for VM mapped to DUT due to different sequences of creation.
   142  		duts := cfg.Config.GetDuts()
   143  		if _, ok := duts[vm.DUT]; ok {
   144  			return nil
   145  		}
   146  		logging.Debugf(c, "config %q only specifies %d VMs", cfg.ID, cfg.Config.GetCurrentAmount())
   147  	} else {
   148  		// This VM is below the currentAmount threshold and should not be drained.
   149  		if cfg.Config.GetCurrentAmount() > vm.Index {
   150  			return nil
   151  		}
   152  		logging.Debugf(c, "config %q only specifies %d VMs", cfg.ID, cfg.Config.GetCurrentAmount())
   153  	}
   154  	return datastore.RunInTransaction(c, func(c context.Context) error {
   155  		switch err := datastore.Get(c, vm); {
   156  		case err == datastore.ErrNoSuchEntity:
   157  			vm.Drained = true
   158  			return nil
   159  		case err != nil:
   160  			return errors.Annotate(err, "failed to fetch VM").Err()
   161  		case vm.Drained:
   162  			return nil
   163  		}
   164  		vm.Drained = true
   165  		logging.Debugf(c, "set VM %s as drained in db", vm.Hostname)
   166  		if err := datastore.Put(c, vm); err != nil {
   167  			return errors.Annotate(err, "failed to store VM").Err()
   168  		}
   169  		return nil
   170  	}, nil)
   171  }
   172  
   173  // getSuffix returns a random suffix to use when naming a GCE instance.
   174  func getSuffix(c context.Context) string {
   175  	const allowed = "abcdefghijklmnopqrstuvwxyz0123456789"
   176  	suf := make([]byte, 4)
   177  	for i := range suf {
   178  		suf[i] = allowed[mathrand.Intn(c, len(allowed))]
   179  	}
   180  	return string(suf)
   181  }
   182  
// createVMQueue is the name of the task queue for the create VM task handler.
const createVMQueue = "create-vm"
   185  
   186  // createVM creates a VM if it doesn't already exist.
   187  func createVM(c context.Context, payload proto.Message) error {
   188  	task, ok := payload.(*tasks.CreateVM)
   189  	switch {
   190  	case !ok:
   191  		return errors.Reason("unexpected payload type %T", payload).Err()
   192  	case task.GetId() == "":
   193  		return errors.Reason("ID is required").Err()
   194  	case task.GetConfig() == "":
   195  		return errors.Reason("config is required").Err()
   196  	}
   197  
   198  	// VMs paired with DUTs cannot rely on index for hostname uniqueness.
   199  	// Instead, we rely on timestamp see getUniqueID.
   200  	var hostname string
   201  	if task.DUT != "" {
   202  		hostname = task.Id
   203  	} else {
   204  		hostname = fmt.Sprintf("%s-%d-%s", task.Prefix, task.Index, getSuffix(c))
   205  	}
   206  	vm := &model.VM{
   207  		ID:         task.Id,
   208  		Config:     task.Config,
   209  		Configured: clock.Now(c).Unix(),
   210  		DUT:        task.DUT,
   211  		Hostname:   hostname,
   212  		Index:      task.Index,
   213  		Lifetime:   task.Lifetime,
   214  		Prefix:     task.Prefix,
   215  		Revision:   task.Revision,
   216  		Swarming:   task.Swarming,
   217  		Timeout:    task.Timeout,
   218  	}
   219  	if task.Attributes != nil {
   220  		vm.Attributes = *task.Attributes
   221  		// TODO(crbug/942301): Auto-select zone if zone is unspecified.
   222  		vm.Attributes.SetZone(vm.Attributes.GetZone())
   223  		vm.IndexAttributes()
   224  	}
   225  	// createVM is called repeatedly, so do a fast check outside the transaction.
   226  	// In most cases, this will skip the more expensive transactional check.
   227  	switch err := datastore.Get(c, vm); {
   228  	case err == datastore.ErrNoSuchEntity:
   229  	case err != nil:
   230  		return errors.Annotate(err, "failed to fetch VM").Err()
   231  	default:
   232  		return nil
   233  	}
   234  	return datastore.RunInTransaction(c, func(c context.Context) error {
   235  		switch err := datastore.Get(c, vm); {
   236  		case err == datastore.ErrNoSuchEntity:
   237  		case err != nil:
   238  			return errors.Annotate(err, "failed to fetch VM").Err()
   239  		default:
   240  			return nil
   241  		}
   242  		if err := datastore.Put(c, vm); err != nil {
   243  			return errors.Annotate(err, "failed to store VM").Err()
   244  		}
   245  		return nil
   246  	}, nil)
   247  }
   248  
   249  // updateCurrentAmount updates CurrentAmount if necessary.
   250  // Returns up-to-date config entity and the reference timestamp.
   251  func updateCurrentAmount(c context.Context, id string) (cfg *model.Config, now time.Time, err error) {
   252  	cfg = &model.Config{
   253  		ID: id,
   254  	}
   255  	// Avoid transaction if possible.
   256  	if err = datastore.Get(c, cfg); err != nil {
   257  		err = errors.Annotate(err, "failed to fetch config").Err()
   258  		return
   259  	}
   260  
   261  	now = clock.Now(c)
   262  	var amt int32
   263  	switch amt, err = cfg.Config.ComputeAmount(cfg.Config.CurrentAmount, now); {
   264  	case err != nil:
   265  		err = errors.Annotate(err, "failed to parse amount").Err()
   266  		return
   267  	case cfg.Config.CurrentAmount == amt:
   268  		return
   269  	}
   270  
   271  	err = datastore.RunInTransaction(c, func(c context.Context) error {
   272  		var err error
   273  		if err = datastore.Get(c, cfg); err != nil {
   274  			return errors.Annotate(err, "failed to fetch config").Err()
   275  		}
   276  
   277  		now = clock.Now(c)
   278  		switch amt, err = cfg.Config.ComputeAmount(cfg.Config.CurrentAmount, now); {
   279  		case err != nil:
   280  			return errors.Annotate(err, "failed to parse amount").Err()
   281  		case cfg.Config.CurrentAmount == amt:
   282  			return nil
   283  		}
   284  		cfg.Config.CurrentAmount = amt
   285  		logging.Debugf(c, "set config %q to allow %d VMs", cfg.ID, cfg.Config.CurrentAmount)
   286  		if err = datastore.Put(c, cfg); err != nil {
   287  			return errors.Annotate(err, "failed to store config").Err()
   288  		}
   289  		return nil
   290  	}, nil)
   291  	return
   292  }
   293  
   294  // getCurrentVMsByPrefix returns all the VMs in the datastore by prefix
   295  func getCurrentVMsByPrefix(ctx context.Context, prefix string) ([]*model.VM, error) {
   296  	q := datastore.NewQuery(model.VMKind).Eq("prefix", prefix)
   297  	vms := make([]*model.VM, 0)
   298  	if err := datastore.Run(ctx, q, func(vm *model.VM) {
   299  		vms = append(vms, vm)
   300  	}); err != nil {
   301  		return nil, errors.Annotate(err, "failed to fetch vms for %s", prefix).Err()
   302  	}
   303  	return vms, nil
   304  }
   305  
// expandConfigQueue is the name of the task queue for the expand config task
// handler.
const expandConfigQueue = "expand-config"
   308  
   309  // expandConfig creates task queue tasks to create each VM in the given config.
   310  func expandConfig(c context.Context, payload proto.Message) error {
   311  	task, ok := payload.(*tasks.ExpandConfig)
   312  	switch {
   313  	case !ok:
   314  		return errors.Reason("unexpected payload type %T", payload).Err()
   315  	case task.GetId() == "":
   316  		return errors.Reason("ID is required").Err()
   317  	}
   318  	cfg, now, err := updateCurrentAmount(c, task.Id)
   319  	if err != nil {
   320  		return err
   321  	}
   322  	// Measure the time taken for this query, For debugging purposes
   323  	start := time.Now()
   324  	vms, err := getCurrentVMsByPrefix(c, cfg.Config.Prefix)
   325  	rt := time.Since(start)
   326  	logging.Debugf(c, "getCurrentVMsByPrefix[%s]: error - %v #VMs - %d", rt, err, len(vms))
   327  	if err != nil {
   328  		return err
   329  	}
   330  
   331  	var t []*tq.Task
   332  	// DUTs take priority.
   333  	if len(cfg.Config.GetDuts()) > 0 {
   334  		t, err = createTasksPerDUT(c, vms, cfg, now)
   335  	} else {
   336  		t, err = createTasksPerAmount(c, vms, cfg, now)
   337  	}
   338  	if err != nil {
   339  		return err
   340  	}
   341  
   342  	logging.Debugf(c, "for config %s, creating %d VMs", cfg.Config.Prefix, len(t))
   343  	if err := getDispatcher(c).AddTask(c, t...); err != nil {
   344  		return errors.Annotate(err, "failed to schedule tasks").Err()
   345  	}
   346  	return nil
   347  }
   348  
   349  // getUniqueID returns a unique ID based on Unix time in milliseconds.
   350  func getUniqueID(c context.Context, prefix string) string {
   351  	ms := clock.Now(c).UnixMilli()
   352  	return fmt.Sprintf("%s-%d-%s", prefix, ms, getSuffix(c))
   353  }
   354  
   355  // createTasksPerDUT returns a slice of CreateVM tasks based on config.Duts.
   356  func createTasksPerDUT(c context.Context, vms []*model.VM, cfg *model.Config, now time.Time) ([]*tq.Task, error) {
   357  	logging.Debugf(c, "CloudBots flow entered for config %s", cfg.Config.Prefix)
   358  	if len(cfg.Config.Duts) == 0 {
   359  		return nil, errors.Reason("config.DUTs cannot be empty").Err()
   360  	}
   361  	existingVMs := make(map[string]string, len(vms))
   362  	for _, vm := range vms {
   363  		existingVMs[vm.DUT] = vm.Hostname
   364  	}
   365  	var t []*tq.Task
   366  	var i int32 = 0
   367  	for dut := range cfg.Config.Duts {
   368  		if vm, ok := existingVMs[dut]; ok {
   369  			logging.Debugf(c, "the DUT %s is already assigned to an existing VM %s, skipping", dut, vm)
   370  			continue
   371  		}
   372  		t = append(t, &tq.Task{
   373  			Payload: &tasks.CreateVM{
   374  				Id:         getUniqueID(c, cfg.Config.Prefix),
   375  				Attributes: cfg.Config.Attributes,
   376  				Config:     cfg.ID,
   377  				Created: &timestamppb.Timestamp{
   378  					Seconds: now.Unix(),
   379  				},
   380  				// Index is not needed here.
   381  				// CloudBots flow does not rely on Index for VM hostname uniqueness.
   382  				DUT:      dut,
   383  				Lifetime: randomizeLifetime(cfg.Config.Lifetime.GetSeconds()),
   384  				Prefix:   cfg.Config.Prefix,
   385  				Revision: cfg.Config.Revision,
   386  				Swarming: cfg.Config.Swarming,
   387  				Timeout:  cfg.Config.Timeout.GetSeconds(),
   388  			},
   389  		})
   390  		i++
   391  	}
   392  	return t, nil
   393  }
   394  
   395  // createTasksPerAmount returns a slice of CreateVM tasks based on config.CurrentAmount.
   396  func createTasksPerAmount(c context.Context, vms []*model.VM, cfg *model.Config, now time.Time) ([]*tq.Task, error) {
   397  	logging.Debugf(c, "default flow entered for config %s", cfg.Config.Prefix)
   398  	if len(cfg.Config.Duts) > 0 {
   399  		return nil, errors.Reason("config.Duts should be empty").Err()
   400  	}
   401  	existingVMs := stringset.New(len(vms))
   402  	for _, vm := range vms {
   403  		existingVMs.Add(vm.ID)
   404  	}
   405  	var t []*tq.Task
   406  	for i := int32(0); i < cfg.Config.CurrentAmount; i++ {
   407  		id := fmt.Sprintf("%s-%d", cfg.Config.Prefix, i)
   408  		if !existingVMs.Has(id) {
   409  			t = append(t, &tq.Task{
   410  				Payload: &tasks.CreateVM{
   411  					Id:         id,
   412  					Attributes: cfg.Config.Attributes,
   413  					Config:     cfg.ID,
   414  					Created: &timestamppb.Timestamp{
   415  						Seconds: now.Unix(),
   416  					},
   417  					Index:    i,
   418  					Lifetime: randomizeLifetime(cfg.Config.Lifetime.GetSeconds()),
   419  					Prefix:   cfg.Config.Prefix,
   420  					Revision: cfg.Config.Revision,
   421  					Swarming: cfg.Config.Swarming,
   422  					Timeout:  cfg.Config.Timeout.GetSeconds(),
   423  				},
   424  			})
   425  		}
   426  	}
   427  	return t, nil
   428  }
   429  
   430  // randomizeLifetime randomizes the specified lifetime within an interval.
   431  //
   432  // Randomized lifetime of VMs spreads the load of terminated/respawn VMs.
   433  func randomizeLifetime(lifetime int64) int64 {
   434  	interval := lifetime / 10
   435  	if interval <= 0 { // The lifetime is too short or invalid, so do nothing.
   436  		return lifetime
   437  	}
   438  	return lifetime + rand.Int63n(interval)
   439  }
   440  
// reportQuotaQueue is the name of the task queue for the report quota task
// handler.
const reportQuotaQueue = "report-quota"
   443  
   444  // reportQuota reports GCE quota utilization.
   445  func reportQuota(c context.Context, payload proto.Message) error {
   446  	task, ok := payload.(*tasks.ReportQuota)
   447  	switch {
   448  	case !ok:
   449  		return errors.Reason("unexpected payload type %T", payload).Err()
   450  	case task.GetId() == "":
   451  		return errors.Reason("ID is required").Err()
   452  	}
   453  	p := &model.Project{
   454  		ID: task.Id,
   455  	}
   456  	if err := datastore.Get(c, p); err != nil {
   457  		return errors.Annotate(err, "failed to fetch project").Err()
   458  	}
   459  	mets := stringset.NewFromSlice(p.Config.Metric...)
   460  	regs := stringset.NewFromSlice(p.Config.Region...)
   461  	rsp, err := getCompute(c).Stable.Regions.List(p.Config.Project).Context(c).Do()
   462  	if err != nil {
   463  		if gerr, ok := err.(*googleapi.Error); ok {
   464  			logErrors(c, task.Id, gerr)
   465  		}
   466  		return errors.Annotate(err, "failed to fetch quota").Err()
   467  	}
   468  	for _, r := range rsp.Items {
   469  		if regs.Has(r.Name) {
   470  			for _, q := range r.Quotas {
   471  				if mets.Has(q.Metric) {
   472  					metrics.UpdateQuota(c, q.Limit, q.Usage, q.Metric, p.Config.Project, r.Name)
   473  				}
   474  			}
   475  		}
   476  	}
   477  	return nil
   478  }