go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/gce/appengine/backend/instances.go (about)

     1  // Copyright 2018 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package backend
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"net/http"
    21  	"regexp"
    22  	"strings"
    23  	"time"
    24  
    25  	"github.com/golang/protobuf/proto"
    26  	"github.com/google/uuid"
    27  
    28  	compute "google.golang.org/api/compute/v1"
    29  	"google.golang.org/api/googleapi"
    30  
    31  	"go.chromium.org/luci/appengine/tq"
    32  	"go.chromium.org/luci/common/errors"
    33  	"go.chromium.org/luci/common/logging"
    34  	"go.chromium.org/luci/gae/service/datastore"
    35  
    36  	"go.chromium.org/luci/gce/api/tasks/v1"
    37  	"go.chromium.org/luci/gce/appengine/backend/internal/metrics"
    38  	"go.chromium.org/luci/gce/appengine/model"
    39  )
    40  
    41  var (
    42  	// Check createVM in queues.go for the naming logic
    43  	hostnameSuffixMatch = regexp.MustCompile(`-[0-9]+-[a-z0-9]{4}$`)
    44  )
    45  
    46  // setCreated sets the GCE instance as created in the datastore if it isn't already.
    47  func setCreated(c context.Context, id string, inst *compute.Instance) error {
    48  	t, err := time.Parse(time.RFC3339, inst.CreationTimestamp)
    49  	if err != nil {
    50  		return errors.Annotate(err, "failed to parse instance creation time").Err()
    51  	}
    52  	nics := make([]model.NetworkInterface, len(inst.NetworkInterfaces))
    53  	for i, n := range inst.NetworkInterfaces {
    54  		if len(n.AccessConfigs) > 0 {
    55  			// GCE currently supports at most one access config per network interface.
    56  			nics[i].ExternalIP = n.AccessConfigs[0].NatIP
    57  			if len(n.AccessConfigs) > 1 {
    58  				logging.Warningf(c, "network interface %q has more than one access config", n.Name)
    59  			}
    60  		}
    61  		nics[i].InternalIP = n.NetworkIP
    62  	}
    63  	vm := &model.VM{
    64  		ID: id,
    65  	}
    66  	put := false
    67  	err = datastore.RunInTransaction(c, func(c context.Context) error {
    68  		put = false
    69  		switch err := datastore.Get(c, vm); {
    70  		case err != nil:
    71  			return errors.Annotate(err, "failed to fetch VM").Err()
    72  		case vm.Created > 0:
    73  			return nil
    74  		}
    75  		vm.Created = t.Unix()
    76  		vm.NetworkInterfaces = nics
    77  		vm.URL = inst.SelfLink
    78  		if err := datastore.Put(c, vm); err != nil {
    79  			return errors.Annotate(err, "failed to store VM").Err()
    80  		}
    81  		put = true
    82  		return nil
    83  	}, nil)
    84  	if put && err == nil {
    85  		metrics.ReportCreationTime(c, float64(vm.Created-vm.Configured), vm.Prefix, vm.Attributes.GetProject(), vm.Attributes.GetZone())
    86  	}
    87  	return err
    88  }
    89  
    90  // logErrors logs the errors in the given *googleapi.Error.
    91  func logErrors(c context.Context, name string, err *googleapi.Error) {
    92  	var errMsgs []string
    93  	for _, err := range err.Errors {
    94  		errMsgs = append(errMsgs, err.Message)
    95  	}
    96  	logging.Errorf(c, "failure for %s, HTTP: %d, errors: %s", name, err.Code, strings.Join(errMsgs, ","))
    97  }
    98  
    99  // rateLimitExceeded returns whether the given *googleapi.Error contains a rate
   100  // limit error.
   101  func rateLimitExceeded(err *googleapi.Error) bool {
   102  	for _, err := range err.Errors {
   103  		switch {
   104  		case strings.Contains(err.Message, "Queries per user per 100 seconds"):
   105  			return true
   106  		case strings.Contains(err.Message, "Rate Limit Exceeded"):
   107  			return true
   108  		case strings.Contains(err.Message, "rateLimitExceeded"):
   109  			return true
   110  		case strings.Contains(err.Reason, "rateLimitExceeded"):
   111  			return true
   112  		}
   113  	}
   114  	return false
   115  }
   116  
   117  // checkInstance fetches the GCE instance and either sets its creation details
   118  // or deletes the VM if the instance doesn't exist.
   119  func checkInstance(c context.Context, vm *model.VM) error {
   120  	srv := getCompute(c).Stable.Instances
   121  	call := srv.Get(vm.Attributes.GetProject(), vm.Attributes.GetZone(), vm.Hostname)
   122  	inst, err := call.Context(c).Do()
   123  	if err != nil {
   124  		if gerr, ok := err.(*googleapi.Error); ok {
   125  			if gerr.Code == http.StatusNotFound {
   126  				metrics.UpdateFailures(c, gerr.Code, vm)
   127  				if err := deleteVM(c, vm.ID, vm.Hostname); err != nil {
   128  					return errors.Annotate(err, "instance not found").Err()
   129  				}
   130  				return errors.Annotate(err, "instance not found").Err()
   131  			}
   132  			logErrors(c, vm.Hostname, gerr)
   133  		}
   134  		return errors.Annotate(err, "failed to fetch instance").Err()
   135  	}
   136  	logging.Debugf(c, "created instance: %s", inst.SelfLink)
   137  	metrics.CreatedInstanceChecked.Add(c, 1, vm.Config, vm.Attributes.GetProject(), vm.Attributes.GetZone(), vm.Hostname)
   138  	return setCreated(c, vm.ID, inst)
   139  }
   140  
   141  // createInstanceQueue is the name of the create instance task handler queue.
   142  const createInstanceQueue = "create-instance"
   143  
   144  // createInstance creates a GCE instance.
   145  func createInstance(ctx context.Context, payload proto.Message) error {
   146  	task, ok := payload.(*tasks.CreateInstance)
   147  	switch {
   148  	case !ok:
   149  		return errors.Reason("unexpected payload %q", payload).Err()
   150  	case task.GetId() == "":
   151  		return errors.Reason("ID is required").Err()
   152  	}
   153  	vm := &model.VM{
   154  		ID: task.Id,
   155  	}
   156  	switch err := datastore.Get(ctx, vm); {
   157  	case err != nil:
   158  		return errors.Annotate(err, "failed to fetch VM").Err()
   159  	case vm.URL != "":
   160  		logging.Debugf(ctx, "instance exists: %s", vm.URL)
   161  		return nil
   162  	}
   163  	logging.Debugf(ctx, "creating instance %q", vm.Hostname)
   164  
   165  	instance := vm.GetInstance()
   166  
   167  	// Generate a request ID based on the hostname.
   168  	// Ensures duplicate operations aren't created in GCE.
   169  	// Request IDs are valid for 24 hours.
   170  	rID := uuid.NewSHA1(uuid.Nil, []byte(fmt.Sprintf("create-%s", vm.Hostname)))
   171  	srv := getCompute(ctx)
   172  	op, err := srv.InsertInstance(ctx, vm.Attributes.GetProject(), vm.Attributes.GetZone(), instance, rID.String())
   173  	if err != nil {
   174  		if gerr, ok := err.(*googleapi.Error); ok {
   175  			logErrors(ctx, vm.Hostname, gerr)
   176  			metrics.UpdateFailures(ctx, gerr.Code, vm)
   177  			// TODO(b/130826296): Remove this once rate limit returns a transient HTTP error code.
   178  			if rateLimitExceeded(gerr) {
   179  				return errors.Annotate(err, "rate limit exceeded creating instance %s", vm.Hostname).Err()
   180  			}
   181  			if gerr.Code == http.StatusTooManyRequests || gerr.Code >= 500 {
   182  				return errors.Annotate(err, "transiently failed to create instance %s", vm.Hostname).Err()
   183  			}
   184  			if err := deleteVM(ctx, task.Id, vm.Hostname); err != nil {
   185  				return errors.Annotate(err, "failed to create instance %s", vm.Hostname).Err()
   186  			}
   187  		}
   188  		return errors.Annotate(err, "failed to create instance %s", vm.Hostname).Err()
   189  	}
   190  	// TODO(gregorynisbet): I don't like this variable name.
   191  	errs := op.GetErrors()
   192  	if len(errs) > 0 {
   193  		for _, err := range errs {
   194  			logging.Errorf(ctx, "for vm %s, %s: %s", vm.Hostname, err.Code, err.Message)
   195  		}
   196  		metrics.UpdateFailures(ctx, 200, vm)
   197  		if err := deleteVM(ctx, task.Id, vm.Hostname); err != nil {
   198  			return errors.Annotate(err, "failed to create instance %s", vm.Hostname).Err()
   199  		}
   200  		return errors.Reason("failed to create instance %s", vm.Hostname).Err()
   201  	}
   202  	if op.GetStatus() == "DONE" {
   203  		return checkInstance(ctx, vm)
   204  	}
   205  	// Instance creation is pending.
   206  	return nil
   207  }
   208  
   209  // destroyInstanceAsync schedules a task queue task to destroy a GCE instance.
   210  func destroyInstanceAsync(c context.Context, id, url string) error {
   211  	t := &tq.Task{
   212  		Payload: &tasks.DestroyInstance{
   213  			Id:  id,
   214  			Url: url,
   215  		},
   216  	}
   217  	if err := getDispatcher(c).AddTask(c, t); err != nil {
   218  		return errors.Annotate(err, "failed to schedule destroy task").Err()
   219  	}
   220  	return nil
   221  }
   222  
   223  // destroyInstanceQueue is the name of the destroy instance task handler queue.
   224  const destroyInstanceQueue = "destroy-instance"
   225  
   226  // destroyInstance destroys a GCE instance.
   227  func destroyInstance(c context.Context, payload proto.Message) error {
   228  	task, ok := payload.(*tasks.DestroyInstance)
   229  	switch {
   230  	case !ok:
   231  		return errors.Reason("unexpected payload type %T", payload).Err()
   232  	case task.GetId() == "":
   233  		return errors.Reason("ID is required").Err()
   234  	case task.GetUrl() == "":
   235  		return errors.Reason("URL is required").Err()
   236  	}
   237  	vm := &model.VM{
   238  		ID: task.Id,
   239  	}
   240  	switch err := datastore.Get(c, vm); {
   241  	case err == datastore.ErrNoSuchEntity:
   242  		return nil
   243  	case err != nil:
   244  		return errors.Annotate(err, "failed to fetch VM %s", vm.ID).Err()
   245  	case vm.URL != task.Url:
   246  		// Instance is already destroyed and replaced. Don't destroy the new one.
   247  		logging.Debugf(c, "instance %s does not exist: %s", vm.Hostname, task.Url)
   248  		return nil
   249  	}
   250  	logging.Debugf(c, "destroying instance %q", vm.Hostname)
   251  	rID := uuid.NewSHA1(uuid.Nil, []byte(fmt.Sprintf("destroy-%s", vm.Hostname)))
   252  	srv := getCompute(c).Stable.Instances
   253  	call := srv.Delete(vm.Attributes.GetProject(), vm.Attributes.GetZone(), vm.Hostname)
   254  	op, err := call.RequestId(rID.String()).Context(c).Do()
   255  	metrics.DestroyInstanceUnchecked.Add(c, 1, vm.Config, vm.Attributes.GetProject(), vm.Attributes.GetZone(), vm.Hostname)
   256  	if err != nil {
   257  		if gerr, ok := err.(*googleapi.Error); ok {
   258  			if gerr.Code == http.StatusNotFound {
   259  				// Instance is already destroyed.
   260  				logging.Debugf(c, "instance does not exist: %s", vm.URL)
   261  				return deleteBotAsync(c, task.Id, vm.Hostname)
   262  			}
   263  			logErrors(c, vm.Hostname, gerr)
   264  		}
   265  		return errors.Annotate(err, "failed to destroy instance %s", vm.Hostname).Err()
   266  	}
   267  	if op.Error != nil && len(op.Error.Errors) > 0 {
   268  		for _, err := range op.Error.Errors {
   269  			logging.Errorf(c, "%s: %s", err.Code, err.Message)
   270  		}
   271  		return errors.Reason("failed to destroy instance %s", vm.Hostname).Err()
   272  	}
   273  	if op.Status == "DONE" {
   274  		logging.Debugf(c, "destroyed instance: %s", op.TargetLink)
   275  		return deleteBotAsync(c, task.Id, vm.Hostname)
   276  	}
   277  	// Instance destruction is pending.
   278  	return nil
   279  }
   280  
   281  const auditInstancesQueue = "audit-instances"
   282  
   283  // Number of instances to audit per run
   284  const auditBatchSize = 100
   285  
   286  // auditInstanceInZone attempts to enforce the vm table as the source of truth
   287  // for any given project. It does this by deleting anything that it doesn't
   288  // recognize.
   289  func auditInstanceInZone(c context.Context, payload proto.Message) error {
   290  	task, ok := payload.(*tasks.AuditProject)
   291  	switch {
   292  	case !ok:
   293  		return errors.Reason("Unexpected payload type %T", payload).Err()
   294  	case task.GetProject() == "":
   295  		return errors.Reason("Project is required").Err()
   296  	case task.GetZone() == "":
   297  		return errors.Reason("Zone is required").Err()
   298  	}
   299  	proj := task.GetProject()
   300  	zone := task.GetZone()
   301  	logging.Debugf(c, "Auditing %s in %s", proj, zone)
   302  	// List a bunch on instances and validate with the DB
   303  	srv := getCompute(c).Stable.Instances
   304  	call := srv.List(proj, zone).MaxResults(auditBatchSize)
   305  	if task.GetPageToken() != "" {
   306  		// Add the page token if given
   307  		call = call.PageToken(task.GetPageToken())
   308  	}
   309  	op, err := call.Context(c).Do()
   310  	if err != nil {
   311  		if gerr, ok := err.(*googleapi.Error); ok {
   312  			logErrors(c, fmt.Sprintf("%s-%s", proj, zone), gerr)
   313  		}
   314  		return errors.Annotate(err, "failed to list  %s-%s", proj, zone).Err()
   315  	}
   316  
   317  	if op.Warning != nil {
   318  		logging.Warningf(c, "%s: %s", op.Warning.Code, op.Warning.Message)
   319  	}
   320  
   321  	// Assign job to handle the next set of instances
   322  	defer func(c context.Context, task *tasks.AuditProject, pageToken string) {
   323  		if pageToken != "" {
   324  			task.PageToken = pageToken
   325  			if err := getDispatcher(c).AddTask(c, &tq.Task{
   326  				Payload: task,
   327  			}); err != nil {
   328  				logging.Errorf(c, "Failed to add audit task for %s-%s", task.GetProject(), task.GetZone())
   329  			}
   330  		}
   331  	}(c, task, op.NextPageToken)
   332  
   333  	// Mapping from hostname to VM
   334  	hostToVM := make(map[string]*model.VM)
   335  	for _, inst := range op.Items {
   336  		// Init all the hostnames. Or as gcloud calls it names. There
   337  		// is no hostname assigned to this VM in gclouds sense. This is
   338  		// because it has a different meaning. We mean the hostname
   339  		// from swarmings perspective, which is gclouds sense is just
   340  		// name. The instance struct does have a hostname field, but
   341  		// this is empty for all the VMs
   342  		hostToVM[inst.Name] = nil
   343  	}
   344  
   345  	query := datastore.NewQuery(model.VMKind)
   346  	// Queries will contain all the queries to be made to datastore
   347  	var queries []*datastore.Query
   348  	for hostname := range hostToVM {
   349  		// Eq creates a new object everytime. So collect all the unique
   350  		// queries
   351  		queries = append(queries, query.Eq("hostname", hostname))
   352  	}
   353  	mapVMs := func(vm *model.VM) {
   354  		hostToVM[vm.Hostname] = vm
   355  	}
   356  	// Get all the VM records corresponding to the listed instances
   357  	err = datastore.RunMulti(c, queries, mapVMs)
   358  	if err != nil {
   359  		logging.Errorf(c, "Failed to query for the VMS. %v", err)
   360  		return err
   361  	}
   362  	var countLeaks int64
   363  	// Delete the ones we don't know of
   364  	for hostname, vm := range hostToVM {
   365  		if vm == nil && isLeakHuerestic(c, hostname, proj, zone) {
   366  			countLeaks += 1
   367  			logging.Debugf(c, "plugging the instance leak in %s-%s: %s", proj, zone, hostname)
   368  			// Send a delete request for the instance
   369  			reqID := uuid.NewSHA1(uuid.Nil, []byte(fmt.Sprintf("plug-%s", hostname)))
   370  			del := srv.Delete(proj, zone, hostname)
   371  			op, err := del.RequestId(reqID.String()).Context(c).Do()
   372  			if err != nil {
   373  				if gerr, ok := err.(*googleapi.Error); ok {
   374  					if gerr.Code == http.StatusNotFound {
   375  						// Instance is already destroyed.
   376  						logging.Debugf(c, "instance does not exist: %s", hostname)
   377  					}
   378  					logErrors(c, hostname, gerr)
   379  				}
   380  				logging.Errorf(c, "failed to plug the leak %s. %v", hostname, err)
   381  				continue
   382  			}
   383  			if op.Error != nil && len(op.Error.Errors) > 0 {
   384  				for _, e := range op.Error.Errors {
   385  					logging.Errorf(c, "%s: %s", e.Code, e.Message)
   386  				}
   387  				logging.Errorf(c, "failed to plug the leak %s. %v", hostname, err)
   388  				continue
   389  			}
   390  			if op.Status == "DONE" {
   391  				logging.Debugf(c, "plugged the leak of instance: %s", op.TargetLink)
   392  			}
   393  		}
   394  	}
   395  	metrics.UpdateLeaks(c, countLeaks, proj, zone)
   396  	return nil
   397  }
   398  
   399  // isLeakHuerestic determines if the given hostname is a leak from gce-provider
   400  // using a heuristic.
   401  func isLeakHuerestic(ctx context.Context, hostname, proj, zone string) bool {
   402  	// If this was an instance created by gce-provider. There must be a set
   403  	// of VM records for the prefix matching the hostname. Checking the
   404  	// existence of these will let us determine if this was a leak
   405  	id := hostname[:len(hostname)-5] // Remove the last 5 characters
   406  	prefix := hostnameSuffixMatch.ReplaceAllString(hostname, "")
   407  	q := datastore.NewQuery(model.VMKind)
   408  	// Get all the VMs under the prefix
   409  	q = q.Eq("prefix", prefix)
   410  	// Update the leak var based on the VM records that we find
   411  	leak := false
   412  	cb := func(vm *model.VM) error {
   413  		switch {
   414  		case vm.Hostname == hostname:
   415  			logging.Debugf(ctx, "%s record exists. Not a leak", hostname)
   416  			leak = false
   417  			// Stop searing as this is not a leak
   418  			return datastore.Stop
   419  		case vm.ID == id:
   420  			logging.Debugf(ctx, "%s is a leaked instance replaced by %s", hostname, vm.Hostname)
   421  			leak = true
   422  			// Stop searching as this is a leak
   423  			return datastore.Stop
   424  		default:
   425  			// Prefix matches but we cannot find the id that matches. Maybe the pool
   426  			// was resized. Might be a leak unless above conditions are true for
   427  			// other matches
   428  			leak = true
   429  		}
   430  		return nil
   431  	}
   432  	err := datastore.Run(ctx, q, cb)
   433  	if err != nil {
   434  		logging.Errorf(ctx, "isLeakHuerestic -- [%s]Error querying DB %v", hostname, err)
   435  		leak = false
   436  	}
   437  	if leak {
   438  		logging.Debugf(ctx, "%s is probably a leak in %s and %s", hostname, proj, zone)
   439  	} else {
   440  		logging.Debugf(ctx, "Cannot determine if %s is a leak in %s and %s", hostname, proj, zone)
   441  	}
   442  	return leak
   443  }