go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/gce/appengine/backend/bots.go (about)

     1  // Copyright 2019 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package backend
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"time"
    21  
    22  	"github.com/golang/protobuf/proto"
    23  	"google.golang.org/grpc/codes"
    24  	"google.golang.org/grpc/status"
    25  	"google.golang.org/protobuf/types/known/timestamppb"
    26  
    27  	"go.chromium.org/luci/appengine/tq"
    28  	"go.chromium.org/luci/common/data/stringset"
    29  	"go.chromium.org/luci/common/errors"
    30  	"go.chromium.org/luci/common/logging"
    31  	"go.chromium.org/luci/gae/service/datastore"
    32  	"go.chromium.org/luci/gae/service/memcache"
    33  	swarmingpb "go.chromium.org/luci/swarming/proto/api_v2"
    34  
    35  	"go.chromium.org/luci/gce/api/tasks/v1"
    36  	"go.chromium.org/luci/gce/appengine/backend/internal/metrics"
    37  	"go.chromium.org/luci/gce/appengine/model"
    38  )
    39  
    40  // utcRFC3339 is the timestamp format used by Swarming.
    41  // Similar to RFC3339 but with an implicit UTC time zone.
    42  const utcRFC3339 = "2006-01-02T15:04:05"
    43  
    44  // botListLimit is the maximum number of bots to list per query. It should be <= 1000.
    45  const botListLimit = 500
    46  
    47  // deleteStaleSwarmingBotBatchSize is set to 2 because it was found that on average the
    48  // deleteStaleSwarmingBot function took about 300ms to run and the queues are limited at
    49  // 500/s. This means we still have time left over in every run to process another bot.
    50  // It is possible to try with 3 bots, But might be a little too close and these numbers
    51  // are averages.
    52  const deleteStaleSwarmingBotBatchSize = 2
    53  
    54  // logGrpcError logs an error response from Swarming.
    55  func logGrpcError(c context.Context, name string, err error) {
    56  	logging.Errorf(c, "failure for %s: %s", name, err)
    57  }
    58  
    59  // setConnected sets the Swarming bot as connected in the datastore if it isn't already.
    60  func setConnected(c context.Context, id, hostname string, at time.Time) error {
    61  	// Check dscache in case vm.Connected is set now, to avoid a costly transaction.
    62  	vm := &model.VM{
    63  		ID: id,
    64  	}
    65  	switch err := datastore.Get(c, vm); {
    66  	case err != nil:
    67  		return errors.Annotate(err, "failed to fetch VM").Err()
    68  	case vm.Hostname != hostname:
    69  		return errors.Reason("bot %q does not exist", hostname).Err()
    70  	case vm.Connected > 0:
    71  		return nil
    72  	}
    73  	put := false
    74  	err := datastore.RunInTransaction(c, func(c context.Context) error {
    75  		put = false
    76  		switch err := datastore.Get(c, vm); {
    77  		case err != nil:
    78  			return errors.Annotate(err, "failed to fetch VM").Err()
    79  		case vm.Hostname != hostname:
    80  			return errors.Reason("bot %q does not exist", hostname).Err()
    81  		case vm.Connected > 0:
    82  			return nil
    83  		}
    84  		vm.Connected = at.Unix()
    85  		if err := datastore.Put(c, vm); err != nil {
    86  			return errors.Annotate(err, "failed to store VM").Err()
    87  		}
    88  		put = true
    89  		return nil
    90  	}, nil)
    91  	if put && err == nil {
    92  		metrics.ReportConnectionTime(c, float64(vm.Connected-vm.Created), vm.Prefix, vm.Attributes.GetProject(), vm.Swarming, vm.Attributes.GetZone())
    93  	}
    94  	return err
    95  }
    96  
    97  // manageMissingBot manages a missing Swarming bot.
    98  func manageMissingBot(c context.Context, vm *model.VM) error {
    99  	// Set that the bot has not yet connected to Swarming.
   100  	switch {
   101  	// the case that the VM is drained is not handled here due to b/264921632, that some newly created VM
   102  	// can be set to drained immidiately, but hasn't been connected to Swarming. If we destroy it here, there's
   103  	// a small chance that the bot is up during the destroyment and pick up a test but fail it.
   104  	case vm.Lifetime > 0 && vm.Created+vm.Lifetime < time.Now().Unix():
   105  		logging.Debugf(c, "deadline %d exceeded", vm.Created+vm.Lifetime)
   106  		return destroyInstanceAsync(c, vm.ID, vm.URL)
   107  	case vm.Timeout > 0 && vm.Created+vm.Timeout < time.Now().Unix():
   108  		logging.Debugf(c, "timeout %d exceeded", vm.Created+vm.Timeout)
   109  		return destroyInstanceAsync(c, vm.ID, vm.URL)
   110  	case time.Since(time.Unix(vm.Created, 0)) > minPendingForBotConnected:
   111  		logging.Debugf(c, "already waited for %d minutes for bots to connect to swarming, stop waiting", minPendingForBotConnected)
   112  		return destroyInstanceAsync(c, vm.ID, vm.URL)
   113  	default:
   114  		return nil
   115  	}
   116  }
   117  
   118  // minPendingForBotConnected is the minimal minutes (10 minutes) to wait for the swarming bot in the VM to connect to Swarming.
   119  const minPendingForBotConnected = 10 * time.Minute
   120  
   121  // manageExistingBot manages an existing Swarming bot.
   122  func manageExistingBot(c context.Context, bot *swarmingpb.BotInfo, vm *model.VM) error {
   123  	// A bot connected to Swarming may be executing workload.
   124  	// To destroy the instance, terminate the bot first to avoid interruptions.
   125  	// Termination can be skipped if the bot is deleted, dead, or already terminated.
   126  	if bot.Deleted || bot.IsDead {
   127  		if bot.Deleted {
   128  			logging.Debugf(c, "bot deleted (%s)", vm.Hostname)
   129  		} else {
   130  			logging.Debugf(c, "bot dead (%s)", vm.Hostname)
   131  		}
   132  		// A bot may be returned as deleted or dead if a bot with the same ID was previously connected to Swarming, but this new VM's bot hasn't connected yet
   133  		if time.Since(time.Unix(vm.Created, 0)) <= minPendingForBotConnected {
   134  			logging.Debugf(c, "bot %s is newly created, wait for %s minutes at least to destroy", vm.Hostname, minPendingForBotConnected.Minutes())
   135  			return nil
   136  		}
   137  		return destroyInstanceAsync(c, vm.ID, vm.URL)
   138  	}
   139  	// This value of vm.Connected may be several seconds old, because the VM was fetched
   140  	// prior to sending an RPC to Swarming. Still, check it here to save a costly operation
   141  	// in setConnected, since manageExistingBot may be called thousands of times per minute.
   142  	if vm.Connected == 0 {
   143  		if err := setConnected(c, vm.ID, vm.Hostname, bot.FirstSeenTs.AsTime()); err != nil {
   144  			return err
   145  		}
   146  	}
   147  	// bot_terminate occurs when the bot starts the termination task and is normally followed
   148  	// by task_completed and bot_shutdown. Responses also include the full set of dimensions
   149  	// when the event was recorded. Limit response size by fetching only recent events, and only
   150  	// the type of each.
   151  	// A VM may be recreated multiple times with the same name, so it is important to only
   152  	// query swarming for events since the current VM's creation.
   153  	events, err := getSwarming(c, vm.Swarming).ListBotEvents(c, &swarmingpb.BotEventsRequest{
   154  		BotId: vm.Hostname,
   155  		Limit: 5,
   156  		Start: timestamppb.New(time.Unix(vm.Created, 0)),
   157  	})
   158  	if err != nil {
   159  		logGrpcError(c, vm.Hostname, err)
   160  		return errors.Annotate(err, "failed to fetch bot events").Err()
   161  	}
   162  	for _, e := range events.Items {
   163  		if e.EventType == "bot_terminate" {
   164  			logging.Debugf(c, "bot terminated (%s)", vm.Hostname)
   165  			return destroyInstanceAsync(c, vm.ID, vm.URL)
   166  		}
   167  	}
   168  	switch {
   169  	case vm.Lifetime > 0 && vm.Created+vm.Lifetime < time.Now().Unix():
   170  		logging.Debugf(c, "deadline %d exceeded", vm.Created+vm.Lifetime)
   171  		return terminateBotAsync(c, vm.ID, vm.Hostname)
   172  	case vm.Drained:
   173  		logging.Debugf(c, "VM drained")
   174  		return terminateBotAsync(c, vm.ID, vm.Hostname)
   175  	}
   176  	return nil
   177  }
   178  
   179  // manageBotQueue is the name of the manage bot task handler queue.
   180  const manageBotQueue = "manage-bot"
   181  
   182  // manageBot manages a Swarming bot.
   183  func manageBot(c context.Context, payload proto.Message) error {
   184  	task, ok := payload.(*tasks.ManageBot)
   185  	switch {
   186  	case !ok:
   187  		return errors.Reason("unexpected payload %q", payload).Err()
   188  	case task.GetId() == "":
   189  		return errors.Reason("ID is required").Err()
   190  	}
   191  	vm := &model.VM{
   192  		ID: task.Id,
   193  	}
   194  	switch err := datastore.Get(c, vm); {
   195  	case err == datastore.ErrNoSuchEntity:
   196  		return nil
   197  	case err != nil:
   198  		return errors.Annotate(err, "failed to fetch VM").Err()
   199  	case vm.URL == "":
   200  		logging.Debugf(c, "instance %q does not exist", vm.Hostname)
   201  		return nil
   202  	}
   203  	// Drain the VM if necessary. No-op if unnecessary or already drained.
   204  	if err := drainVM(c, vm); err != nil {
   205  		return errors.Annotate(err, "failed to drain VM").Err()
   206  	}
   207  
   208  	logging.Debugf(c, "fetching bot %q: %s", vm.Hostname, vm.Swarming)
   209  	bot, err := getSwarming(c, vm.Swarming).GetBot(c, &swarmingpb.BotRequest{
   210  		BotId: vm.Hostname,
   211  	})
   212  	if err != nil {
   213  		if status.Code(err) == codes.NotFound {
   214  			logging.Debugf(c, "bot not found (%s)", vm.Hostname)
   215  			return manageMissingBot(c, vm)
   216  		}
   217  		logGrpcError(c, vm.Hostname, err)
   218  		return errors.Annotate(err, "failed to fetch bot").Err()
   219  	}
   220  	logging.Debugf(c, "found bot")
   221  	return manageExistingBot(c, bot, vm)
   222  }
   223  
   224  // inspectSwarmingAsync collects all the swarming servers and schedules a task for each of them
   225  func inspectSwarmingAsync(c context.Context) error {
   226  	// Collect all the swarming instances
   227  	swarmings := stringset.New(10)
   228  	qC := datastore.NewQuery("Config")
   229  	if err := datastore.Run(c, qC, func(cfg *model.Config) {
   230  		swarmings.Add(cfg.Config.Swarming)
   231  	}); err != nil {
   232  		return errors.Annotate(err, "inspectSwarmingAsync: Failed to query configs").Err()
   233  	}
   234  	// Generate all the inspectSwarmingTasks
   235  	var inspectSwarmingTasks []*tq.Task
   236  	swarmings.Iter(func(sw string) bool {
   237  		inspectSwarmingTasks = append(inspectSwarmingTasks, &tq.Task{
   238  			Payload: &tasks.InspectSwarming{
   239  				Swarming: sw,
   240  			},
   241  		})
   242  		return true
   243  	})
   244  	// schedule all the inspect swarming tasks
   245  	if len(inspectSwarmingTasks) > 0 {
   246  		if err := getDispatcher(c).AddTask(c, inspectSwarmingTasks...); err != nil {
   247  			return errors.Annotate(err, "inspectSwarmingAsync: failed to schedule task").Err()
   248  		}
   249  	}
   250  	return nil
   251  }
   252  
   253  // inspectSwarmingQueue is the name of the inspect swarming task queue
   254  const inspectSwarmingQueue = "inspect-swarming"
   255  
   256  // inspectSwarming is the task queue handler for inspect swarming queue
   257  func inspectSwarming(c context.Context, payload proto.Message) error {
   258  	task, ok := payload.(*tasks.InspectSwarming)
   259  	switch {
   260  	case !ok:
   261  		return errors.Reason("InspectSwarming: unexpected payload %q", payload).Err()
   262  	case task.GetSwarming() == "":
   263  		return errors.Reason("InspectSwarming: swarming is required").Err()
   264  	}
   265  	var inpectSwarmingSubtasks []*tq.Task
   266  	listRPCResp, err := getSwarming(c, task.GetSwarming()).ListBots(c, &swarmingpb.BotsRequest{
   267  		Limit:  botListLimit,
   268  		Cursor: task.Cursor,
   269  	})
   270  	if err != nil {
   271  		return errors.Annotate(err, "InspectSwarming: failed to List instances from swarming").Err()
   272  	}
   273  	// Schedule the new task with the new cursor
   274  	if listRPCResp.Cursor != "" {
   275  		task.Cursor = listRPCResp.Cursor
   276  		if err := getDispatcher(c).AddTask(c, &tq.Task{Payload: task}); err != nil {
   277  			// Log error and process the bots
   278  			logging.Errorf(c, "InspectSwarming: failed to schedule inspect swarming task. %v", err)
   279  		}
   280  	}
   281  	batchPayload := &tasks.DeleteStaleSwarmingBots{}
   282  	for _, bot := range listRPCResp.Items {
   283  		qV := datastore.NewQuery("VM").Eq("hostname", bot.BotId)
   284  		if err := datastore.Run(c, qV, func(vm *model.VM) {
   285  			if bot.IsDead || bot.Deleted {
   286  				// If the bot is dead or deleted, schedule a task to destroy the instance
   287  				logging.Debugf(c, "bot %s is dead[%v]/deleted[%v]. Destroying instance", bot.BotId, bot.IsDead, bot.Deleted)
   288  				inpectSwarmingSubtasks = append(inpectSwarmingSubtasks, &tq.Task{
   289  					Payload: &tasks.DestroyInstance{
   290  						Id:  vm.ID,
   291  						Url: vm.URL,
   292  					},
   293  				})
   294  			} else {
   295  				batchPayload.Bots = append(batchPayload.Bots, &tasks.DeleteStaleSwarmingBot{
   296  					Id:          vm.ID,
   297  					FirstSeenTs: bot.FirstSeenTs.AsTime().UTC().Format(utcRFC3339),
   298  				})
   299  				if len(batchPayload.GetBots()) == deleteStaleSwarmingBotBatchSize {
   300  					// Schedule a task to check and delete the bot if needed
   301  					inpectSwarmingSubtasks = append(inpectSwarmingSubtasks, &tq.Task{
   302  						Payload: batchPayload,
   303  					})
   304  					batchPayload = &tasks.DeleteStaleSwarmingBots{}
   305  				}
   306  			}
   307  		}); err != nil {
   308  			logging.Debugf(c, "bot %s does not exist in datastore?", bot.BotId)
   309  		}
   310  	}
   311  	if len(batchPayload.GetBots()) > 0 {
   312  		// Schedule a task to check and delete the bot if needed
   313  		inpectSwarmingSubtasks = append(inpectSwarmingSubtasks, &tq.Task{
   314  			Payload: batchPayload,
   315  		})
   316  	}
   317  	// Dispatch all the tasks
   318  	if len(inpectSwarmingSubtasks) > 0 {
   319  		if err := getDispatcher(c).AddTask(c, inpectSwarmingSubtasks...); err != nil {
   320  			return errors.Annotate(err, "InspectSwarming: failed to schedule sub task(s)").Err()
   321  		}
   322  	}
   323  	return nil
   324  }
   325  
   326  // terminateBotAsync schedules a task queue task to terminate a Swarming bot.
   327  func terminateBotAsync(c context.Context, id, hostname string) error {
   328  	t := &tq.Task{
   329  		Payload: &tasks.TerminateBot{
   330  			Id:       id,
   331  			Hostname: hostname,
   332  		},
   333  	}
   334  	if err := getDispatcher(c).AddTask(c, t); err != nil {
   335  		return errors.Annotate(err, "failed to schedule terminate task").Err()
   336  	}
   337  	return nil
   338  }
   339  
   340  // terminateBotQueue is the name of the terminate bot task handler queue.
   341  const terminateBotQueue = "terminate-bot"
   342  
   343  // terminateBot terminates an existing Swarming bot.
   344  func terminateBot(c context.Context, payload proto.Message) error {
   345  	task, ok := payload.(*tasks.TerminateBot)
   346  	switch {
   347  	case !ok:
   348  		return errors.Reason("unexpected payload %q", payload).Err()
   349  	case task.GetId() == "":
   350  		return errors.Reason("ID is required").Err()
   351  	case task.GetHostname() == "":
   352  		return errors.Reason("hostname is required").Err()
   353  	}
   354  	vm := &model.VM{
   355  		ID: task.Id,
   356  	}
   357  	switch err := datastore.Get(c, vm); {
   358  	case err == datastore.ErrNoSuchEntity:
   359  		return nil
   360  	case err != nil:
   361  		return errors.Annotate(err, "failed to fetch VM").Err()
   362  	case vm.Hostname != task.Hostname:
   363  		// Instance is already destroyed and replaced. Don't terminate the new bot.
   364  		logging.Debugf(c, "bot %q does not exist", task.Hostname)
   365  		return nil
   366  	}
   367  	// 1 terminate task should normally suffice to terminate a bot, but in case
   368  	// users or bugs interfere, resend terminate task after 1 hour. Even if
   369  	// memcache content is cleared, more frequent terminate requests can now be
   370  	// handled by the swarming server and after crbug/982840 will be auto-deduped.
   371  	mi := memcache.NewItem(c, fmt.Sprintf("terminate-%s/%s", vm.Swarming, vm.Hostname))
   372  	if err := memcache.Get(c, mi); err == nil {
   373  		logging.Debugf(c, "bot %q already has terminate task from us", task.Hostname)
   374  		return nil
   375  	}
   376  	logging.Debugf(c, "terminating bot %q: %s", vm.Hostname, vm.Swarming)
   377  	_, err := getSwarming(c, vm.Swarming).TerminateBot(c, &swarmingpb.TerminateRequest{
   378  		BotId:  vm.Hostname,
   379  		Reason: "GCE Provider",
   380  	})
   381  	if err != nil {
   382  		if status.Code(err) == codes.NotFound {
   383  			// Bot is already deleted.
   384  			logging.Debugf(c, "bot not found (%s)", vm.Hostname)
   385  			return nil
   386  		}
   387  		logGrpcError(c, vm.Hostname, err)
   388  		return errors.Annotate(err, "failed to terminate bot").Err()
   389  	}
   390  	if err := memcache.Set(c, mi.SetExpiration(time.Hour)); err != nil {
   391  		logging.Warningf(c, "failed to record terminate task in memcache: %s", err)
   392  	}
   393  	return nil
   394  }
   395  
   396  // deleteBotAsync schedules a task queue task to delete a Swarming bot.
   397  func deleteBotAsync(c context.Context, id, hostname string) error {
   398  	t := &tq.Task{
   399  		Payload: &tasks.DeleteBot{
   400  			Id:       id,
   401  			Hostname: hostname,
   402  		},
   403  	}
   404  	if err := getDispatcher(c).AddTask(c, t); err != nil {
   405  		return errors.Annotate(err, "failed to schedule delete task").Err()
   406  	}
   407  	return nil
   408  }
   409  
   410  // deleteBotQueue is the name of the delete bot task handler queue.
   411  const deleteBotQueue = "delete-bot"
   412  
   413  // deleteBot deletes an existing Swarming bot.
   414  func deleteBot(c context.Context, payload proto.Message) error {
   415  	task, ok := payload.(*tasks.DeleteBot)
   416  	switch {
   417  	case !ok:
   418  		return errors.Reason("unexpected payload %q", payload).Err()
   419  	case task.GetId() == "":
   420  		return errors.Reason("ID is required").Err()
   421  	case task.GetHostname() == "":
   422  		return errors.Reason("hostname is required").Err()
   423  	}
   424  	vm := &model.VM{
   425  		ID: task.Id,
   426  	}
   427  	switch err := datastore.Get(c, vm); {
   428  	case err == datastore.ErrNoSuchEntity:
   429  		return nil
   430  	case err != nil:
   431  		return errors.Annotate(err, "failed to fetch VM").Err()
   432  	case vm.Hostname != task.Hostname:
   433  		// Instance is already destroyed and replaced. Don't delete the new bot.
   434  		return errors.Reason("bot %q does not exist", task.Hostname).Err()
   435  	}
   436  	logging.Debugf(c, "deleting bot %q: %s", vm.Hostname, vm.Swarming)
   437  	_, err := getSwarming(c, vm.Swarming).DeleteBot(c, &swarmingpb.BotRequest{
   438  		BotId: vm.Hostname,
   439  	})
   440  	if err != nil {
   441  		if status.Code(err) == codes.NotFound {
   442  			// Bot is already deleted.
   443  			logging.Debugf(c, "bot not found (%s)", vm.Hostname)
   444  			return deleteVM(c, task.Id, vm.Hostname)
   445  		}
   446  		logGrpcError(c, vm.Hostname, err)
   447  		return errors.Annotate(err, "failed to delete bot").Err()
   448  	}
   449  	return deleteVM(c, task.Id, vm.Hostname)
   450  }
   451  
   452  // deleteVM deletes the given VM from the datastore if it exists.
   453  func deleteVM(c context.Context, id, hostname string) error {
   454  	vm := &model.VM{
   455  		ID: id,
   456  	}
   457  	return datastore.RunInTransaction(c, func(c context.Context) error {
   458  		switch err := datastore.Get(c, vm); {
   459  		case err == datastore.ErrNoSuchEntity:
   460  			return nil
   461  		case err != nil:
   462  			return errors.Annotate(err, "failed to fetch VM").Err()
   463  		case vm.Hostname != hostname:
   464  			logging.Debugf(c, "VM %q does not exist", hostname)
   465  			return nil
   466  		}
   467  		if err := datastore.Delete(c, vm); err != nil {
   468  			return errors.Annotate(err, "failed to delete VM").Err()
   469  		}
   470  		logging.Debugf(c, "deleted VM %s from db", hostname)
   471  		return nil
   472  	}, nil)
   473  }
   474  
   475  // deleteStaleSwarmingBotsQueue is the name of the queue that deletes stale swarming bots in batches
   476  const deleteStaleSwarmingBotsQueue = "delete-stale-swarming-bots"
   477  
   478  func deleteStaleSwarmingBots(c context.Context, payload proto.Message) error {
   479  	task, ok := payload.(*tasks.DeleteStaleSwarmingBots)
   480  	switch {
   481  	case !ok:
   482  		return errors.Reason("deleteStaleSwarmingBots: unexpected payload %q", payload).Err()
   483  	case task.GetBots() == nil:
   484  		return errors.Reason("deleteStaleSwarmingBots: No Bots to process").Err()
   485  	}
   486  	var errs []error
   487  	for _, dssb := range task.GetBots() {
   488  		if err := deleteStaleSwarmingBot(c, dssb); err != nil {
   489  			errs = append(errs, err)
   490  		}
   491  	}
   492  	if len(errs) > 0 {
   493  		return errors.NewMultiError(errs...).AsError()
   494  	}
   495  	return nil
   496  }
   497  
   498  // deleteStaleSwarmingBot manages the existing swarming bot
   499  func deleteStaleSwarmingBot(c context.Context, task *tasks.DeleteStaleSwarmingBot) error {
   500  	switch {
   501  	case task == nil:
   502  		return errors.Reason("deleteStaleSwarmingBot: Bad input?").Err()
   503  	case task.GetId() == "":
   504  		return errors.Reason("deleteStaleSwarmingBot: Missing bot id").Err()
   505  	case task.GetFirstSeenTs() == "":
   506  		return errors.Reason("deleteStaleSwarmingBot: Missing timestamp").Err()
   507  	}
   508  	vm := &model.VM{
   509  		ID: task.GetId(),
   510  	}
   511  	switch err := datastore.Get(c, vm); {
   512  	case errors.Is(err, datastore.ErrNoSuchEntity):
   513  		return nil
   514  	case err != nil:
   515  		return errors.Annotate(err, "deleteStaleSwarmingBot: failed to fetch VM %s", task.GetId()).Err()
   516  	case vm.URL == "":
   517  		logging.Debugf(c, "deleteStaleSwarmingBot: instance %q does not exist", vm.Hostname)
   518  		return nil
   519  	}
   520  	// bot_terminate occurs when the bot starts the termination task and is normally followed
   521  	// by task_completed and bot_shutdown. Responses also include the full set of dimensions
   522  	// when the event was recorded. Limit response size by fetching only recent events, and only
   523  	// the type of each.
   524  	// A VM may be recreated multiple times with the same name, so it is important to only
   525  	// query swarming for events since the current VM's creation.
   526  	events, err := getSwarming(c, vm.Swarming).ListBotEvents(c, &swarmingpb.BotEventsRequest{
   527  		BotId: vm.Hostname,
   528  		Limit: 5,
   529  		Start: timestamppb.New(time.Unix(vm.Created, 0)),
   530  	})
   531  	if err != nil {
   532  		logGrpcError(c, vm.Hostname, err)
   533  		return errors.Annotate(err, "deleteStaleSwarmingBot: failed to fetch bot events for %s", vm.Hostname).Err()
   534  	}
   535  	for _, e := range events.Items {
   536  		if e.EventType == "bot_terminate" {
   537  			logging.Debugf(c, "deleteStaleSwarmingBot: bot terminated (%s)", vm.Hostname)
   538  			return destroyInstanceAsync(c, vm.ID, vm.URL)
   539  		}
   540  	}
   541  	switch {
   542  	case vm.Lifetime > 0 && vm.Created+vm.Lifetime < time.Now().Unix():
   543  		logging.Debugf(c, "deleteStaleSwarmingBot: %s deadline %d exceeded", vm.ID, vm.Created+vm.Lifetime)
   544  		return terminateBotAsync(c, vm.ID, vm.Hostname)
   545  	case vm.Drained:
   546  		logging.Debugf(c, "deleteStaleSwarmingBot: VM %s drained", vm.ID)
   547  		return terminateBotAsync(c, vm.ID, vm.Hostname)
   548  	}
   549  	// This value of vm.Connected may be several seconds old, because the VM was fetched
   550  	// prior to sending an RPC to Swarming. Still, check it here to save a costly operation
   551  	// in setConnected, since DeleteStaleSwarmingBot may be called thousands of times per minute.
   552  	if vm.Connected == 0 {
   553  		t, err := time.Parse(utcRFC3339, task.GetFirstSeenTs())
   554  		if err != nil {
   555  			return errors.Annotate(err, "deleteStaleSwarmingBot: %s failed to parse bot connection time", vm.ID).Err()
   556  		}
   557  		if err := setConnected(c, vm.ID, vm.Hostname, t); err != nil {
   558  			return errors.Annotate(err, "deleteStaleSwarmingBot: %s failed to set connected time", vm.ID).Err()
   559  		}
   560  	}
   561  	return nil
   562  }