go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/gce/appengine/backend/instances.go (about) 1 // Copyright 2018 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package backend 16 17 import ( 18 "context" 19 "fmt" 20 "net/http" 21 "regexp" 22 "strings" 23 "time" 24 25 "github.com/golang/protobuf/proto" 26 "github.com/google/uuid" 27 28 compute "google.golang.org/api/compute/v1" 29 "google.golang.org/api/googleapi" 30 31 "go.chromium.org/luci/appengine/tq" 32 "go.chromium.org/luci/common/errors" 33 "go.chromium.org/luci/common/logging" 34 "go.chromium.org/luci/gae/service/datastore" 35 36 "go.chromium.org/luci/gce/api/tasks/v1" 37 "go.chromium.org/luci/gce/appengine/backend/internal/metrics" 38 "go.chromium.org/luci/gce/appengine/model" 39 ) 40 41 var ( 42 // Check createVM in queues.go for the naming logic 43 hostnameSuffixMatch = regexp.MustCompile(`-[0-9]+-[a-z0-9]{4}$`) 44 ) 45 46 // setCreated sets the GCE instance as created in the datastore if it isn't already. 47 func setCreated(c context.Context, id string, inst *compute.Instance) error { 48 t, err := time.Parse(time.RFC3339, inst.CreationTimestamp) 49 if err != nil { 50 return errors.Annotate(err, "failed to parse instance creation time").Err() 51 } 52 nics := make([]model.NetworkInterface, len(inst.NetworkInterfaces)) 53 for i, n := range inst.NetworkInterfaces { 54 if len(n.AccessConfigs) > 0 { 55 // GCE currently supports at most one access config per network interface. 56 nics[i].ExternalIP = n.AccessConfigs[0].NatIP 57 if len(n.AccessConfigs) > 1 { 58 logging.Warningf(c, "network interface %q has more than one access config", n.Name) 59 } 60 } 61 nics[i].InternalIP = n.NetworkIP 62 } 63 vm := &model.VM{ 64 ID: id, 65 } 66 put := false 67 err = datastore.RunInTransaction(c, func(c context.Context) error { 68 put = false 69 switch err := datastore.Get(c, vm); { 70 case err != nil: 71 return errors.Annotate(err, "failed to fetch VM").Err() 72 case vm.Created > 0: 73 return nil 74 } 75 vm.Created = t.Unix() 76 vm.NetworkInterfaces = nics 77 vm.URL = inst.SelfLink 78 if err := datastore.Put(c, vm); err != nil { 79 return errors.Annotate(err, "failed to store VM").Err() 80 } 81 put = true 82 return nil 83 }, nil) 84 if put && err == nil { 85 metrics.ReportCreationTime(c, float64(vm.Created-vm.Configured), vm.Prefix, vm.Attributes.GetProject(), vm.Attributes.GetZone()) 86 } 87 return err 88 } 89 90 // logErrors logs the errors in the given *googleapi.Error. 91 func logErrors(c context.Context, name string, err *googleapi.Error) { 92 var errMsgs []string 93 for _, err := range err.Errors { 94 errMsgs = append(errMsgs, err.Message) 95 } 96 logging.Errorf(c, "failure for %s, HTTP: %d, errors: %s", name, err.Code, strings.Join(errMsgs, ",")) 97 } 98 99 // rateLimitExceeded returns whether the given *googleapi.Error contains a rate 100 // limit error. 101 func rateLimitExceeded(err *googleapi.Error) bool { 102 for _, err := range err.Errors { 103 switch { 104 case strings.Contains(err.Message, "Queries per user per 100 seconds"): 105 return true 106 case strings.Contains(err.Message, "Rate Limit Exceeded"): 107 return true 108 case strings.Contains(err.Message, "rateLimitExceeded"): 109 return true 110 case strings.Contains(err.Reason, "rateLimitExceeded"): 111 return true 112 } 113 } 114 return false 115 } 116 117 // checkInstance fetches the GCE instance and either sets its creation details 118 // or deletes the VM if the instance doesn't exist. 119 func checkInstance(c context.Context, vm *model.VM) error { 120 srv := getCompute(c).Stable.Instances 121 call := srv.Get(vm.Attributes.GetProject(), vm.Attributes.GetZone(), vm.Hostname) 122 inst, err := call.Context(c).Do() 123 if err != nil { 124 if gerr, ok := err.(*googleapi.Error); ok { 125 if gerr.Code == http.StatusNotFound { 126 metrics.UpdateFailures(c, gerr.Code, vm) 127 if err := deleteVM(c, vm.ID, vm.Hostname); err != nil { 128 return errors.Annotate(err, "instance not found").Err() 129 } 130 return errors.Annotate(err, "instance not found").Err() 131 } 132 logErrors(c, vm.Hostname, gerr) 133 } 134 return errors.Annotate(err, "failed to fetch instance").Err() 135 } 136 logging.Debugf(c, "created instance: %s", inst.SelfLink) 137 metrics.CreatedInstanceChecked.Add(c, 1, vm.Config, vm.Attributes.GetProject(), vm.Attributes.GetZone(), vm.Hostname) 138 return setCreated(c, vm.ID, inst) 139 } 140 141 // createInstanceQueue is the name of the create instance task handler queue. 142 const createInstanceQueue = "create-instance" 143 144 // createInstance creates a GCE instance. 145 func createInstance(ctx context.Context, payload proto.Message) error { 146 task, ok := payload.(*tasks.CreateInstance) 147 switch { 148 case !ok: 149 return errors.Reason("unexpected payload %q", payload).Err() 150 case task.GetId() == "": 151 return errors.Reason("ID is required").Err() 152 } 153 vm := &model.VM{ 154 ID: task.Id, 155 } 156 switch err := datastore.Get(ctx, vm); { 157 case err != nil: 158 return errors.Annotate(err, "failed to fetch VM").Err() 159 case vm.URL != "": 160 logging.Debugf(ctx, "instance exists: %s", vm.URL) 161 return nil 162 } 163 logging.Debugf(ctx, "creating instance %q", vm.Hostname) 164 165 instance := vm.GetInstance() 166 167 // Generate a request ID based on the hostname. 168 // Ensures duplicate operations aren't created in GCE. 169 // Request IDs are valid for 24 hours. 170 rID := uuid.NewSHA1(uuid.Nil, []byte(fmt.Sprintf("create-%s", vm.Hostname))) 171 srv := getCompute(ctx) 172 op, err := srv.InsertInstance(ctx, vm.Attributes.GetProject(), vm.Attributes.GetZone(), instance, rID.String()) 173 if err != nil { 174 if gerr, ok := err.(*googleapi.Error); ok { 175 logErrors(ctx, vm.Hostname, gerr) 176 metrics.UpdateFailures(ctx, gerr.Code, vm) 177 // TODO(b/130826296): Remove this once rate limit returns a transient HTTP error code. 178 if rateLimitExceeded(gerr) { 179 return errors.Annotate(err, "rate limit exceeded creating instance %s", vm.Hostname).Err() 180 } 181 if gerr.Code == http.StatusTooManyRequests || gerr.Code >= 500 { 182 return errors.Annotate(err, "transiently failed to create instance %s", vm.Hostname).Err() 183 } 184 if err := deleteVM(ctx, task.Id, vm.Hostname); err != nil { 185 return errors.Annotate(err, "failed to create instance %s", vm.Hostname).Err() 186 } 187 } 188 return errors.Annotate(err, "failed to create instance %s", vm.Hostname).Err() 189 } 190 // TODO(gregorynisbet): I don't like this variable name. 191 errs := op.GetErrors() 192 if len(errs) > 0 { 193 for _, err := range errs { 194 logging.Errorf(ctx, "for vm %s, %s: %s", vm.Hostname, err.Code, err.Message) 195 } 196 metrics.UpdateFailures(ctx, 200, vm) 197 if err := deleteVM(ctx, task.Id, vm.Hostname); err != nil { 198 return errors.Annotate(err, "failed to create instance %s", vm.Hostname).Err() 199 } 200 return errors.Reason("failed to create instance %s", vm.Hostname).Err() 201 } 202 if op.GetStatus() == "DONE" { 203 return checkInstance(ctx, vm) 204 } 205 // Instance creation is pending. 206 return nil 207 } 208 209 // destroyInstanceAsync schedules a task queue task to destroy a GCE instance. 210 func destroyInstanceAsync(c context.Context, id, url string) error { 211 t := &tq.Task{ 212 Payload: &tasks.DestroyInstance{ 213 Id: id, 214 Url: url, 215 }, 216 } 217 if err := getDispatcher(c).AddTask(c, t); err != nil { 218 return errors.Annotate(err, "failed to schedule destroy task").Err() 219 } 220 return nil 221 } 222 223 // destroyInstanceQueue is the name of the destroy instance task handler queue. 224 const destroyInstanceQueue = "destroy-instance" 225 226 // destroyInstance destroys a GCE instance. 227 func destroyInstance(c context.Context, payload proto.Message) error { 228 task, ok := payload.(*tasks.DestroyInstance) 229 switch { 230 case !ok: 231 return errors.Reason("unexpected payload type %T", payload).Err() 232 case task.GetId() == "": 233 return errors.Reason("ID is required").Err() 234 case task.GetUrl() == "": 235 return errors.Reason("URL is required").Err() 236 } 237 vm := &model.VM{ 238 ID: task.Id, 239 } 240 switch err := datastore.Get(c, vm); { 241 case err == datastore.ErrNoSuchEntity: 242 return nil 243 case err != nil: 244 return errors.Annotate(err, "failed to fetch VM %s", vm.ID).Err() 245 case vm.URL != task.Url: 246 // Instance is already destroyed and replaced. Don't destroy the new one. 247 logging.Debugf(c, "instance %s does not exist: %s", vm.Hostname, task.Url) 248 return nil 249 } 250 logging.Debugf(c, "destroying instance %q", vm.Hostname) 251 rID := uuid.NewSHA1(uuid.Nil, []byte(fmt.Sprintf("destroy-%s", vm.Hostname))) 252 srv := getCompute(c).Stable.Instances 253 call := srv.Delete(vm.Attributes.GetProject(), vm.Attributes.GetZone(), vm.Hostname) 254 op, err := call.RequestId(rID.String()).Context(c).Do() 255 metrics.DestroyInstanceUnchecked.Add(c, 1, vm.Config, vm.Attributes.GetProject(), vm.Attributes.GetZone(), vm.Hostname) 256 if err != nil { 257 if gerr, ok := err.(*googleapi.Error); ok { 258 if gerr.Code == http.StatusNotFound { 259 // Instance is already destroyed. 260 logging.Debugf(c, "instance does not exist: %s", vm.URL) 261 return deleteBotAsync(c, task.Id, vm.Hostname) 262 } 263 logErrors(c, vm.Hostname, gerr) 264 } 265 return errors.Annotate(err, "failed to destroy instance %s", vm.Hostname).Err() 266 } 267 if op.Error != nil && len(op.Error.Errors) > 0 { 268 for _, err := range op.Error.Errors { 269 logging.Errorf(c, "%s: %s", err.Code, err.Message) 270 } 271 return errors.Reason("failed to destroy instance %s", vm.Hostname).Err() 272 } 273 if op.Status == "DONE" { 274 logging.Debugf(c, "destroyed instance: %s", op.TargetLink) 275 return deleteBotAsync(c, task.Id, vm.Hostname) 276 } 277 // Instance destruction is pending. 278 return nil 279 } 280 281 const auditInstancesQueue = "audit-instances" 282 283 // Number of instances to audit per run 284 const auditBatchSize = 100 285 286 // auditInstanceInZone attempts to enforce the vm table as the source of truth 287 // for any given project. It does this by deleting anything that it doesn't 288 // recognize. 289 func auditInstanceInZone(c context.Context, payload proto.Message) error { 290 task, ok := payload.(*tasks.AuditProject) 291 switch { 292 case !ok: 293 return errors.Reason("Unexpected payload type %T", payload).Err() 294 case task.GetProject() == "": 295 return errors.Reason("Project is required").Err() 296 case task.GetZone() == "": 297 return errors.Reason("Zone is required").Err() 298 } 299 proj := task.GetProject() 300 zone := task.GetZone() 301 logging.Debugf(c, "Auditing %s in %s", proj, zone) 302 // List a bunch on instances and validate with the DB 303 srv := getCompute(c).Stable.Instances 304 call := srv.List(proj, zone).MaxResults(auditBatchSize) 305 if task.GetPageToken() != "" { 306 // Add the page token if given 307 call = call.PageToken(task.GetPageToken()) 308 } 309 op, err := call.Context(c).Do() 310 if err != nil { 311 if gerr, ok := err.(*googleapi.Error); ok { 312 logErrors(c, fmt.Sprintf("%s-%s", proj, zone), gerr) 313 } 314 return errors.Annotate(err, "failed to list %s-%s", proj, zone).Err() 315 } 316 317 if op.Warning != nil { 318 logging.Warningf(c, "%s: %s", op.Warning.Code, op.Warning.Message) 319 } 320 321 // Assign job to handle the next set of instances 322 defer func(c context.Context, task *tasks.AuditProject, pageToken string) { 323 if pageToken != "" { 324 task.PageToken = pageToken 325 if err := getDispatcher(c).AddTask(c, &tq.Task{ 326 Payload: task, 327 }); err != nil { 328 logging.Errorf(c, "Failed to add audit task for %s-%s", task.GetProject(), task.GetZone()) 329 } 330 } 331 }(c, task, op.NextPageToken) 332 333 // Mapping from hostname to VM 334 hostToVM := make(map[string]*model.VM) 335 for _, inst := range op.Items { 336 // Init all the hostnames. Or as gcloud calls it names. There 337 // is no hostname assigned to this VM in gclouds sense. This is 338 // because it has a different meaning. We mean the hostname 339 // from swarmings perspective, which is gclouds sense is just 340 // name. The instance struct does have a hostname field, but 341 // this is empty for all the VMs 342 hostToVM[inst.Name] = nil 343 } 344 345 query := datastore.NewQuery(model.VMKind) 346 // Queries will contain all the queries to be made to datastore 347 var queries []*datastore.Query 348 for hostname := range hostToVM { 349 // Eq creates a new object everytime. So collect all the unique 350 // queries 351 queries = append(queries, query.Eq("hostname", hostname)) 352 } 353 mapVMs := func(vm *model.VM) { 354 hostToVM[vm.Hostname] = vm 355 } 356 // Get all the VM records corresponding to the listed instances 357 err = datastore.RunMulti(c, queries, mapVMs) 358 if err != nil { 359 logging.Errorf(c, "Failed to query for the VMS. %v", err) 360 return err 361 } 362 var countLeaks int64 363 // Delete the ones we don't know of 364 for hostname, vm := range hostToVM { 365 if vm == nil && isLeakHuerestic(c, hostname, proj, zone) { 366 countLeaks += 1 367 logging.Debugf(c, "plugging the instance leak in %s-%s: %s", proj, zone, hostname) 368 // Send a delete request for the instance 369 reqID := uuid.NewSHA1(uuid.Nil, []byte(fmt.Sprintf("plug-%s", hostname))) 370 del := srv.Delete(proj, zone, hostname) 371 op, err := del.RequestId(reqID.String()).Context(c).Do() 372 if err != nil { 373 if gerr, ok := err.(*googleapi.Error); ok { 374 if gerr.Code == http.StatusNotFound { 375 // Instance is already destroyed. 376 logging.Debugf(c, "instance does not exist: %s", hostname) 377 } 378 logErrors(c, hostname, gerr) 379 } 380 logging.Errorf(c, "failed to plug the leak %s. %v", hostname, err) 381 continue 382 } 383 if op.Error != nil && len(op.Error.Errors) > 0 { 384 for _, e := range op.Error.Errors { 385 logging.Errorf(c, "%s: %s", e.Code, e.Message) 386 } 387 logging.Errorf(c, "failed to plug the leak %s. %v", hostname, err) 388 continue 389 } 390 if op.Status == "DONE" { 391 logging.Debugf(c, "plugged the leak of instance: %s", op.TargetLink) 392 } 393 } 394 } 395 metrics.UpdateLeaks(c, countLeaks, proj, zone) 396 return nil 397 } 398 399 // isLeakHuerestic determines if the given hostname is a leak from gce-provider 400 // using a heuristic. 401 func isLeakHuerestic(ctx context.Context, hostname, proj, zone string) bool { 402 // If this was an instance created by gce-provider. There must be a set 403 // of VM records for the prefix matching the hostname. Checking the 404 // existence of these will let us determine if this was a leak 405 id := hostname[:len(hostname)-5] // Remove the last 5 characters 406 prefix := hostnameSuffixMatch.ReplaceAllString(hostname, "") 407 q := datastore.NewQuery(model.VMKind) 408 // Get all the VMs under the prefix 409 q = q.Eq("prefix", prefix) 410 // Update the leak var based on the VM records that we find 411 leak := false 412 cb := func(vm *model.VM) error { 413 switch { 414 case vm.Hostname == hostname: 415 logging.Debugf(ctx, "%s record exists. Not a leak", hostname) 416 leak = false 417 // Stop searing as this is not a leak 418 return datastore.Stop 419 case vm.ID == id: 420 logging.Debugf(ctx, "%s is a leaked instance replaced by %s", hostname, vm.Hostname) 421 leak = true 422 // Stop searching as this is a leak 423 return datastore.Stop 424 default: 425 // Prefix matches but we cannot find the id that matches. Maybe the pool 426 // was resized. Might be a leak unless above conditions are true for 427 // other matches 428 leak = true 429 } 430 return nil 431 } 432 err := datastore.Run(ctx, q, cb) 433 if err != nil { 434 logging.Errorf(ctx, "isLeakHuerestic -- [%s]Error querying DB %v", hostname, err) 435 leak = false 436 } 437 if leak { 438 logging.Debugf(ctx, "%s is probably a leak in %s and %s", hostname, proj, zone) 439 } else { 440 logging.Debugf(ctx, "Cannot determine if %s is a leak in %s and %s", hostname, proj, zone) 441 } 442 return leak 443 }