github.com/cloud-foundations/dominator@v0.0.0-20221004181915-6e4fee580046/cmd/hyper-control/rolloutImage.go

package main

import (
	"fmt"
	"io"
	"math"
	"net"
	"os"
	"os/exec"
	"path"
	"path/filepath"
	"sync"
	"syscall"
	"time"

	imageclient "github.com/Cloud-Foundations/Dominator/imageserver/client"
	"github.com/Cloud-Foundations/Dominator/lib/concurrent"
	"github.com/Cloud-Foundations/Dominator/lib/constants"
	"github.com/Cloud-Foundations/Dominator/lib/cpusharer"
	"github.com/Cloud-Foundations/Dominator/lib/errors"
	"github.com/Cloud-Foundations/Dominator/lib/format"
	"github.com/Cloud-Foundations/Dominator/lib/json"
	"github.com/Cloud-Foundations/Dominator/lib/log"
	"github.com/Cloud-Foundations/Dominator/lib/log/prefixlogger"
	libnet "github.com/Cloud-Foundations/Dominator/lib/net"
	"github.com/Cloud-Foundations/Dominator/lib/rpcclientpool"
	"github.com/Cloud-Foundations/Dominator/lib/srpc"
	"github.com/Cloud-Foundations/Dominator/lib/tags"
	fm_proto "github.com/Cloud-Foundations/Dominator/proto/fleetmanager"
	hyper_proto "github.com/Cloud-Foundations/Dominator/proto/hypervisor"
	sub_proto "github.com/Cloud-Foundations/Dominator/proto/sub"
	subclient "github.com/Cloud-Foundations/Dominator/sub/client"
	"github.com/Cloud-Foundations/tricorder/go/tricorder/messages"
)

const (
	filePerms = syscall.S_IRUSR | syscall.S_IWUSR | syscall.S_IRGRP |
		syscall.S_IROTH
)

type hypervisorType struct {
	healthAgentClientResource *rpcclientpool.ClientResource
	hostname                  string
	hypervisorClientResource  *srpc.ClientResource
	initialTags               tags.Tags
	initialUnhealthyList      map[string]struct{}
	logger                    log.DebugLogger
	noVMs                     bool
	subClientResource         *srpc.ClientResource
}

func rolloutImageSubcommand(args []string, logger log.DebugLogger) error {
	err := rolloutImage(args[0], logger)
	if err != nil {
		return fmt.Errorf("Error rolling out image: %s", err)
	}
	return nil
}

func checkCertificates(predictedDuration time.Duration) error {
	predictedFinish := time.Now().Add(predictedDuration)
	if srpc.GetEarliestClientCertExpiration().Before(predictedFinish) {
		return fmt.Errorf("a certificate expires before: %s", predictedFinish)
	}
	return nil
}

func extendImageLifetime(imageServerClientResource *srpc.ClientResource,
	imageName string, expiresAt time.Time, predictedDuration time.Duration,
	logger log.DebugLogger) error {
	if expiresAt.IsZero() {
		return nil
	}
	if time.Until(expiresAt) >= predictedDuration {
		return nil
	}
	newExpiration := time.Now().Add(predictedDuration)
	logger.Debugf(0, "extending image lifetime by %s\n",
		format.Duration(time.Until(newExpiration)))
	client, err := imageServerClientResource.GetHTTP(nil, 0)
	if err != nil {
		return err
	}
	defer client.Put()
	return imageclient.ChangeImageExpiration(client, imageName, newExpiration)
}
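
// gitCommand runs "git <command...>" in repositoryDirectory, passing stderr
// through to the terminal and returning the captured standard output.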
func gitCommand(repositoryDirectory string, command ...string) ([]byte, error) {
	cmd := exec.Command("git", command...)
	cmd.Dir = repositoryDirectory
	cmd.Stderr = os.Stderr
	if output, err := cmd.Output(); err != nil {
		return nil, fmt.Errorf("error running git %v: %s", cmd.Args, err)
	} else {
		return output, nil
	}
}

func rolloutImage(imageName string, logger log.DebugLogger) error {
	startTime := time.Now()
	cpuSharer := cpusharer.NewFifoCpuSharer()
	if *topologyDir != "" {
		logger.Debugln(0, "updating Git repository")
		stdout, err := gitCommand(*topologyDir, "status", "--porcelain")
		if err != nil {
			return err
		}
		if len(stdout) > 0 {
			return errors.New("Git repository is not clean")
		}
		if _, err := gitCommand(*topologyDir, "pull"); err != nil {
			return err
		}
	}
	logger.Debugln(0, "checking image")
	imageServerClientResource := srpc.NewClientResource("tcp",
		fmt.Sprintf("%s:%d", *imageServerHostname, *imageServerPortNum))
	defer imageServerClientResource.ScheduleClose()
	expiresAt, err := checkImage(imageServerClientResource, imageName)
	if err != nil {
		return err
	}
	fleetManagerClientResource := srpc.NewClientResource("tcp",
		fmt.Sprintf("%s:%d", *fleetManagerHostname, *fleetManagerPortNum))
	defer fleetManagerClientResource.ScheduleClose()
	logger.Debugln(0, "finding good Hypervisors")
	hypervisorAddresses, err := listConnectedHypervisors(
		fleetManagerClientResource)
	if err != nil {
		return err
	}
	hypervisors := make([]*hypervisorType, 0, len(hypervisorAddresses))
	defer closeHypervisors(hypervisors)
	tagsForHypervisors, err := getTagsForHypervisors(fleetManagerClientResource)
	logger.Debugln(0, "checking and tagging Hypervisors")
	if err != nil {
		return fmt.Errorf("failure getting tags: %s", err)
	}
	hypervisorsChannel := make(chan *hypervisorType, len(hypervisorAddresses))
	for _, address := range hypervisorAddresses {
		if hostname, _, err := net.SplitHostPort(address); err != nil {
			return err
		} else {
			go func(hostname string) {
				cpuSharer.GrabCpu()
				defer cpuSharer.ReleaseCpu()
				hypervisor := setupHypervisor(hostname, imageName,
					tagsForHypervisors[hostname], cpuSharer, logger)
				hypervisorsChannel <- hypervisor
			}(hostname)
		}
	}
	for range hypervisorAddresses {
		if hypervisor := <-hypervisorsChannel; hypervisor != nil {
			err := hypervisor.updateTagForHypervisor(
				fleetManagerClientResource, "PlannedImage", imageName)
			if err != nil {
				return fmt.Errorf("%s: failure updating tags: %s",
					hypervisor.hostname, err)
			}
			hypervisors = append(hypervisors, hypervisor)
		}
	}
	if len(hypervisors) < 1 {
		return errors.New("no hypervisors to update")
	}
	logger.Debugln(0, "splitting unused/used Hypervisors")
	unusedHypervisors, usedHypervisors := markUnusedHypervisors(hypervisors,
		cpuSharer)
	logger.Debugf(0, "%d unused, %d used Hypervisors\n",
		len(unusedHypervisors), len(usedHypervisors))
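	// Duration estimate: each upgradeOneThenAll pass ramps concurrency
	// linearly (see NewStateWithLinearConcurrencyIncrease), so covering n
	// Hypervisors takes on the order of sqrt(2n) rounds; the prediction
	// below budgets five minutes per round.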
	numSteps := math.Sqrt(float64(len(unusedHypervisors)*2)) +
		math.Sqrt(float64(len(usedHypervisors)*2))
	predictedDuration := time.Minute * 5 * time.Duration(numSteps)
	if err := checkCertificates(predictedDuration); err != nil {
		return err
	}
	err = extendImageLifetime(imageServerClientResource, imageName, expiresAt,
		predictedDuration, logger)
	if err != nil {
		return err
	}
	logger.Debugln(0, "upgrading unused Hypervisors")
	err = upgradeOneThenAll(fleetManagerClientResource, imageName,
		unusedHypervisors, cpuSharer, uint(len(unusedHypervisors)))
	if err != nil {
		return err
	}
	numConcurrent := uint(len(usedHypervisors) / 2)
	if numConcurrent < 1 {
		numConcurrent = 1
	} else if numConcurrent > uint(len(unusedHypervisors)) {
		numConcurrent = 1
	} else if numConcurrent*10 < uint(len(usedHypervisors)) {
		numConcurrent++
	}
	logger.Debugln(0, "upgrading used Hypervisors")
	err = upgradeOneThenAll(fleetManagerClientResource, imageName,
		usedHypervisors, cpuSharer, numConcurrent)
	if err != nil {
		return err
	}
	err = releaseImage(imageServerClientResource, imageName, expiresAt, logger)
	if err != nil {
		return err
	}
	if *topologyDir != "" {
		var tgs tags.Tags
		tagsFilename := filepath.Join(*topologyDir, *location, "tags.json")
		if err := json.ReadFromFile(tagsFilename, &tgs); err != nil {
			if !os.IsNotExist(err) {
				return err
			}
			tgs = make(tags.Tags)
		}
		oldImageName := tgs["RequiredImage"]
		tgs["RequiredImage"] = imageName
		delete(tgs, "PlannedImage")
		err := json.WriteToFile(tagsFilename, filePerms, " ", tgs)
		if err != nil {
			return err
		}
		if _, err := gitCommand(*topologyDir, "add", tagsFilename); err != nil {
			return err
		}
		var locationInsert string
		if *location != "" {
			locationInsert = "in " + *location + " "
		}
		_, err = gitCommand(*topologyDir, "commit", "-m",
			fmt.Sprintf("Upgrade %sfrom %s to %s",
				locationInsert, oldImageName, imageName))
		if err != nil {
			return err
		}
		if _, err := gitCommand(*topologyDir, "push"); err != nil {
			return err
		}
	}
	logger.Printf("rollout completed in %s\n",
		format.Duration(time.Since(startTime)))
	return nil
}

func checkImage(imageServerClientResource *srpc.ClientResource,
	imageName string) (time.Time, error) {
	client, err := imageServerClientResource.GetHTTP(nil, 0)
	if err != nil {
		return time.Time{}, err
	}
	defer client.Put()
	expiresAt, err := imageclient.GetImageExpiration(client, imageName)
	if err != nil {
		return time.Time{}, err
	}
	if expiresAt.IsZero() {
		return expiresAt, nil
	}
	return expiresAt,
		imageclient.ChangeImageExpiration(client, imageName, expiresAt)
}

func closeHypervisors(hypervisors []*hypervisorType) {
	for _, hypervisor := range hypervisors {
		hypervisor.hypervisorClientResource.ScheduleClose()
		hypervisor.subClientResource.ScheduleClose()
	}
}

func getTagsForHypervisors(clientResource *srpc.ClientResource) (
	map[string]tags.Tags, error) {
	client, err := clientResource.GetHTTP(nil, 0)
	if err != nil {
		return nil, err
	}
	defer client.Close()
	conn, err := client.Call("FleetManager.GetUpdates")
	if err != nil {
		return nil, err
	}
	defer conn.Close()
	request := fm_proto.GetUpdatesRequest{Location: *location, MaxUpdates: 1}
	if err := conn.Encode(request); err != nil {
		return nil, err
	}
	if err := conn.Flush(); err != nil {
		return nil, err
	}
	var reply fm_proto.Update
	if err := conn.Decode(&reply); err != nil {
		return nil, err
	}
	if err := errors.New(reply.Error); err != nil {
		return nil, err
	}
	tagsForHypervisors := make(map[string]tags.Tags, len(reply.ChangedMachines))
	for _, machine := range reply.ChangedMachines {
		tagsForHypervisors[machine.Hostname] = machine.Tags
	}
	return tagsForHypervisors, nil
}
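
// listConnectedHypervisors returns the addresses of connected Hypervisors
// (including unhealthy ones) that the Fleet Manager reports for the
// configured location.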
func listConnectedHypervisors(clientResource *srpc.ClientResource) (
	[]string, error) {
	return listConnectedHypervisorsInLocation(clientResource, *location)
}

func listConnectedHypervisorsInLocation(clientResource *srpc.ClientResource,
	location string) ([]string, error) {
	client, err := clientResource.GetHTTP(nil, 0)
	if err != nil {
		return nil, err
	}
	defer client.Put()
	request := fm_proto.ListHypervisorsInLocationRequest{
		IncludeUnhealthy: true,
		Location:         location,
	}
	var reply fm_proto.ListHypervisorsInLocationResponse
	err = client.RequestReply("FleetManager.ListHypervisorsInLocation",
		request, &reply)
	if err != nil {
		return nil, err
	}
	if err := errors.New(reply.Error); err != nil {
		return nil, err
	}
	return reply.HypervisorAddresses, nil
}

func markUnusedHypervisors(hypervisors []*hypervisorType,
	cpuSharer cpusharer.CpuSharer) (
	map[*hypervisorType]struct{}, map[*hypervisorType]struct{}) {
	dialer := libnet.NewCpuSharingDialer(&net.Dialer{}, cpuSharer)
	waitGroup := &sync.WaitGroup{}
	for _, hypervisor_ := range hypervisors {
		waitGroup.Add(1)
		go func(h *hypervisorType) {
			defer waitGroup.Done()
			cpuSharer.GrabCpu()
			defer cpuSharer.ReleaseCpu()
			client, err := h.hypervisorClientResource.GetHTTPWithDialer(nil,
				dialer)
			if err != nil {
				h.logger.Printf("error connecting to hypervisor: %s\n", err)
				return
			}
			defer client.Put()
			request := hyper_proto.ListVMsRequest{
				IgnoreStateMask: 1<<hyper_proto.StateFailedToStart |
					1<<hyper_proto.StateStopping |
					1<<hyper_proto.StateStopped |
					1<<hyper_proto.StateDestroying,
			}
			var reply hyper_proto.ListVMsResponse
			err = client.RequestReply("Hypervisor.ListVMs", request, &reply)
			if err != nil {
				h.logger.Printf("error listing VMs: %s", err)
				return
			}
			if len(reply.IpAddresses) < 1 {
				h.noVMs = true
			}
		}(hypervisor_)
	}
	waitGroup.Wait()
	unusedHypervisors := make(map[*hypervisorType]struct{})
	usedHypervisors := make(map[*hypervisorType]struct{})
	for _, hypervisor := range hypervisors {
		if hypervisor.noVMs {
			unusedHypervisors[hypervisor] = struct{}{}
		} else {
			usedHypervisors[hypervisor] = struct{}{}
		}
	}
	return unusedHypervisors, usedHypervisors
}

func releaseImage(imageServerClientResource *srpc.ClientResource,
	imageName string, expiresAt time.Time, logger log.DebugLogger) error {
	if expiresAt.IsZero() {
		return nil
	}
	logger.Debugln(0, "releasing image")
	client, err := imageServerClientResource.GetHTTP(nil, 0)
	if err != nil {
		return err
	}
	defer client.Put()
	return imageclient.ChangeImageExpiration(client, imageName, time.Time{})
}
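
// setupHypervisor builds the per-Hypervisor state for hostname. It returns
// nil (skipping the machine) if the Hypervisor is on a different image
// stream, cannot be polled, or is already running imageName.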
func setupHypervisor(hostname string, imageName string, tgs tags.Tags,
	cpuSharer *cpusharer.FifoCpuSharer,
	logger log.DebugLogger) *hypervisorType {
	logger = prefixlogger.New(hostname+": ", logger)
	currentRequiredImage := tgs["RequiredImage"]
	if currentRequiredImage != "" &&
		path.Dir(currentRequiredImage) != path.Dir(imageName) {
		logger.Printf(
			"image stream: current=%s != new=%s, skipping\n",
			path.Dir(currentRequiredImage), path.Dir(imageName))
		return nil
	}
	h := &hypervisorType{
		healthAgentClientResource: rpcclientpool.New("tcp",
			fmt.Sprintf("%s:%d", hostname, 6910), true, ""),
		hostname: hostname,
		hypervisorClientResource: srpc.NewClientResource("tcp",
			fmt.Sprintf("%s:%d", hostname,
				constants.HypervisorPortNumber)),
		initialTags:          tgs,
		initialUnhealthyList: make(map[string]struct{}),
		logger:               logger,
		subClientResource: srpc.NewClientResource("tcp",
			fmt.Sprintf("%s:%d", hostname, constants.SubPortNumber)),
	}
	if lastImage, err := h.getLastImageName(cpuSharer); err != nil {
		logger.Printf("skipping: %s\n", err)
		return nil
	} else if lastImage == imageName {
		logger.Println("already updated, skipping")
		return nil
	} else {
		return h
	}
}

func upgradeOneThenAll(fleetManagerClientResource *srpc.ClientResource,
	imageName string, hypervisors map[*hypervisorType]struct{},
	cpuSharer *cpusharer.FifoCpuSharer, maxConcurrent uint) error {
	if len(hypervisors) < 1 {
		return nil
	}
	state := concurrent.NewStateWithLinearConcurrencyIncrease(1, maxConcurrent)
	for hypervisor := range hypervisors {
		hypervisor := hypervisor
		err := state.GoRun(func() error {
			err := hypervisor.upgrade(fleetManagerClientResource, imageName,
				cpuSharer)
			if err != nil {
				return fmt.Errorf("error upgrading: %s: %s",
					hypervisor.hostname, err)
			}
			return nil
		})
		if err != nil {
			return err
		}
	}
	return state.Reap()
}

func (h *hypervisorType) getFailingHealthChecks(
	cpuSharer *cpusharer.FifoCpuSharer,
	timeout time.Duration) ([]string, time.Time, error) {
	stopTime := time.Now().Add(timeout)
	for ; time.Until(stopTime) >= 0; cpuSharer.Sleep(time.Second) {
		if list, timestamp, err := h.getFailingHealthChecksOnce(); err == nil {
			return list, timestamp, nil
		}
	}
	return nil, time.Time{}, errors.New("timed out getting health status")
}

func (h *hypervisorType) getFailingHealthChecksOnce() (
	[]string, time.Time, error) {
	client, err := h.healthAgentClientResource.Get(nil)
	if err != nil {
		return nil, time.Time{}, err
	}
	defer client.Put()
	var metric messages.Metric
	err = client.Call("MetricsServer.GetMetric",
		"/health-checks/*/unhealthy-list", &metric)
	if err != nil {
		client.Close()
		return nil, time.Time{}, err
	}
	if list, ok := metric.Value.([]string); !ok {
		return nil, time.Time{}, errors.New("list metric is not []string")
	} else {
		if timestamp, ok := metric.TimeStamp.(time.Time); ok {
			return list, timestamp, nil
		} else {
			return list, time.Time{}, nil
		}
	}
}

func (h *hypervisorType) getLastImageName(cpuSharer *cpusharer.FifoCpuSharer) (
	string, error) {
	client, err := h.subClientResource.GetHTTP(nil, time.Second*15)
	if err != nil {
		return "", fmt.Errorf("error connecting to sub: %s", err)
	}
	defer client.Put()
	request := sub_proto.PollRequest{ShortPollOnly: true}
	var reply sub_proto.PollResponse
	if err := subclient.CallPoll(client, request, &reply); err != nil {
		client.Close()
		if err != io.EOF {
			return "", fmt.Errorf("error polling sub: %s", err)
		}
	}
	return reply.LastSuccessfulImageName, nil
}
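
// updateTagForHypervisor sets the given tag on this Hypervisor via the Fleet
// Manager; setting RequiredImage also drops any PlannedImage tag. No RPC is
// made if the resulting tags match the initial tags.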
func (h *hypervisorType) updateTagForHypervisor(
	clientResource *srpc.ClientResource, key, value string) error {
	newTags := h.initialTags.Copy()
	newTags[key] = value
	if key == "RequiredImage" {
		delete(newTags, "PlannedImage")
	}
	if h.initialTags.Equal(newTags) {
		return nil
	}
	client, err := clientResource.GetHTTP(nil, 0)
	if err != nil {
		return err
	}
	defer client.Put()
	request := fm_proto.ChangeMachineTagsRequest{
		Hostname: h.hostname,
		Tags:     newTags,
	}
	var reply fm_proto.ChangeMachineTagsResponse
	err = client.RequestReply("FleetManager.ChangeMachineTags",
		request, &reply)
	if err != nil {
		return err
	}
	return errors.New(reply.Error)
}

func (h *hypervisorType) upgrade(clientResource *srpc.ClientResource,
	imageName string, cpuSharer *cpusharer.FifoCpuSharer) error {
	cpuSharer.GrabCpu()
	defer cpuSharer.ReleaseCpu()
	list, _, err := h.getFailingHealthChecks(cpuSharer, time.Second)
	if err != nil {
		h.logger.Println(err)
		return nil
	} else if len(list) > 0 {
		for _, failed := range list {
			h.initialUnhealthyList[failed] = struct{}{}
		}
	}
	h.logger.Debugln(0, "upgrading")
	err = h.updateTagForHypervisor(clientResource, "RequiredImage", imageName)
	if err != nil {
		return err
	}
	stopTime := time.Now().Add(time.Minute * 15)
	updateCompleted := false
	var lastError string
	for ; time.Until(stopTime) > 0; cpuSharer.Sleep(time.Second) {
		if syncedImage, err := h.getLastImageName(cpuSharer); err != nil {
			if lastError != err.Error() {
				h.logger.Debugln(0, err)
			}
			lastError = err.Error()
			continue
		} else if syncedImage == imageName {
			updateCompleted = true
			break
		}
	}
	if !updateCompleted {
		return errors.New("timed out waiting for image update to complete")
	}
	h.logger.Debugln(0, "upgraded")
	cpuSharer.Sleep(time.Second * 15)
	list, _, err = h.getFailingHealthChecks(cpuSharer, time.Minute)
	if err != nil {
		return err
	} else {
		for _, entry := range list {
			if _, ok := h.initialUnhealthyList[entry]; !ok {
				return fmt.Errorf("health check failed: %s", entry)
			}
		}
	}
	h.logger.Debugln(0, "still healthy")
	return nil
}

func (h *hypervisorType) waitLastImageName(cpuSharer *cpusharer.FifoCpuSharer) (
	string, error) {
	stopTime := time.Now().Add(time.Minute)
	for ; time.Until(stopTime) > 0; cpuSharer.Sleep(time.Second * 5) {
		imageName, err := h.getLastImageName(cpuSharer)
		if err != nil {
			h.logger.Debugln(0, err)
			continue
		}
		return imageName, nil
	}
	return "", errors.New("timed out getting last image name")
}