// github.com/Cloud-Foundations/Dominator@v0.3.4/cmd/hyper-control/rolloutImage.go

package main

import (
	"fmt"
	"io"
	"math"
	"net"
	"os"
	"os/exec"
	"path"
	"path/filepath"
	"sync"
	"syscall"
	"time"

	imageclient "github.com/Cloud-Foundations/Dominator/imageserver/client"
	"github.com/Cloud-Foundations/Dominator/lib/concurrent"
	"github.com/Cloud-Foundations/Dominator/lib/constants"
	"github.com/Cloud-Foundations/Dominator/lib/cpusharer"
	"github.com/Cloud-Foundations/Dominator/lib/errors"
	"github.com/Cloud-Foundations/Dominator/lib/format"
	"github.com/Cloud-Foundations/Dominator/lib/json"
	"github.com/Cloud-Foundations/Dominator/lib/log"
	"github.com/Cloud-Foundations/Dominator/lib/log/prefixlogger"
	libnet "github.com/Cloud-Foundations/Dominator/lib/net"
	"github.com/Cloud-Foundations/Dominator/lib/rpcclientpool"
	"github.com/Cloud-Foundations/Dominator/lib/srpc"
	"github.com/Cloud-Foundations/Dominator/lib/tags"
	fm_proto "github.com/Cloud-Foundations/Dominator/proto/fleetmanager"
	hyper_proto "github.com/Cloud-Foundations/Dominator/proto/hypervisor"
	sub_proto "github.com/Cloud-Foundations/Dominator/proto/sub"
	subclient "github.com/Cloud-Foundations/Dominator/sub/client"
	"github.com/Cloud-Foundations/tricorder/go/tricorder/messages"
)

const (
	filePerms = syscall.S_IRUSR | syscall.S_IWUSR | syscall.S_IRGRP |
		syscall.S_IROTH
)

type hypervisorType struct {
	alreadyUpdated            bool
	healthAgentClientResource *rpcclientpool.ClientResource
	hostname                  string
	hypervisorClientResource  *srpc.ClientResource
	initialTags               tags.Tags
	initialUnhealthyList      map[string]struct{}
	logger                    log.DebugLogger
	noVMs                     bool
	subClientResource         *srpc.ClientResource
}

func rolloutImageSubcommand(args []string, logger log.DebugLogger) error {
	err := rolloutImage(args[0], logger)
	if err != nil {
		return fmt.Errorf("error rolling out image: %s", err)
	}
	return nil
}

func checkCertificates(predictedDuration time.Duration) error {
	predictedFinish := time.Now().Add(predictedDuration)
	if srpc.GetEarliestClientCertExpiration().Before(predictedFinish) {
		return fmt.Errorf("a certificate expires before: %s", predictedFinish)
	}
	return nil
}

func extendImageLifetime(imageServerClientResource *srpc.ClientResource,
	imageName string, expiresAt time.Time, predictedDuration time.Duration,
	logger log.DebugLogger) error {
	if expiresAt.IsZero() {
		return nil
	}
	if time.Until(expiresAt) >= predictedDuration {
		return nil
	}
	newExpiration := time.Now().Add(predictedDuration)
	logger.Debugf(0, "extending image lifetime by %s\n",
		format.Duration(time.Until(newExpiration)))
	client, err := imageServerClientResource.GetHTTP(nil, 0)
	if err != nil {
		return err
	}
	defer client.Put()
	return imageclient.ChangeImageExpiration(client, imageName, newExpiration)
}

func gitCommand(repositoryDirectory string, command ...string) ([]byte, error) {
	cmd := exec.Command("git", command...)
	cmd.Dir = repositoryDirectory
	cmd.Stderr = os.Stderr
	if output, err := cmd.Output(); err != nil {
		return nil, fmt.Errorf("error running git %v: %s", cmd.Args, err)
	} else {
		return output, nil
	}
}

// rolloutImage upgrades all connected Hypervisors in the configured location
// to imageName: Hypervisors without VMs are upgraded first, then those
// hosting VMs, after which the image is released (made non-expiring) and,
// if a topology repository is configured, recorded as the RequiredImage tag.
func rolloutImage(imageName string, logger log.DebugLogger) error {
	startTime := time.Now()
	cpuSharer := cpusharer.NewFifoCpuSharer()
	if *topologyDir != "" {
		logger.Debugln(0, "updating Git repository")
		stdout, err := gitCommand(*topologyDir, "status", "--porcelain")
		if err != nil {
			return err
		}
		if len(stdout) > 0 {
			return errors.New("Git repository is not clean")
		}
		if _, err := gitCommand(*topologyDir, "pull"); err != nil {
			return err
		}
	}
	logger.Debugln(0, "checking image")
	imageServerClientResource := srpc.NewClientResource("tcp",
		fmt.Sprintf("%s:%d", *imageServerHostname, *imageServerPortNum))
	defer imageServerClientResource.ScheduleClose()
	expiresAt, err := checkImage(imageServerClientResource, imageName)
	if err != nil {
		return err
	}
	fleetManagerClientResource := srpc.NewClientResource("tcp",
		fmt.Sprintf("%s:%d", *fleetManagerHostname, *fleetManagerPortNum))
	defer fleetManagerClientResource.ScheduleClose()
	logger.Debugln(0, "finding good Hypervisors")
	hypervisorAddresses, err := listConnectedHypervisors(
		fleetManagerClientResource)
	if err != nil {
		return err
	}
	hypervisors := make([]*hypervisorType, 0, len(hypervisorAddresses))
	// Defer via a closure so that Hypervisors appended below are closed too:
	// a plain defer would capture the (still empty) slice value here.
	defer func() { closeHypervisors(hypervisors) }()
	tagsForHypervisors, err := getTagsForHypervisors(fleetManagerClientResource)
	logger.Debugln(0, "checking and tagging Hypervisors")
	if err != nil {
		return fmt.Errorf("failure getting tags: %s", err)
	}
	hypervisorsChannel := make(chan *hypervisorType, len(hypervisorAddresses))
	for _, address := range hypervisorAddresses {
		if hostname, _, err := net.SplitHostPort(address); err != nil {
			return err
		} else {
			go func(hostname string) {
				cpuSharer.GrabCpu()
				defer cpuSharer.ReleaseCpu()
				hypervisor := setupHypervisor(hostname, imageName,
					tagsForHypervisors[hostname], cpuSharer, logger)
				hypervisorsChannel <- hypervisor
			}(hostname)
		}
	}
	numAlreadyUpdated := 0
	for range hypervisorAddresses {
		if hypervisor := <-hypervisorsChannel; hypervisor != nil {
			if hypervisor.alreadyUpdated {
				numAlreadyUpdated++
				continue
			}
			err := hypervisor.updateTagForHypervisor(
				fleetManagerClientResource, "PlannedImage", imageName)
			if err != nil {
				return fmt.Errorf("%s: failure updating tags: %s",
					hypervisor.hostname, err)
			}
			hypervisors = append(hypervisors, hypervisor)
		}
	}
	if numAlreadyUpdated == len(hypervisorAddresses) {
		return releaseImage(imageServerClientResource, imageName, expiresAt,
			logger)
	}
	if len(hypervisors) < 1 {
		return errors.New("no hypervisors to update")
	}
	logger.Debugln(0, "splitting unused/used Hypervisors")
	unusedHypervisors, usedHypervisors := markUnusedHypervisors(hypervisors,
		cpuSharer)
	logger.Debugf(0, "%d unused, %d used Hypervisors\n",
		len(unusedHypervisors), len(usedHypervisors))
	// Estimate the rollout duration: concurrency ramps up linearly within
	// each pool, so the number of upgrade waves is roughly sqrt(2*N) per
	// pool; allow five minutes per wave.
	numSteps := math.Sqrt(float64(len(unusedHypervisors)*2)) +
		math.Sqrt(float64(len(usedHypervisors)*2))
	predictedDuration := time.Minute * 5 * time.Duration(numSteps)
	if err := checkCertificates(predictedDuration); err != nil {
		return err
	}
	err = extendImageLifetime(imageServerClientResource, imageName, expiresAt,
		predictedDuration, logger)
	if err != nil {
		return err
	}
	logger.Debugln(0, "upgrading unused Hypervisors")
	err = upgradeOneThenAll(fleetManagerClientResource, imageName,
		unusedHypervisors, cpuSharer, uint(len(unusedHypervisors)))
	if err != nil {
		return err
	}
	numConcurrent := uint(len(usedHypervisors) / 2)
	if numConcurrent < 1 {
		numConcurrent = 1
	} else if numConcurrent > uint(len(unusedHypervisors)) {
		numConcurrent = 1
	} else if numConcurrent*10 < uint(len(usedHypervisors)) {
		numConcurrent++
	}
	logger.Debugln(0, "upgrading used Hypervisors")
	err = upgradeOneThenAll(fleetManagerClientResource, imageName,
		usedHypervisors, cpuSharer, numConcurrent)
	if err != nil {
		return err
	}
	err = releaseImage(imageServerClientResource, imageName, expiresAt, logger)
	if err != nil {
		return err
	}
	if *topologyDir != "" {
		var tgs tags.Tags
		tagsFilename := filepath.Join(*topologyDir, *location, "tags.json")
		if err := json.ReadFromFile(tagsFilename, &tgs); err != nil {
			if !os.IsNotExist(err) {
				return err
			}
			tgs = make(tags.Tags)
		}
		oldImageName := tgs["RequiredImage"]
		tgs["RequiredImage"] = imageName
		delete(tgs, "PlannedImage")
		err := json.WriteToFile(tagsFilename, filePerms, " ", tgs)
		if err != nil {
			return err
		}
		if _, err := gitCommand(*topologyDir, "add", tagsFilename); err != nil {
			return err
		}
		var locationInsert string
		if *location != "" {
			locationInsert = "in " + *location + " "
		}
		_, err = gitCommand(*topologyDir, "commit", "-m",
			fmt.Sprintf("Upgrade %sfrom %s to %s",
				locationInsert, oldImageName, imageName))
		if err != nil {
			return err
		}
		if _, err := gitCommand(*topologyDir, "push"); err != nil {
			return err
		}
	}
	logger.Printf("rollout completed in %s\n",
		format.Duration(time.Since(startTime)))
	return nil
}

func checkImage(imageServerClientResource *srpc.ClientResource,
	imageName string) (time.Time, error) {
	client, err := imageServerClientResource.GetHTTP(nil, 0)
	if err != nil {
		return time.Time{}, err
	}
	defer client.Put()
	expiresAt, err := imageclient.GetImageExpiration(client, imageName)
	if err != nil {
		return time.Time{}, err
	}
	if expiresAt.IsZero() {
		return expiresAt, nil
	}
	return expiresAt,
		imageclient.ChangeImageExpiration(client, imageName, expiresAt)
}

func closeHypervisors(hypervisors []*hypervisorType) {
	for _, hypervisor := range hypervisors {
		hypervisor.hypervisorClientResource.ScheduleClose()
		hypervisor.subClientResource.ScheduleClose()
	}
}

func getTagsForHypervisors(clientResource *srpc.ClientResource) (
	map[string]tags.Tags, error) {
	client, err := clientResource.GetHTTP(nil, 0)
	if err != nil {
		return nil, err
	}
	defer client.Close()
	conn, err := client.Call("FleetManager.GetUpdates")
	if err != nil {
		return nil, err
	}
	defer conn.Close()
	request := fm_proto.GetUpdatesRequest{Location: *location, MaxUpdates: 1}
	if err := conn.Encode(request); err != nil {
		return nil, err
	}
	if err := conn.Flush(); err != nil {
		return nil, err
	}
	var reply fm_proto.Update
	if err := conn.Decode(&reply); err != nil {
		return nil, err
	}
	if err := errors.New(reply.Error); err != nil {
		return nil, err
	}
	tagsForHypervisors :=
		make(map[string]tags.Tags, len(reply.ChangedMachines))
	for _, machine := range reply.ChangedMachines {
		tagsForHypervisors[machine.Hostname] = machine.Tags
	}
	return tagsForHypervisors, nil
}

func listConnectedHypervisors(clientResource *srpc.ClientResource) (
	[]string, error) {
	return listConnectedHypervisorsInLocation(clientResource, *location)
}

func listConnectedHypervisorsInLocation(clientResource *srpc.ClientResource,
	location string) ([]string, error) {
	client, err := clientResource.GetHTTP(nil, 0)
	if err != nil {
		return nil, err
	}
	defer client.Put()
	request := fm_proto.ListHypervisorsInLocationRequest{
		IncludeUnhealthy: true,
		Location:         location,
	}
	var reply fm_proto.ListHypervisorsInLocationResponse
	err = client.RequestReply("FleetManager.ListHypervisorsInLocation",
		request, &reply)
	if err != nil {
		return nil, err
	}
	if err := errors.New(reply.Error); err != nil {
		return nil, err
	}
	return reply.HypervisorAddresses, nil
}

func markUnusedHypervisors(hypervisors []*hypervisorType,
	cpuSharer cpusharer.CpuSharer) (
	map[*hypervisorType]struct{}, map[*hypervisorType]struct{}) {
	dialer := libnet.NewCpuSharingDialer(&net.Dialer{}, cpuSharer)
	waitGroup := &sync.WaitGroup{}
	for _, hypervisor_ := range hypervisors {
		waitGroup.Add(1)
		go func(h *hypervisorType) {
			defer waitGroup.Done()
			cpuSharer.GrabCpu()
			defer cpuSharer.ReleaseCpu()
			client, err := h.hypervisorClientResource.GetHTTPWithDialer(nil,
				dialer)
			if err != nil {
				h.logger.Printf("error connecting to hypervisor: %s\n", err)
				return
			}
			defer client.Put()
			request := hyper_proto.ListVMsRequest{
				IgnoreStateMask: 1<<hyper_proto.StateFailedToStart |
					1<<hyper_proto.StateStopping |
					1<<hyper_proto.StateStopped |
					1<<hyper_proto.StateDestroying,
			}
			var reply hyper_proto.ListVMsResponse
			err = client.RequestReply("Hypervisor.ListVMs", request, &reply)
			if err != nil {
				h.logger.Printf("error listing VMs: %s\n", err)
				return
			}
			if len(reply.IpAddresses) < 1 {
				h.noVMs = true
			}
		}(hypervisor_)
	}
	waitGroup.Wait()
	unusedHypervisors := make(map[*hypervisorType]struct{})
	usedHypervisors := make(map[*hypervisorType]struct{})
	for _, hypervisor := range hypervisors {
		if hypervisor.noVMs {
			unusedHypervisors[hypervisor] = struct{}{}
		} else {
			usedHypervisors[hypervisor] = struct{}{}
		}
	}
	return unusedHypervisors, usedHypervisors
}

func releaseImage(imageServerClientResource *srpc.ClientResource,
	imageName string, expiresAt time.Time, logger log.DebugLogger) error {
	if expiresAt.IsZero() {
		logger.Debugln(1, "image already released")
		return nil
	}
	logger.Debugln(0, "releasing image")
	client, err := imageServerClientResource.GetHTTP(nil, 0)
	if err != nil {
		return err
	}
	defer client.Put()
	return imageclient.ChangeImageExpiration(client, imageName, time.Time{})
}

func setupHypervisor(hostname string, imageName string, tgs tags.Tags,
	cpuSharer *cpusharer.FifoCpuSharer,
	logger log.DebugLogger) *hypervisorType {
	logger = prefixlogger.New(hostname+": ", logger)
	currentRequiredImage := tgs["RequiredImage"]
	if currentRequiredImage != "" &&
		path.Dir(currentRequiredImage) != path.Dir(imageName) {
		logger.Printf(
			"image stream: current=%s != new=%s, skipping\n",
			path.Dir(currentRequiredImage), path.Dir(imageName))
		return nil
	}
	h := &hypervisorType{
		healthAgentClientResource: rpcclientpool.New("tcp",
			fmt.Sprintf("%s:%d", hostname, 6910), true, ""),
		hostname: hostname,
		hypervisorClientResource: srpc.NewClientResource("tcp",
			fmt.Sprintf("%s:%d", hostname,
				constants.HypervisorPortNumber)),
		initialTags:          tgs,
		initialUnhealthyList: make(map[string]struct{}),
		logger:               logger,
		subClientResource: srpc.NewClientResource("tcp",
			fmt.Sprintf("%s:%d", hostname, constants.SubPortNumber)),
	}
	if lastImage, err := h.getLastImageName(cpuSharer); err != nil {
		logger.Printf("skipping: %s\n", err)
		return nil
	} else if lastImage == imageName {
		logger.Println("already updated, skipping")
		h.alreadyUpdated = true
		return h
	} else {
		return h
	}
}

func upgradeOneThenAll(fleetManagerClientResource *srpc.ClientResource,
	imageName string, hypervisors map[*hypervisorType]struct{},
	cpuSharer *cpusharer.FifoCpuSharer, maxConcurrent uint) error {
	if len(hypervisors) < 1 {
		return nil
	}
	state := concurrent.NewStateWithLinearConcurrencyIncrease(1, maxConcurrent)
	for hypervisor := range hypervisors {
		hypervisor := hypervisor
		err := state.GoRun(func() error {
			err := hypervisor.upgrade(fleetManagerClientResource, imageName,
				cpuSharer)
			if err != nil {
				return fmt.Errorf("error upgrading: %s: %s",
					hypervisor.hostname, err)
			}
			return nil
		})
		if err != nil {
			return err
		}
	}
	return state.Reap()
}

func (h *hypervisorType) getFailingHealthChecks(
	cpuSharer *cpusharer.FifoCpuSharer,
	timeout time.Duration) ([]string, time.Time, error) {
	stopTime := time.Now().Add(timeout)
	for ; time.Until(stopTime) >= 0; cpuSharer.Sleep(time.Second) {
		if list, timestamp, err := h.getFailingHealthChecksOnce(); err == nil {
			return list, timestamp, nil
		}
	}
	return nil, time.Time{}, errors.New("timed out getting health status")
}

func (h *hypervisorType) getFailingHealthChecksOnce() (
	[]string, time.Time, error) {
	client, err := h.healthAgentClientResource.Get(nil)
	if err != nil {
		return nil, time.Time{}, err
	}
	defer client.Put()
	var metric messages.Metric
	err = client.Call("MetricsServer.GetMetric",
		"/health-checks/*/unhealthy-list", &metric)
	if err != nil {
		client.Close()
		return nil, time.Time{}, err
	}
	if list, ok := metric.Value.([]string); !ok {
		return nil, time.Time{}, errors.New("list metric is not []string")
	} else {
		if timestamp, ok := metric.TimeStamp.(time.Time); ok {
			return list, timestamp, nil
		} else {
			return list, time.Time{}, nil
		}
	}
}

func (h *hypervisorType) getLastImageName(cpuSharer *cpusharer.FifoCpuSharer) (
	string, error) {
	client, err := h.subClientResource.GetHTTP(nil, time.Second*15)
	if err != nil {
		return "", fmt.Errorf("error connecting to sub: %s", err)
	}
	defer client.Put()
	request := sub_proto.PollRequest{ShortPollOnly: true}
	var reply sub_proto.PollResponse
	if err := subclient.CallPoll(client, request, &reply); err != nil {
		client.Close()
		if err != io.EOF {
			return "", fmt.Errorf("error polling sub: %s", err)
		}
	}
	return reply.LastSuccessfulImageName, nil
}

func (h *hypervisorType) updateTagForHypervisor(
	clientResource *srpc.ClientResource, key, value string) error {
	newTags := h.initialTags.Copy()
	newTags[key] = value
	if key == "RequiredImage" {
		delete(newTags, "PlannedImage")
	}
	if h.initialTags.Equal(newTags) {
		return nil
	}
	client, err := clientResource.GetHTTP(nil, 0)
	if err != nil {
		return err
	}
	defer client.Put()
	request := fm_proto.ChangeMachineTagsRequest{
		Hostname: h.hostname,
		Tags:     newTags,
	}
	var reply fm_proto.ChangeMachineTagsResponse
	err = client.RequestReply("FleetManager.ChangeMachineTags",
		request, &reply)
	if err != nil {
		return err
	}
	return errors.New(reply.Error)
}

func (h *hypervisorType) upgrade(clientResource *srpc.ClientResource,
	imageName string, cpuSharer *cpusharer.FifoCpuSharer) error {
	cpuSharer.GrabCpu()
	defer cpuSharer.ReleaseCpu()
	// Record health checks that are already failing so that only new
	// failures count against the upgrade; if health status cannot be
	// fetched, skip this Hypervisor.
	list, _, err := h.getFailingHealthChecks(cpuSharer, time.Second)
	if err != nil {
		h.logger.Println(err)
		return nil
	} else if len(list) > 0 {
		for _, failed := range list {
			h.initialUnhealthyList[failed] = struct{}{}
		}
	}
	h.logger.Debugln(0, "upgrading")
	err = h.updateTagForHypervisor(clientResource, "RequiredImage", imageName)
	if err != nil {
		return err
	}
	stopTime := time.Now().Add(time.Minute * 15)
	updateCompleted := false
	var lastError string
	for ; time.Until(stopTime) > 0; cpuSharer.Sleep(time.Second) {
		if syncedImage, err := h.getLastImageName(cpuSharer); err != nil {
			if lastError != err.Error() {
				h.logger.Debugln(0, err)
			}
			lastError = err.Error()
			continue
		} else if syncedImage == imageName {
			updateCompleted = true
			break
		}
	}
	if !updateCompleted {
		return errors.New("timed out waiting for image update to complete")
	}
	h.logger.Debugln(0, "upgraded")
	cpuSharer.Sleep(time.Second * 15)
	// Only health checks that were not failing before the upgrade are
	// treated as upgrade failures.
	list, _, err = h.getFailingHealthChecks(cpuSharer, time.Minute)
	if err != nil {
		return err
	} else {
		for _, entry := range list {
			if _, ok := h.initialUnhealthyList[entry]; !ok {
				return fmt.Errorf("health check failed: %s", entry)
			}
		}
	}
	h.logger.Debugln(0, "still healthy")
	return nil
}

func (h *hypervisorType) waitLastImageName(cpuSharer *cpusharer.FifoCpuSharer) (
	string, error) {
	stopTime := time.Now().Add(time.Minute)
	for ; time.Until(stopTime) > 0; cpuSharer.Sleep(time.Second * 5) {
		imageName, err := h.getLastImageName(cpuSharer)
		if err != nil {
			h.logger.Debugln(0, err)
			continue
		}
		return imageName, nil
	}
	return "", errors.New("timed out getting last image name")
}