github.com/in4it/ecs-deploy@v0.0.42-0.20240508120354-ed77ff16df25/api/autoscaling.go

package api

import (
	"sync"

	"github.com/in4it/ecs-deploy/provider/ecs"
	"github.com/in4it/ecs-deploy/service"
	"github.com/in4it/ecs-deploy/util"
	"github.com/juju/loggo"

	"encoding/json"
	"errors"
	"io/ioutil"
	"math"
	"net/http"
	"strconv"
	"strings"
	"time"
)

type AutoscalingController struct {
	muUp   sync.Mutex
	muDown sync.Mutex
}

var asAutoscalingControllerLogger = loggo.GetLogger("as-controller")
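// getClusterInfoWithCache and getClusterInfo return the cluster state (the
// container instances with their free CPU and memory). With caching enabled,
// a cluster record younger than 4 minutes is reused; otherwise the state is
// rebuilt from the ECS API via GetInstanceResources.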
func (c *AutoscalingController) getClusterInfoWithCache(clusterName string, s service.ServiceIf, e ecs.ECSIf) (*service.DynamoCluster, error) {
	return c.getClusterInfo(clusterName, true, s, e)
}
func (c *AutoscalingController) getClusterInfo(clusterName string, withCache bool, s service.ServiceIf, e ecs.ECSIf) (*service.DynamoCluster, error) {
	var dc *service.DynamoCluster
	var err error

	if withCache {
		dc, err = s.GetClusterInfo()
		if err != nil {
			return nil, err
		}
	}
	if dc == nil || dc.Time.Before(time.Now().Add(-4*time.Minute /* 4 minutes cache */)) {
		// no cache, need to retrieve everything
		asAutoscalingControllerLogger.Debugf("No cache found, need to retrieve using API calls")
		if dc == nil {
			dc = &service.DynamoCluster{}
		} else {
			scalingOperation := dc.ScalingOperation
			dc = &service.DynamoCluster{
				ContainerInstances: []service.DynamoClusterContainerInstance{},
				ScalingOperation:   scalingOperation,
			}
		}

		// calculate free resources
		firs, _, err := e.GetInstanceResources(clusterName)
		if err != nil {
			return nil, err
		}
		for _, f := range firs {
			var dcci service.DynamoClusterContainerInstance
			dcci.ClusterName = clusterName
			dcci.ContainerInstanceId = f.InstanceId
			dcci.AvailabilityZone = f.AvailabilityZone
			dcci.FreeMemory = f.FreeMemory
			dcci.FreeCpu = f.FreeCpu
			dcci.Status = f.Status
			dc.ContainerInstances = append(dc.ContainerInstances, dcci)
		}
	}
	return dc, nil
}

// return minimal cpu/memory resources that are needed for the cluster
func (c *AutoscalingController) getResourcesNeeded(clusterName string, cc ControllerIf) (int64, int64, error) {
	dss, _ := cc.getServices()
	memoryNeeded := make(map[string]int64)
	cpuNeeded := make(map[string]int64)
	for _, ds := range dss {
		if val, ok := memoryNeeded[ds.C]; ok {
			if ds.MemoryReservation > val {
				memoryNeeded[ds.C] = ds.MemoryReservation
			}
		} else {
			memoryNeeded[ds.C] = ds.MemoryReservation
		}
		if val, ok := cpuNeeded[ds.C]; ok {
			if ds.CpuReservation > val {
				cpuNeeded[ds.C] = ds.CpuReservation
			}
		} else {
			cpuNeeded[ds.C] = ds.CpuReservation
		}
	}
	if _, ok := memoryNeeded[clusterName]; !ok {
		return 0, 0, errors.New("Minimal Memory needed for clusterName " + clusterName + " not found")
	}
	if _, ok := cpuNeeded[clusterName]; !ok {
		return 0, 0, errors.New("Minimal CPU needed for clusterName " + clusterName + " not found")
	}
	return memoryNeeded[clusterName], cpuNeeded[clusterName], nil
}
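// getAutoscalingStrategy reads the AUTOSCALING_STRATEGIES environment variable
// (default "LargestContainerUp,LargestContainerDown") and reports, case-insensitively,
// whether the LargestContainerUp and LargestContainerDown strategies are enabled.
// For example, setting AUTOSCALING_STRATEGIES=LargestContainerDown enables only the
// scale-down strategy.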
func (c *AutoscalingController) getAutoscalingStrategy() (bool, bool) {
	// Check whether Strategy is enabled
	asStrategies := strings.Split(util.GetEnv("AUTOSCALING_STRATEGIES", "LargestContainerUp,LargestContainerDown"), ",")
	asStrategyLargestContainerUp := false
	asStrategyLargestContainerDown := false
	for _, v := range asStrategies {
		if strings.ToLower(v) == "largestcontainerup" {
			asStrategyLargestContainerUp = true
		}
		if strings.ToLower(v) == "largestcontainerdown" {
			asStrategyLargestContainerDown = true
		}
	}
	return asStrategyLargestContainerUp, asStrategyLargestContainerDown
}

// Process ECS event message and determine to scale or not
func (c *AutoscalingController) processEcsMessage(message ecs.SNSPayloadEcs) error {
	apiLogger.Debugf("found ecs notification")
	s := service.NewService()
	e := &ecs.ECS{}
	cc := &Controller{}
	autoscaling := ecs.AutoScaling{}
	// determine cluster name
	sp := strings.Split(message.Detail.ClusterArn, "/")
	if len(sp) != 2 {
		return errors.New("Could not determine cluster name from message (arn: " + message.Detail.ClusterArn + ")")
	}
	clusterName := sp[1]
	// determine max reservation
	memoryNeeded, cpuNeeded, err := c.getResourcesNeeded(clusterName, cc)
	if err != nil {
		return err
	}
	// calculate registered resources of the EC2 instance
	f, err := e.ConvertResourceToRir(message.Detail.RegisteredResources)
	if err != nil {
		return err
	}
	registeredInstanceCpu := f.RegisteredCpu
	registeredInstanceMemory := f.RegisteredMemory
	// determine minimum reservations
	dc, err := c.getClusterInfoWithCache(clusterName, s, e)
	if err != nil {
		return err
	}
	var found bool
	for k, v := range dc.ContainerInstances {
		if v.ContainerInstanceId == message.Detail.Ec2InstanceId {
			found = true
			dc.ContainerInstances[k].ClusterName = clusterName
			// get resources
			f, err := e.ConvertResourceToFir(message.Detail.RemainingResources)
			if err != nil {
				return err
			}
			dc.ContainerInstances[k].FreeMemory = f.FreeMemory
			dc.ContainerInstances[k].FreeCpu = f.FreeCpu
			// get az
			for _, v := range message.Detail.Attributes {
				if v.Name == "ecs.availability-zone" {
					dc.ContainerInstances[k].AvailabilityZone = v.Value
				}
			}
		}
	}
	if !found {
		// add element
		var dcci service.DynamoClusterContainerInstance
		dcci.ClusterName = clusterName
		dcci.ContainerInstanceId = message.Detail.Ec2InstanceId
		f, err := e.ConvertResourceToFir(message.Detail.RemainingResources)
		if err != nil {
			return err
		}
		dcci.FreeMemory = f.FreeMemory
		dcci.FreeCpu = f.FreeCpu
		dcci.Status = f.Status
		// get az
		for _, v := range message.Detail.Attributes {
			if v.Name == "ecs.availability-zone" {
				dcci.AvailabilityZone = v.Value
			}
		}
		dc.ContainerInstances = append(dc.ContainerInstances, dcci)
	}
	// check whether at min/max capacity
	autoScalingGroupName, err := autoscaling.GetAutoScalingGroupByTag(clusterName)
	if err != nil {
		return err
	}
	minSize, desiredCapacity, maxSize, err := autoscaling.GetClusterNodeDesiredCount(autoScalingGroupName)
	if err != nil {
		return err
	}
	// Check whether Strategy is enabled
	asStrategyLargestContainerUp, asStrategyLargestContainerDown := c.getAutoscalingStrategy()
	// make scaling (up) decision
	var resourcesFitGlobal bool
	var scalingOp = "no"
	var pendingScalingOp string
	if asStrategyLargestContainerUp {
		if desiredCapacity < maxSize {
			resourcesFitGlobal = c.scaleUpDecision(clusterName, dc.ContainerInstances, cpuNeeded, memoryNeeded)
			if !resourcesFitGlobal {
				cooldownMin, err := strconv.ParseInt(util.GetEnv("AUTOSCALING_UP_COOLDOWN", "5"), 10, 64)
				if err != nil {
					cooldownMin = 5
				}
				startTime := time.Now().Add(-1 * time.Duration(cooldownMin) * time.Minute)
				lastScalingOp, _, err := s.GetScalingActivity(clusterName, startTime)
				if err != nil {
					return err
				}
				if lastScalingOp == "no" {
					if util.GetEnv("AUTOSCALING_UP_STRATEGY", "immediately") == "gracefully" {
						pendingScalingOp = "up"
					} else {
						asAutoscalingControllerLogger.Infof("Initiating scaling activity")
						scalingOp = "up"
						err = autoscaling.ScaleClusterNodes(autoScalingGroupName, 1)
						if err != nil {
							return err
						}
					}
				}
			}
		}
	} else {
		// if the "LargestContainerUp" strategy is disabled, resources always fit, and scaling down always needs to be checked
		resourcesFitGlobal = true
	}
	// make scaling (down) decision
	if asStrategyLargestContainerDown && desiredCapacity > minSize && (resourcesFitGlobal || desiredCapacity == maxSize) {
		hasFreeResourcesGlobal := c.scaleDownDecision(clusterName, dc.ContainerInstances, registeredInstanceCpu, registeredInstanceMemory, cpuNeeded, memoryNeeded)
		if hasFreeResourcesGlobal {
			// check cooldown period
			cooldownMin, err := strconv.ParseInt(util.GetEnv("AUTOSCALING_DOWN_COOLDOWN", "5"), 10, 64)
			if err != nil {
				cooldownMin = 5
			}
			startTime := time.Now().Add(-1 * time.Duration(cooldownMin) * time.Minute)
			lastScalingOp, tmpPendingScalingOp, err := s.GetScalingActivity(clusterName, startTime)
			if err != nil {
				return err
			}
			// check whether there is a deploy running
			deployRunning, err := s.IsDeployRunning()
			if err != nil {
				return err
			}
			// only scale down if the cooldown period is not active and if there are no deploys currently running
			if lastScalingOp == "no" && tmpPendingScalingOp == "" && !deployRunning {
				pendingScalingOp = "down"
			}
		}
	}
	if pendingScalingOp != "" {
		// write object
		_, err = s.PutClusterInfo(*dc, clusterName, scalingOp, pendingScalingOp)
		if err != nil {
			return err
		}
		// launch scaling operation
		cc := &Controller{}
		autoscaling := &ecs.AutoScaling{}
		asAutoscalingControllerLogger.Infof("Scaling operation: scaling %s pending", pendingScalingOp)
		go c.launchProcessPendingScalingOpWithLocking(clusterName, pendingScalingOp, registeredInstanceCpu, registeredInstanceMemory, s, cc, autoscaling)
	}
	return nil
}
func (c *AutoscalingController) getAutoscalingPeriodInterval(scalingOp string) (int64, int64) {
	var period, interval int64
	var err error
	if scalingOp == "down" {
		period, err = strconv.ParseInt(util.GetEnv("AUTOSCALING_DOWN_PERIOD", "5"), 10, 64)
		if err != nil {
			period = 5
		}
		interval, err = strconv.ParseInt(util.GetEnv("AUTOSCALING_DOWN_INTERVAL", "60"), 10, 64)
		if err != nil {
			interval = 60
		}
	} else if scalingOp == "up" {
		period, err = strconv.ParseInt(util.GetEnv("AUTOSCALING_UP_PERIOD", "2"), 10, 64)
		if err != nil {
			period = 5
		}
		interval, err = strconv.ParseInt(util.GetEnv("AUTOSCALING_UP_INTERVAL", "60"), 10, 64)
		if err != nil {
			interval = 60
		}
	} else {
		return 5, 60
	}
	return period, interval
}
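// launchProcessPendingScalingOpWithLocking serializes pending scaling operations:
// scale-down operations take muDown and all other operations take muUp, so at most
// one pending scale-up and one pending scale-down are processed at a time.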
func (c *AutoscalingController) launchProcessPendingScalingOpWithLocking(clusterName, scalingOp string, registeredInstanceCpu, registeredInstanceMemory int64, s service.ServiceIf, cc ControllerIf, autoscaling ecs.AutoScalingIf) error {

	// lock scaling operation
	asAutoscalingControllerLogger.Debugf("Getting autoscaling lock for scaling %s", scalingOp)
	if scalingOp == "down" {
		c.muDown.Lock()
	} else {
		c.muUp.Lock()
	}
	// execute launchProcessPendingScalingOp
	err := c.launchProcessPendingScalingOp(clusterName, scalingOp, registeredInstanceCpu, registeredInstanceMemory, s, cc, autoscaling)
	// unlock
	asAutoscalingControllerLogger.Debugf("Releasing autoscaling lock for scaling %s", scalingOp)
	if scalingOp == "down" {
		c.muDown.Unlock()
	} else {
		c.muUp.Unlock()
	}
	if err != nil {
		asAutoscalingControllerLogger.Errorf("launchProcessPendingScalingOp error: %s", err)
		return err
	}
	return nil
}
func (c *AutoscalingController) launchProcessPendingScalingOp(clusterName, scalingOp string, registeredInstanceCpu, registeredInstanceMemory int64, s service.ServiceIf, cc ControllerIf, autoscaling ecs.AutoScalingIf) error {
	var err error
	var dcNew *service.DynamoCluster
	var sizeChange int64

	e := &ecs.ECS{}

	if scalingOp == "up" {
		sizeChange = 1
	} else if scalingOp == "down" {
		sizeChange = -1
	} else {
		return errors.New("Scalingop " + scalingOp + " not recognized")
	}

	period, interval := c.getAutoscalingPeriodInterval(scalingOp)

	var abort, deployRunning, hasFreeResourcesGlobal, resourcesFit bool
	var i int64
	for i = 0; i < period && !abort; i++ {
		time.Sleep(time.Duration(interval) * time.Second)
		dcNew, err = c.getClusterInfo(clusterName, true, s, e)
		if err != nil {
			return err
		}
		memoryNeeded, cpuNeeded, err := c.getResourcesNeeded(clusterName, cc)
		if err != nil {
			return err
		}
		// check if scaling operation is still present
		if dcNew.ScalingOperation.PendingAction != scalingOp {
			asAutoscalingControllerLogger.Infof("Abort scaling operation: scaling %s not found anymore in dynamodb (scalingOp in db: %s)", scalingOp, dcNew.ScalingOperation.PendingAction)
			abort = true
		}
		// pending scaling down logic
		if scalingOp == "down" {
			// make scaling decision
			hasFreeResourcesGlobal = c.scaleDownDecision(clusterName, dcNew.ContainerInstances, registeredInstanceCpu, registeredInstanceMemory, cpuNeeded, memoryNeeded)
			if hasFreeResourcesGlobal {
				// abort if deploy is running
				deployRunning, err = s.IsDeployRunning()
				if err != nil {
					return err
				}
				if deployRunning {
					abort = true
				}
				// abort if not all services are scheduled
				if !c.areAllTasksRunningInCluster(clusterName, cc) {
					abort = true
				}
			} else {
				abort = true
			}
		} else {
			// pending scaling up logic
			resourcesFit = c.scaleUpDecision(clusterName, dcNew.ContainerInstances, cpuNeeded, memoryNeeded)
			if resourcesFit {
				abort = true
			}
		}
	}

	if !abort {
		asAutoscalingControllerLogger.Infof("Scaling operation: scaling %s now (%d)", scalingOp, sizeChange)
		autoScalingGroupName, err := autoscaling.GetAutoScalingGroupByTag(clusterName)
		if err != nil {
			return err
		}
		err = autoscaling.ScaleClusterNodes(autoScalingGroupName, sizeChange)
		if err != nil {
			return err
		}
		_, err = s.PutClusterInfo(*dcNew, clusterName, scalingOp, "")
		if err != nil {
			return err
		}
	} else {
		asAutoscalingControllerLogger.Infof("Scaling operation: scaling %s aborted. deploy running: %v, free resources (scaling down): %v, resources fit (scaling up): %v, pendingAction: %s", scalingOp, deployRunning, hasFreeResourcesGlobal, resourcesFit, dcNew.ScalingOperation.PendingAction)
	}
	return nil
}

func (c *AutoscalingController) areAllTasksRunningInCluster(clusterName string, cc ControllerIf) bool {
	services, err := cc.describeServices()
	if err != nil {
		asAutoscalingControllerLogger.Errorf("Error while executing describeServices: %s", err)
		return false
	}
	for _, service := range services {
		if service.RunningCount != service.DesiredCount || service.PendingCount != 0 {
			asAutoscalingControllerLogger.Infof("Not all tasks are running in the cluster: Service: %s, RunningCount: %d, DesiredCount: %d, PendingCount: %d", service.ServiceName, service.RunningCount, service.DesiredCount, service.PendingCount)
			return false
		}
	}
	return true
}
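// scaleUpDecision reports whether the largest reservation (cpuNeeded/memoryNeeded)
// still fits in the cluster. It returns true only if every availability zone in the
// cluster has at least one non-draining container instance with more free CPU and
// memory than that reservation.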
func (c *AutoscalingController) scaleUpDecision(clusterName string, containerInstances []service.DynamoClusterContainerInstance, cpuNeeded, memoryNeeded int64) bool {
	resourcesFit := make(map[string]bool)
	resourcesFitGlobal := true
	for _, dcci := range containerInstances {
		if clusterName == dcci.ClusterName {
			if dcci.Status != "DRAINING" && dcci.FreeCpu > cpuNeeded && dcci.FreeMemory > memoryNeeded {
				resourcesFit[dcci.AvailabilityZone] = true
				asAutoscalingControllerLogger.Debugf("Cluster %v needs at least %v cpu and %v memory. Found instance %v (%v) with %v cpu and %v memory",
					clusterName,
					cpuNeeded,
					memoryNeeded,
					dcci.ContainerInstanceId,
					dcci.AvailabilityZone,
					dcci.FreeCpu,
					dcci.FreeMemory,
				)
			} else {
				// set resourcesFit[az] in case it's not set to true
				if _, ok := resourcesFit[dcci.AvailabilityZone]; !ok {
					resourcesFit[dcci.AvailabilityZone] = false
				}
			}
		}
	}
	for k, v := range resourcesFit {
		if !v {
			resourcesFitGlobal = false
			asAutoscalingControllerLogger.Infof("No instance found in %v with %v cpu and %v memory free", k, cpuNeeded, memoryNeeded)
		}
	}
	return resourcesFitGlobal
}
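// scaleDownDecision reports whether the cluster can lose a node. Per availability
// zone it sums the free CPU/memory of non-draining instances and compares the totals
// against the registered capacity of one instance plus the largest reservation plus a
// 50% buffer of that reservation (or just one instance's capacity when the
// LargestContainerUp strategy is disabled). With LargestContainerUp enabled, every
// zone must have that headroom; otherwise a single zone with headroom is enough.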
func (c *AutoscalingController) scaleDownDecision(clusterName string, containerInstances []service.DynamoClusterContainerInstance, instanceCpu, instanceMemory, cpuNeeded, memoryNeeded int64) bool {
	var clusterMemoryNeeded = instanceMemory + memoryNeeded            // capacity of full container node + biggest task
	clusterMemoryNeeded += int64(math.Ceil(float64(memoryNeeded) / 2)) // + buffer
	var clusterCpuNeeded = instanceCpu + cpuNeeded
	clusterCpuNeeded += int64(math.Ceil(float64(cpuNeeded) / 2)) // + buffer
	totalFreeCpu := make(map[string]int64)
	totalFreeMemory := make(map[string]int64)
	hasFreeResources := make(map[string]bool)
	hasFreeResourcesGlobal := true
	for _, dcci := range containerInstances {
		if clusterName == dcci.ClusterName {
			if dcci.Status != "DRAINING" {
				totalFreeCpu[dcci.AvailabilityZone] += dcci.FreeCpu
				totalFreeMemory[dcci.AvailabilityZone] += dcci.FreeMemory
			}
		}
	}
	asStrategyLargestContainerUp, _ := c.getAutoscalingStrategy()
	if !asStrategyLargestContainerUp { // if we're not using the LargestContainerUp strategy, scale down only when there's a full instance size of extra resources
		clusterMemoryNeeded = instanceMemory
		clusterCpuNeeded = instanceCpu
	}
	for k := range totalFreeCpu {
		asAutoscalingControllerLogger.Debugf("%v: Have %d cpu available, need %d", k, totalFreeCpu[k], clusterCpuNeeded)
		asAutoscalingControllerLogger.Debugf("%v: Have %d memory available, need %d", k, totalFreeMemory[k], clusterMemoryNeeded)
		if totalFreeCpu[k] >= clusterCpuNeeded && totalFreeMemory[k] >= clusterMemoryNeeded {
			hasFreeResources[k] = true
		} else {
			// set hasFreeResources[k] in case the map key hasn't been set to true
			if _, ok := hasFreeResources[k]; !ok {
				hasFreeResources[k] = false
			}
		}
	}
	if asStrategyLargestContainerUp {
		// when using LargestContainerUp, only scale down when all AZs have too much capacity, otherwise a scale-up will immediately be triggered
		for k, v := range hasFreeResources {
			asAutoscalingControllerLogger.Debugf("%v has free resources: %v", k, v)
			if !v {
				hasFreeResourcesGlobal = false
			}
		}
	} else {
		// when not using the LargestContainerUp strategy, set hasFreeResourcesGlobal to true if any of the nodes has too many resources
		foundTrue := false
		for k, v := range hasFreeResources {
			asAutoscalingControllerLogger.Debugf("%v has free resources: %v", k, v)
			if v {
				foundTrue = true
			}
		}
		if !foundTrue {
			hasFreeResourcesGlobal = false
		}
	}

	return hasFreeResourcesGlobal
}
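// processLifecycleMessage handles an autoscaling lifecycle-hook notification: it
// drains the terminating container instance, marks it DRAINING in the cluster
// record, and hands the lifecycle hook name and token to LaunchWaitForDrainedNode,
// which monitors the draining node in the background.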
func (c *AutoscalingController) processLifecycleMessage(message ecs.SNSPayloadLifecycle) error {
	e := ecs.ECS{}
	clusterName, err := e.GetClusterNameByInstanceId(message.Detail.EC2InstanceId)
	if err != nil {
		return err
	}
	containerInstanceArn, err := e.GetContainerInstanceArnByInstanceId(clusterName, message.Detail.EC2InstanceId)
	if err != nil {
		return err
	}
	err = e.DrainNode(clusterName, containerInstanceArn)
	if err != nil {
		return err
	}
	s := service.NewService()
	dc, err := s.GetClusterInfo()
	if err != nil {
		return err
	}
	// write new record to switch container instance to draining
	var writeRecord bool
	if dc != nil {
		for i, dcci := range dc.ContainerInstances {
			if clusterName == dcci.ClusterName && message.Detail.EC2InstanceId == dcci.ContainerInstanceId {
				dc.ContainerInstances[i].Status = "DRAINING"
				writeRecord = true
			}
		}
	}
	if writeRecord {
		s.PutClusterInfo(*dc, clusterName, "no", "")
	}
	// monitor drained node
	go e.LaunchWaitForDrainedNode(clusterName, containerInstanceArn, message.Detail.EC2InstanceId, message.Detail.AutoScalingGroupName, message.Detail.LifecycleHookName, message.Detail.LifecycleActionToken)
	return nil
}

// start autoscaling polling
func (c *AutoscalingController) startAutoscalingPollingStrategy() {
	e := ecs.ECS{}
	s := service.NewService()
	showEvents := true
	showTasks := false
	showStoppedTasks := false
	lastChecked := time.Now().Add(-1 * time.Minute)
	servicesFound := make(map[string]int)
	// init
	err := s.AutoscalingPullInit()
	if err != nil {
		asAutoscalingControllerLogger.Errorf("couldn't initialize autoscalingpull in backend: %v", err)
	}
	localId, err := c.getLocalId()
	if err != nil {
		asAutoscalingControllerLogger.Errorf("Error while getting localId: %v", err)
	}
	asAutoscalingControllerLogger.Infof("ecs-deploy local ID: %v", localId)
	for {
		// only 1 process should do the checking, lock row in dynamodb
		lock, err := s.AutoscalingPullAcquireLock(localId)
		if err != nil {
			asAutoscalingControllerLogger.Errorf("Error while setting lock for pullautoscaling: %v", err)
		}
		if lock {
			services := make(map[string][]*string)
			// get services
			var dss service.DynamoServices
			err := s.GetServices(&dss)
			if err != nil {
				asAutoscalingControllerLogger.Errorf("couldn't get services from backend: %v", err)
			}
			// describe services
			for _, ds := range dss.Services {
				services[ds.C] = append(services[ds.C], &ds.S)
			}
			for clusterName, serviceList := range services {
				rss, err := e.DescribeServicesWithOptions(clusterName, serviceList, showEvents, showTasks, showStoppedTasks, map[string]string{"sleep": "1"})
				if err != nil {
					asAutoscalingControllerLogger.Errorf("Error occurred during describe services: %v", err)
				}
				for _, rs := range rss {
					if c.checkForUnschedulableServices(rs) {
						scaled := false
						if servicesFound[clusterName+":"+rs.ServiceName] < 6 {
							servicesFound[clusterName+":"+rs.ServiceName]++
						}
						asAutoscalingControllerLogger.Debugf("Checking service %v for unschedulable tasks where desired count > running count (count: %d)", rs.ServiceName, servicesFound[clusterName+":"+rs.ServiceName])
						for _, event := range rs.Events {
							if event.CreatedAt.After(lastChecked) {
								scaled = c.scaleWhenUnschedulableMessage(clusterName, event.Message)
							}
						}
						if len(rs.Events) > 0 && servicesFound[clusterName+":"+rs.ServiceName] == 5 {
							scaled = c.scaleWhenUnschedulableMessage(clusterName, rs.Events[0].Message)
						}
						if scaled {
							servicesFound[clusterName+":"+rs.ServiceName] = 0
							// write record in dynamodb
							dc, err := s.GetClusterInfo()
							if err != nil {
								asAutoscalingControllerLogger.Debugf("Error while doing GetClusterInfo: %v", err)
							}
							_, err = s.PutClusterInfo(*dc, clusterName, "up", "")
							if err != nil {
								asAutoscalingControllerLogger.Debugf("Error while doing PutClusterInfo: %v", err)
							}
						}
					}
				}
			}
			lastChecked = time.Now()
		}
		time.Sleep(60 * time.Second)
	}
}

func (c *AutoscalingController) checkForUnschedulableServices(rs service.RunningService) bool {
	if rs.DesiredCount > rs.RunningCount {
		return true
	}
	for _, deployment := range rs.Deployments {
		if deployment.DesiredCount > deployment.RunningCount {
			return true
		}
	}
	return false
}
func (c *AutoscalingController) scaleWhenUnschedulableMessage(clusterName, message string) bool {
	if strings.Contains(message, "was unable to place a task because no container instance met all of its requirements") && strings.Contains(message, "has insufficient") {
		autoscaling := ecs.AutoScaling{}
		asAutoscalingControllerLogger.Infof("Scaling operation: scaling up now")
		autoScalingGroupName, err := autoscaling.GetAutoScalingGroupByTag(clusterName)
		if err != nil {
			asAutoscalingControllerLogger.Errorf("Error: %v", err)
		} else {
			err = autoscaling.ScaleClusterNodes(autoScalingGroupName, 1)
			if err != nil {
				asAutoscalingControllerLogger.Errorf("Error: %v", err)
			}
		}
		return true
	}
	return false
}
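// getLocalId identifies this ecs-deploy instance for the distributed polling lock.
// It queries the ECS task metadata endpoint (ECS_CONTAINER_METADATA_URI) and uses
// the task ID from the task ARN; if the metadata endpoint can't be reached, it falls
// back to a random "ecs-deploy-" prefixed ID.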
func (c *AutoscalingController) getLocalId() (string, error) {
	ret := "ecs-deploy-" + util.RandStringBytesMaskImprSrc(8)
	var task ecs.EcsTaskMetadata
	url := util.GetEnv("ECS_CONTAINER_METADATA_URI", "") + "/task"
	timeout := time.Duration(10 * time.Second)
	client := http.Client{
		Timeout: timeout,
	}
	resp, err := client.Get(url)
	if err != nil {
		return ret, err
	}
	defer resp.Body.Close()
	contents, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return ret, err
	}
	err = json.Unmarshal(contents, &task)
	if err != nil {
		return ret, err
	}
	split := strings.Split(task.TaskARN, "task/")
	if len(split) != 2 {
		return ret, err
	}
	return split[1], nil
}