github.com/in4it/ecs-deploy@v0.0.42-0.20240508120354-ed77ff16df25/api/autoscaling.go

package api

import (
	"sync"

	"github.com/in4it/ecs-deploy/provider/ecs"
	"github.com/in4it/ecs-deploy/service"
	"github.com/in4it/ecs-deploy/util"
	"github.com/juju/loggo"

	"encoding/json"
	"errors"
	"io/ioutil"
	"math"
	"net/http"
	"strconv"
	"strings"
	"time"
)

type AutoscalingController struct {
	muUp   sync.Mutex
	muDown sync.Mutex
}

var asAutoscalingControllerLogger = loggo.GetLogger("as-controller")
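
// getClusterInfoWithCache returns the cluster state, preferring the cached
// copy in the backend store over fresh API calls.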
func (c *AutoscalingController) getClusterInfoWithCache(clusterName string, s service.ServiceIf, e ecs.ECSIf) (*service.DynamoCluster, error) {
	return c.getClusterInfo(clusterName, true, s, e)
}
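
// getClusterInfo returns the cluster's container instances with their free
// resources. With withCache set, a cached record younger than four minutes is
// returned as-is; otherwise the data is rebuilt from the ECS API, keeping any
// scaling operation in progress.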
func (c *AutoscalingController) getClusterInfo(clusterName string, withCache bool, s service.ServiceIf, e ecs.ECSIf) (*service.DynamoCluster, error) {
	var dc *service.DynamoCluster
	var err error

	if withCache {
		dc, err = s.GetClusterInfo()
		if err != nil {
			return nil, err
		}
	}
	if dc == nil || dc.Time.Before(time.Now().Add(-4*time.Minute /* 4 minutes cache */)) {
		// no cache, need to retrieve everything
		asAutoscalingControllerLogger.Debugf("No cache found, need to retrieve using API calls")
		if dc == nil {
			dc = &service.DynamoCluster{}
		} else {
			scalingOperation := dc.ScalingOperation
			dc = &service.DynamoCluster{
				ContainerInstances: []service.DynamoClusterContainerInstance{},
				ScalingOperation:   scalingOperation,
			}
		}

		// calculate free resources
		firs, _, err := e.GetInstanceResources(clusterName)
		if err != nil {
			return nil, err
		}
		for _, f := range firs {
			var dcci service.DynamoClusterContainerInstance
			dcci.ClusterName = clusterName
			dcci.ContainerInstanceId = f.InstanceId
			dcci.AvailabilityZone = f.AvailabilityZone
			dcci.FreeMemory = f.FreeMemory
			dcci.FreeCpu = f.FreeCpu
			dcci.Status = f.Status
			dc.ContainerInstances = append(dc.ContainerInstances, dcci)
		}
	}
	return dc, nil
}

// getResourcesNeeded returns the largest cpu/memory reservation of any
// service in the cluster: the minimum free capacity a node must offer to be
// able to place every service.
func (c *AutoscalingController) getResourcesNeeded(clusterName string, cc ControllerIf) (int64, int64, error) {
	dss, _ := cc.getServices()
	memoryNeeded := make(map[string]int64)
	cpuNeeded := make(map[string]int64)
	for _, ds := range dss {
		if val, ok := memoryNeeded[ds.C]; ok {
			if ds.MemoryReservation > val {
				memoryNeeded[ds.C] = ds.MemoryReservation
			}
		} else {
			memoryNeeded[ds.C] = ds.MemoryReservation
		}
		if val, ok := cpuNeeded[ds.C]; ok {
			if ds.CpuReservation > val {
				cpuNeeded[ds.C] = ds.CpuReservation
			}
		} else {
			cpuNeeded[ds.C] = ds.CpuReservation
		}
	}
	if _, ok := memoryNeeded[clusterName]; !ok {
		return 0, 0, errors.New("minimal memory needed for cluster " + clusterName + " not found")
	}
	if _, ok := cpuNeeded[clusterName]; !ok {
		return 0, 0, errors.New("minimal CPU needed for cluster " + clusterName + " not found")
	}
	return memoryNeeded[clusterName], cpuNeeded[clusterName], nil
}
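
// getAutoscalingStrategy reports whether the LargestContainerUp and
// LargestContainerDown strategies are enabled in AUTOSCALING_STRATEGIES.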
func (c *AutoscalingController) getAutoscalingStrategy() (bool, bool) {
	// Check whether Strategy is enabled
	asStrategies := strings.Split(util.GetEnv("AUTOSCALING_STRATEGIES", "LargestContainerUp,LargestContainerDown"), ",")
	asStrategyLargestContainerUp := false
	asStrategyLargestContainerDown := false
	for _, v := range asStrategies {
		if strings.ToLower(v) == "largestcontainerup" {
			asStrategyLargestContainerUp = true
		}
		if strings.ToLower(v) == "largestcontainerdown" {
			asStrategyLargestContainerDown = true
		}
	}
	return asStrategyLargestContainerUp, asStrategyLargestContainerDown
}

// processEcsMessage processes an ECS container-instance state change event
// and decides whether the cluster should scale up or down.
func (c *AutoscalingController) processEcsMessage(message ecs.SNSPayloadEcs) error {
	apiLogger.Debugf("found ecs notification")
	s := service.NewService()
	e := &ecs.ECS{}
	cc := &Controller{}
	autoscaling := ecs.AutoScaling{}
	// determine cluster name
	sp := strings.Split(message.Detail.ClusterArn, "/")
	if len(sp) != 2 {
		return errors.New("could not determine cluster name from message (arn: " + message.Detail.ClusterArn + ")")
	}
	clusterName := sp[1]
	// determine max reservation
	memoryNeeded, cpuNeeded, err := c.getResourcesNeeded(clusterName, cc)
	if err != nil {
		return err
	}
	// calculate registered resources of the EC2 instance
	f, err := e.ConvertResourceToRir(message.Detail.RegisteredResources)
	if err != nil {
		return err
	}
	registeredInstanceCpu := f.RegisteredCpu
	registeredInstanceMemory := f.RegisteredMemory
	// get the current cluster state (free resources per container instance)
	dc, err := c.getClusterInfoWithCache(clusterName, s, e)
	if err != nil {
		return err
	}
	var found bool
	for k, v := range dc.ContainerInstances {
		if v.ContainerInstanceId == message.Detail.Ec2InstanceId {
			found = true
			dc.ContainerInstances[k].ClusterName = clusterName
			// get resources
			f, err := e.ConvertResourceToFir(message.Detail.RemainingResources)
			if err != nil {
				return err
			}
			dc.ContainerInstances[k].FreeMemory = f.FreeMemory
			dc.ContainerInstances[k].FreeCpu = f.FreeCpu
			// get az
			for _, v := range message.Detail.Attributes {
				if v.Name == "ecs.availability-zone" {
					dc.ContainerInstances[k].AvailabilityZone = v.Value
				}
			}
		}
	}
	if !found {
		// add element
		var dcci service.DynamoClusterContainerInstance
		dcci.ClusterName = clusterName
		dcci.ContainerInstanceId = message.Detail.Ec2InstanceId
		f, err := e.ConvertResourceToFir(message.Detail.RemainingResources)
		if err != nil {
			return err
		}
		dcci.FreeMemory = f.FreeMemory
		dcci.FreeCpu = f.FreeCpu
		dcci.Status = f.Status
		// get az
		for _, v := range message.Detail.Attributes {
			if v.Name == "ecs.availability-zone" {
				dcci.AvailabilityZone = v.Value
			}
		}
		dc.ContainerInstances = append(dc.ContainerInstances, dcci)
	}
	// check whether at min/max capacity
	autoScalingGroupName, err := autoscaling.GetAutoScalingGroupByTag(clusterName)
	if err != nil {
		return err
	}
	minSize, desiredCapacity, maxSize, err := autoscaling.GetClusterNodeDesiredCount(autoScalingGroupName)
	if err != nil {
		return err
	}
	// Check whether Strategy is enabled
	asStrategyLargestContainerUp, asStrategyLargestContainerDown := c.getAutoscalingStrategy()
	// make scaling (up) decision
	var resourcesFitGlobal bool
	var scalingOp = "no"
	var pendingScalingOp string
	if asStrategyLargestContainerUp {
		if desiredCapacity < maxSize {
			resourcesFitGlobal = c.scaleUpDecision(clusterName, dc.ContainerInstances, cpuNeeded, memoryNeeded)
			if !resourcesFitGlobal {
				cooldownMin, err := strconv.ParseInt(util.GetEnv("AUTOSCALING_UP_COOLDOWN", "5"), 10, 64)
				if err != nil {
					cooldownMin = 5
				}
				startTime := time.Now().Add(-1 * time.Duration(cooldownMin) * time.Minute)
				lastScalingOp, _, err := s.GetScalingActivity(clusterName, startTime)
				if err != nil {
					return err
				}
				if lastScalingOp == "no" {
					if util.GetEnv("AUTOSCALING_UP_STRATEGY", "immediately") == "gracefully" {
						pendingScalingOp = "up"
					} else {
						asAutoscalingControllerLogger.Infof("Initiating scaling activity")
						scalingOp = "up"
						err = autoscaling.ScaleClusterNodes(autoScalingGroupName, 1)
						if err != nil {
							return err
						}
					}
				}
			}
		}
	} else {
		// if the "largestContainerUp" strategy is disabled, resources always fit, and scaling down always needs to be checked
		resourcesFitGlobal = true
	}
	// make scaling (down) decision
	if asStrategyLargestContainerDown && desiredCapacity > minSize && (resourcesFitGlobal || desiredCapacity == maxSize) {
		hasFreeResourcesGlobal := c.scaleDownDecision(clusterName, dc.ContainerInstances, registeredInstanceCpu, registeredInstanceMemory, cpuNeeded, memoryNeeded)
		if hasFreeResourcesGlobal {
			// check cooldown period
			cooldownMin, err := strconv.ParseInt(util.GetEnv("AUTOSCALING_DOWN_COOLDOWN", "5"), 10, 64)
			if err != nil {
				cooldownMin = 5
			}
			startTime := time.Now().Add(-1 * time.Duration(cooldownMin) * time.Minute)
			lastScalingOp, tmpPendingScalingOp, err := s.GetScalingActivity(clusterName, startTime)
			if err != nil {
				return err
			}
			// check whether there is a deploy running
			deployRunning, err := s.IsDeployRunning()
			if err != nil {
				return err
			}
			// only scale down if the cooldown period is not active and if there are no deploys currently running
			if lastScalingOp == "no" && tmpPendingScalingOp == "" && !deployRunning {
				pendingScalingOp = "down"
			}
		}
	}
	if pendingScalingOp != "" {
		// write object
		_, err = s.PutClusterInfo(*dc, clusterName, scalingOp, pendingScalingOp)
		if err != nil {
			return err
		}
		// launch scaling operation
		cc := &Controller{}
		autoscaling := &ecs.AutoScaling{}
		asAutoscalingControllerLogger.Infof("Scaling operation: scaling %s pending", pendingScalingOp)
		go c.launchProcessPendingScalingOpWithLocking(clusterName, pendingScalingOp, registeredInstanceCpu, registeredInstanceMemory, s, cc, autoscaling)
	}
	return nil
}
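
// getAutoscalingPeriodInterval returns how many times (period) and how often
// (interval, in seconds) a pending scaling operation is re-evaluated before
// it is executed; both are configurable per direction via environment
// variables.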
func (c *AutoscalingController) getAutoscalingPeriodInterval(scalingOp string) (int64, int64) {
	var period, interval int64
	var err error
	if scalingOp == "down" {
		period, err = strconv.ParseInt(util.GetEnv("AUTOSCALING_DOWN_PERIOD", "5"), 10, 64)
		if err != nil {
			period = 5
		}
		interval, err = strconv.ParseInt(util.GetEnv("AUTOSCALING_DOWN_INTERVAL", "60"), 10, 64)
		if err != nil {
			interval = 60
		}
	} else if scalingOp == "up" {
		period, err = strconv.ParseInt(util.GetEnv("AUTOSCALING_UP_PERIOD", "2"), 10, 64)
		if err != nil {
			period = 2 // fall back to the AUTOSCALING_UP_PERIOD default
		}
		interval, err = strconv.ParseInt(util.GetEnv("AUTOSCALING_UP_INTERVAL", "60"), 10, 64)
		if err != nil {
			interval = 60
		}
	} else {
		return 5, 60
	}
	return period, interval
}
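
// launchProcessPendingScalingOpWithLocking serializes scaling operations: it
// takes the per-direction mutex, runs launchProcessPendingScalingOp, and
// releases the lock when done.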
func (c *AutoscalingController) launchProcessPendingScalingOpWithLocking(clusterName, scalingOp string, registeredInstanceCpu, registeredInstanceMemory int64, s service.ServiceIf, cc ControllerIf, autoscaling ecs.AutoScalingIf) error {
	// lock scaling operation
	asAutoscalingControllerLogger.Debugf("Getting autoscaling lock for scaling %s", scalingOp)
	if scalingOp == "down" {
		c.muDown.Lock()
	} else {
		c.muUp.Lock()
	}
	// execute launchProcessPendingScalingOp
	err := c.launchProcessPendingScalingOp(clusterName, scalingOp, registeredInstanceCpu, registeredInstanceMemory, s, cc, autoscaling)
	// unlock
	asAutoscalingControllerLogger.Debugf("Releasing autoscaling lock for scaling %s", scalingOp)
	if scalingOp == "down" {
		c.muDown.Unlock()
	} else {
		c.muUp.Unlock()
	}
	if err != nil {
		asAutoscalingControllerLogger.Errorf("launchProcessPendingScalingOp error: %s", err)
		return err
	}
	return nil
}
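
// launchProcessPendingScalingOp re-checks a pending scaling decision every
// interval seconds for at most period iterations and, if the decision still
// holds, resizes the autoscaling group by one node.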
func (c *AutoscalingController) launchProcessPendingScalingOp(clusterName, scalingOp string, registeredInstanceCpu, registeredInstanceMemory int64, s service.ServiceIf, cc ControllerIf, autoscaling ecs.AutoScalingIf) error {
	var err error
	var dcNew *service.DynamoCluster
	var sizeChange int64

	e := &ecs.ECS{}

	if scalingOp == "up" {
		sizeChange = 1
	} else if scalingOp == "down" {
		sizeChange = -1
	} else {
		return errors.New("scaling operation " + scalingOp + " not recognized")
	}

	period, interval := c.getAutoscalingPeriodInterval(scalingOp)

	var abort, deployRunning, hasFreeResourcesGlobal, resourcesFit bool
	var i int64
	for i = 0; i < period && !abort; i++ {
		time.Sleep(time.Duration(interval) * time.Second)
		dcNew, err = c.getClusterInfo(clusterName, true, s, e)
		if err != nil {
			return err
		}
		memoryNeeded, cpuNeeded, err := c.getResourcesNeeded(clusterName, cc)
		if err != nil {
			return err
		}
		// check if scaling operation is still present
		if dcNew.ScalingOperation.PendingAction != scalingOp {
			asAutoscalingControllerLogger.Infof("Abort scaling operation: scaling %s not found anymore in dynamodb (scalingOp in db: %s)", scalingOp, dcNew.ScalingOperation.PendingAction)
			abort = true
		}
		// pending scaling down logic
		if scalingOp == "down" {
			// make scaling decision
			hasFreeResourcesGlobal = c.scaleDownDecision(clusterName, dcNew.ContainerInstances, registeredInstanceCpu, registeredInstanceMemory, cpuNeeded, memoryNeeded)
			if hasFreeResourcesGlobal {
				// abort if deploy is running
				deployRunning, err = s.IsDeployRunning()
				if err != nil {
					return err
				}
				if deployRunning {
					abort = true
				}
				// abort if not all services are scheduled
				if !c.areAllTasksRunningInCluster(clusterName, cc) {
					abort = true
				}
			} else {
				abort = true
			}
		} else {
			// pending scaling up logic
			resourcesFit = c.scaleUpDecision(clusterName, dcNew.ContainerInstances, cpuNeeded, memoryNeeded)
			if resourcesFit {
				abort = true
			}
		}
	}

	if !abort {
		asAutoscalingControllerLogger.Infof("Scaling operation: scaling %s now (%d)", scalingOp, sizeChange)
		autoScalingGroupName, err := autoscaling.GetAutoScalingGroupByTag(clusterName)
		if err != nil {
			return err
		}
		err = autoscaling.ScaleClusterNodes(autoScalingGroupName, sizeChange)
		if err != nil {
			return err
		}
		_, err = s.PutClusterInfo(*dcNew, clusterName, scalingOp, "")
		if err != nil {
			return err
		}
	} else {
		asAutoscalingControllerLogger.Infof("Scaling operation: scaling %s aborted. deploy running: %v, free resources (scaling down): %v, resources fit (scaling up): %v, pendingAction: %s", scalingOp, deployRunning, hasFreeResourcesGlobal, resourcesFit, dcNew.ScalingOperation.PendingAction)
	}
	return nil
}
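
// areAllTasksRunningInCluster reports whether every service in the cluster
// has as many running tasks as desired and none pending.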
func (c *AutoscalingController) areAllTasksRunningInCluster(clusterName string, cc ControllerIf) bool {
	services, err := cc.describeServices()
	if err != nil {
		asAutoscalingControllerLogger.Errorf("Error while executing describeServices: %s", err)
		return false
	}
	for _, service := range services {
		if service.RunningCount != service.DesiredCount || service.PendingCount != 0 {
			asAutoscalingControllerLogger.Infof("Not all tasks are running in the cluster: Service: %s, RunningCount: %d, DesiredCount: %d, PendingCount: %d", service.ServiceName, service.RunningCount, service.DesiredCount, service.PendingCount)
			return false
		}
	}
	return true
}
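
// scaleUpDecision reports whether the largest container (cpuNeeded,
// memoryNeeded) still fits on a non-draining instance in every availability
// zone; false signals that a scale-up is needed.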
func (c *AutoscalingController) scaleUpDecision(clusterName string, containerInstances []service.DynamoClusterContainerInstance, cpuNeeded, memoryNeeded int64) bool {
	resourcesFit := make(map[string]bool)
	resourcesFitGlobal := true
	for _, dcci := range containerInstances {
		if clusterName == dcci.ClusterName {
			if dcci.Status != "DRAINING" && dcci.FreeCpu > cpuNeeded && dcci.FreeMemory > memoryNeeded {
				resourcesFit[dcci.AvailabilityZone] = true
				asAutoscalingControllerLogger.Debugf("Cluster %v needs at least %v cpu and %v memory. Found instance %v (%v) with %v cpu and %v memory",
					clusterName,
					cpuNeeded,
					memoryNeeded,
					dcci.ContainerInstanceId,
					dcci.AvailabilityZone,
					dcci.FreeCpu,
					dcci.FreeMemory,
				)
			} else {
				// set resourcesFit[az] in case it's not set to true
				if _, ok := resourcesFit[dcci.AvailabilityZone]; !ok {
					resourcesFit[dcci.AvailabilityZone] = false
				}
			}
		}
	}
	for k, v := range resourcesFit {
		if !v {
			resourcesFitGlobal = false
			asAutoscalingControllerLogger.Infof("No instance found in %v with %v cpu and %v memory free", k, cpuNeeded, memoryNeeded)
		}
	}
	return resourcesFitGlobal
}
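
// scaleDownDecision reports whether a node can be removed. With the
// LargestContainerUp strategy, every availability zone must keep a full
// instance's worth of capacity free plus the largest container and a 50%
// buffer of it (e.g. instanceMemory=4096 and memoryNeeded=512 gives
// clusterMemoryNeeded = 4096 + 512 + 256 = 4864), so removing a node can't
// immediately trigger a scale-up; without it, a single zone with a full
// instance's worth of spare capacity is enough.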
func (c *AutoscalingController) scaleDownDecision(clusterName string, containerInstances []service.DynamoClusterContainerInstance, instanceCpu, instanceMemory, cpuNeeded, memoryNeeded int64) bool {
	var clusterMemoryNeeded = instanceMemory + memoryNeeded            // capacity of full container node + biggest task
	clusterMemoryNeeded += int64(math.Ceil(float64(memoryNeeded) / 2)) // + buffer
	var clusterCpuNeeded = instanceCpu + cpuNeeded
	clusterCpuNeeded += int64(math.Ceil(float64(cpuNeeded) / 2)) // + buffer
	totalFreeCpu := make(map[string]int64)
	totalFreeMemory := make(map[string]int64)
	hasFreeResources := make(map[string]bool)
	hasFreeResourcesGlobal := true
	for _, dcci := range containerInstances {
		if clusterName == dcci.ClusterName {
			if dcci.Status != "DRAINING" {
				totalFreeCpu[dcci.AvailabilityZone] += dcci.FreeCpu
				totalFreeMemory[dcci.AvailabilityZone] += dcci.FreeMemory
			}
		}
	}
	asStrategyLargestContainerUp, _ := c.getAutoscalingStrategy()
	if !asStrategyLargestContainerUp { // if we're not using the LargestContainerUp strategy, scale down only when there's a full instance size of extra resources
		clusterMemoryNeeded = instanceMemory
		clusterCpuNeeded = instanceCpu
	}
	for k := range totalFreeCpu {
		asAutoscalingControllerLogger.Debugf("%v: Have %d cpu available, need %d", k, totalFreeCpu[k], clusterCpuNeeded)
		asAutoscalingControllerLogger.Debugf("%v: Have %d memory available, need %d", k, totalFreeMemory[k], clusterMemoryNeeded)
		if totalFreeCpu[k] >= clusterCpuNeeded && totalFreeMemory[k] >= clusterMemoryNeeded {
			hasFreeResources[k] = true
		} else {
			// set hasFreeResources[k] in case the map key hasn't been set to true
			if _, ok := hasFreeResources[k]; !ok {
				hasFreeResources[k] = false
			}
		}
	}
	if asStrategyLargestContainerUp {
		// when using LargestContainerUp, only scale down when all AZs have excess capacity; otherwise a scale-up would immediately be triggered
		for k, v := range hasFreeResources {
			asAutoscalingControllerLogger.Debugf("%v has free resources: %v", k, v)
			if !v {
				hasFreeResourcesGlobal = false
			}
		}
	} else {
		// when not using LargestContainerUp, scale down as soon as any AZ has a full instance's worth of spare resources
		foundTrue := false
		for k, v := range hasFreeResources {
			asAutoscalingControllerLogger.Debugf("%v has free resources: %v", k, v)
			if v {
				foundTrue = true
			}
		}
		if !foundTrue {
			hasFreeResourcesGlobal = false
		}
	}

	return hasFreeResourcesGlobal
}
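
// processLifecycleMessage handles an autoscaling lifecycle hook: it drains
// the terminating container instance, marks it DRAINING in the backend
// store, and waits asynchronously for the node to finish draining.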
func (c *AutoscalingController) processLifecycleMessage(message ecs.SNSPayloadLifecycle) error {
	e := ecs.ECS{}
	clusterName, err := e.GetClusterNameByInstanceId(message.Detail.EC2InstanceId)
	if err != nil {
		return err
	}
	containerInstanceArn, err := e.GetContainerInstanceArnByInstanceId(clusterName, message.Detail.EC2InstanceId)
	if err != nil {
		return err
	}
	err = e.DrainNode(clusterName, containerInstanceArn)
	if err != nil {
		return err
	}
	s := service.NewService()
	dc, err := s.GetClusterInfo()
	if err != nil {
		return err
	}
	// write new record to switch container instance to draining
	var writeRecord bool
	if dc != nil {
		for i, dcci := range dc.ContainerInstances {
			if clusterName == dcci.ClusterName && message.Detail.EC2InstanceId == dcci.ContainerInstanceId {
				dc.ContainerInstances[i].Status = "DRAINING"
				writeRecord = true
			}
		}
	}
	if writeRecord {
		s.PutClusterInfo(*dc, clusterName, "no", "")
	}
	// monitor drained node
	go e.LaunchWaitForDrainedNode(clusterName, containerInstanceArn, message.Detail.EC2InstanceId, message.Detail.AutoScalingGroupName, message.Detail.LifecycleHookName, message.Detail.LifecycleActionToken)
	return nil
}

// startAutoscalingPollingStrategy polls all services for unschedulable tasks
// and scales the cluster up when tasks cannot be placed; a lock in the
// backend store ensures only one ecs-deploy instance does the checking.
func (c *AutoscalingController) startAutoscalingPollingStrategy() {
	e := ecs.ECS{}
	s := service.NewService()
	showEvents := true
	showTasks := false
	showStoppedTasks := false
	lastChecked := time.Now().Add(-1 * time.Minute)
	servicesFound := make(map[string]int)
	// init
	err := s.AutoscalingPullInit()
	if err != nil {
		asAutoscalingControllerLogger.Errorf("couldn't initialize autoscalingpull in backend: %v", err)
	}
	localId, err := c.getLocalId()
	if err != nil {
		asAutoscalingControllerLogger.Errorf("Error while getting localId: %v", err)
	}
	asAutoscalingControllerLogger.Infof("ecs-deploy local ID: %v", localId)
	for {
		// only 1 process should do the checking, lock row in dynamodb
		lock, err := s.AutoscalingPullAcquireLock(localId)
		if err != nil {
			asAutoscalingControllerLogger.Errorf("Error while setting lock for pullautoscaling: %v", err)
		}
		if lock {
			services := make(map[string][]*string)
			// get services
			var dss service.DynamoServices
			err := s.GetServices(&dss)
			if err != nil {
				asAutoscalingControllerLogger.Errorf("couldn't get services from backend: %v", err)
			}
			// describe services
			for _, ds := range dss.Services {
				ds := ds // copy: the pointer below must not alias the loop variable
				services[ds.C] = append(services[ds.C], &ds.S)
			}
			for clusterName, serviceList := range services {
				rss, err := e.DescribeServicesWithOptions(clusterName, serviceList, showEvents, showTasks, showStoppedTasks, map[string]string{"sleep": "1"})
				if err != nil {
					asAutoscalingControllerLogger.Errorf("Error occurred during describe services: %v", err)
				}
				for _, rs := range rss {
					if c.checkForUnschedulableServices(rs) {
						scaled := false
						if servicesFound[clusterName+":"+rs.ServiceName] < 6 {
							servicesFound[clusterName+":"+rs.ServiceName]++
						}
						asAutoscalingControllerLogger.Debugf("Checking service %v for unschedulable tasks where desired count > running count (count: %d)", rs.ServiceName, servicesFound[clusterName+":"+rs.ServiceName])
						for _, event := range rs.Events {
							if event.CreatedAt.After(lastChecked) {
								scaled = c.scaleWhenUnschedulableMessage(clusterName, event.Message)
							}
						}
						if len(rs.Events) > 0 && servicesFound[clusterName+":"+rs.ServiceName] == 5 {
							scaled = c.scaleWhenUnschedulableMessage(clusterName, rs.Events[0].Message)
						}
						if scaled {
							servicesFound[clusterName+":"+rs.ServiceName] = 0
							// write record in dynamodb
							dc, err := s.GetClusterInfo()
							if err != nil || dc == nil {
								asAutoscalingControllerLogger.Debugf("Error while doing GetClusterInfo: %v", err)
							} else {
								_, err = s.PutClusterInfo(*dc, clusterName, "up", "")
								if err != nil {
									asAutoscalingControllerLogger.Debugf("Error while doing PutClusterInfo: %v", err)
								}
							}
						}
					}
				}
			}
			lastChecked = time.Now()
		}
		time.Sleep(60 * time.Second)
	}
}
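
// checkForUnschedulableServices reports whether a service, or one of its
// deployments, has fewer running tasks than desired.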
func (c *AutoscalingController) checkForUnschedulableServices(rs service.RunningService) bool {
	if rs.DesiredCount > rs.RunningCount {
		return true
	}
	for _, deployment := range rs.Deployments {
		if deployment.DesiredCount > deployment.RunningCount {
			return true
		}
	}
	return false
}
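
// scaleWhenUnschedulableMessage scales the cluster up by one node when an
// ECS service event says a task couldn't be placed due to insufficient
// resources; it reports whether the message matched.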
func (c *AutoscalingController) scaleWhenUnschedulableMessage(clusterName, message string) bool {
	if strings.Contains(message, "was unable to place a task because no container instance met all of its requirements") && strings.Contains(message, "has insufficient") {
		autoscaling := ecs.AutoScaling{}
		asAutoscalingControllerLogger.Infof("Scaling operation: scaling up now")
		autoScalingGroupName, err := autoscaling.GetAutoScalingGroupByTag(clusterName)
		if err != nil {
			asAutoscalingControllerLogger.Errorf("Error: %v", err)
		} else {
			err = autoscaling.ScaleClusterNodes(autoScalingGroupName, 1)
			if err != nil {
				asAutoscalingControllerLogger.Errorf("Error: %v", err)
			}
		}
		return true
	}
	return false
}
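
// getLocalId identifies this ecs-deploy instance: the ECS task ID when
// running on ECS, otherwise a random fallback ID.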
func (c *AutoscalingController) getLocalId() (string, error) {
	ret := "ecs-deploy-" + util.RandStringBytesMaskImprSrc(8)
	var task ecs.EcsTaskMetadata
	url := util.GetEnv("ECS_CONTAINER_METADATA_URI", "") + "/task"
	timeout := 10 * time.Second
	client := http.Client{
		Timeout: timeout,
	}
	resp, err := client.Get(url)
	if err != nil {
		return ret, err
	}
	defer resp.Body.Close()
	contents, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return ret, err
	}
	err = json.Unmarshal(contents, &task)
	if err != nil {
		return ret, err
	}
	split := strings.Split(task.TaskARN, "task/")
	if len(split) != 2 {
		return ret, errors.New("could not parse task ARN: " + task.TaskARN)
	}
	return split[1], nil
}