bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/cmd/scollector/collectors/aws.go (about)

     1  package collectors
     2  
     3  import (
     4  	"fmt"
     5  	"time"
     6  
     7  	"bosun.org/metadata"
     8  	"bosun.org/opentsdb"
     9  	"bosun.org/slog"
    10  	"github.com/aws/aws-sdk-go/aws"
    11  	"github.com/aws/aws-sdk-go/aws/credentials"
    12  	"github.com/aws/aws-sdk-go/aws/session"
    13  	"github.com/aws/aws-sdk-go/service/cloudwatch"
    14  	"github.com/aws/aws-sdk-go/service/ec2"
    15  	"github.com/aws/aws-sdk-go/service/elb"
    16  )
    17  
// Metric names (aws*) and human-readable metric descriptions (descAWS*) that
// the AWS collector passes to AddTS.
//
// NOTE(review): descAWSELBHostCount is attached to both the healthy and the
// unhealthy host-count metrics although its text only mentions the healthy
// state; the ELB latency query in this file covers 4000 seconds while
// descAWSELBLatency speaks of five minutes — confirm whether the wording
// should be adjusted.
const (
	awsCPU                = "aws.ec2.cpu"
	awsEC2DiskBytes       = "aws.ec2.disk.bytes"
	awsEC2DiskOps         = "aws.ec2.disk.ops"
	awsELBHostsHealthy    = "aws.elb.hosts.healthy"
	awsELBHostsUnHealthy  = "aws.elb.hosts.unhealthy"
	awsELBLatencyAvg      = "aws.elb.latency.average"
	awsELBLatencyMax      = "aws.elb.latency.maximum"
	awsELBLatencyMin      = "aws.elb.latency.minimum"
	awsNetwork            = "aws.ec2.net.bytes"
	awsStatusCheckFailed  = "aws.ec2.status.failed"
	descAWSEC2CPU         = "The average CPU Utilization, gathered at a 60 second interval and averaged over five minutes."
	descAWSEC2DiskBytes   = "The average bytes written or read via disk, gathered at a 60 second interval and averaged over five minutes."
	descAWSEC2DiskOps     = "The average disk operations, either written or read, gathered at a 60 second interval and averaged over five minutes."
	descAWSEC2NetBytes    = "The average bytes transmitted or received via network, gathered at a 60 second interval and averaged over five minutes."
	descAWSEC2StatusCheck = "The EC2 Status Check, which includes both instance-level and system-level drill-down, gathered every 60 seconds."
	descAWSELBHostCount   = "The number of instances in what the Elastic Load Balancer considers a healthy state, gathered every 60 seconds."
	descAWSELBLatency     = "The minimum, maximum and average latency as reported by the load balancer, gathered at a 60 second interval and averaged over five minutes."
)
    37  
// aws_period is the value used as Period in the CloudWatch
// GetMetricStatistics requests built in this file. It is a variable rather
// than a constant because the requests take its address (&aws_period).
// NOTE(review): presumably seconds, matching the "60 second interval" wording
// of the metric descriptions — confirm against the CloudWatch API reference.
var aws_period = int64(60)
    39  
    40  func AWS(accessKey, secretKey, region, productCodes, bucketName, bucketPath string, purgeDays int) error {
    41  	if accessKey == "" || secretKey == "" || region == "" {
    42  		return fmt.Errorf("empty AccessKey, SecretKey, or Region in AWS")
    43  	}
    44  	//mhenderson: There are some alerts in the aws collector that we don't want to output in the event that
    45  	//billing only is enabled, as you might enable billing without having any EC3 or ELB instances.
    46  	billingEnabled := bucketName != "" && bucketPath != ""
    47  	collectors = append(collectors, &IntervalCollector{
    48  		F: func() (opentsdb.MultiDataPoint, error) {
    49  			return c_aws(accessKey, secretKey, region, billingEnabled)
    50  		},
    51  		Interval: 60 * time.Second,
    52  		name:     fmt.Sprintf("aws-%s", region),
    53  	})
    54  
    55  	if billingEnabled {
    56  		collectors = append(collectors, &IntervalCollector{
    57  			F: func() (opentsdb.MultiDataPoint, error) {
    58  				return c_awsBilling(accessKey, secretKey, region, productCodes, bucketName, bucketPath, purgeDays)
    59  			},
    60  			Interval: 1 * time.Hour,
    61  			name:     fmt.Sprintf("awsBilling-%s", region),
    62  		})
    63  	}
    64  	return nil
    65  }
    66  
    67  func c_aws(accessKey, secretKey, region string, billingEnabled bool) (opentsdb.MultiDataPoint, error) {
    68  	var md opentsdb.MultiDataPoint
    69  	creds := credentials.NewStaticCredentials(accessKey, secretKey, "")
    70  	conf := &aws.Config{
    71  		Credentials: creds,
    72  		Region:      &region,
    73  	}
    74  	ecc := ec2.New(session.New(), conf)
    75  	if ecc == nil {
    76  		return nil, fmt.Errorf("unable to login to EC2")
    77  	}
    78  	elb := elb.New(session.New(), conf)
    79  	if elb == nil {
    80  		return nil, fmt.Errorf("unable to login to ELB")
    81  	}
    82  	cw := cloudwatch.New(session.New(), conf)
    83  	if cw == nil {
    84  		return nil, fmt.Errorf("unable to login to CloudWatch")
    85  	}
    86  	instances, err := awsGetInstances(*ecc)
    87  	if err != nil && !billingEnabled {
    88  		slog.Warning("No EC2 Instances found.")
    89  	}
    90  	loadBalancers, err := awsGetLoadBalancers(*elb)
    91  	if err != nil && !billingEnabled {
    92  		slog.Warning("No ELB Load Balancers found.")
    93  	}
    94  	for _, loadBalancer := range loadBalancers {
    95  		awsGetELBLatency(*cw, &md, loadBalancer)
    96  		awsGetELBHostCounts(*cw, &md, loadBalancer)
    97  	}
    98  	for _, instance := range instances {
    99  		awsGetCPU(*cw, &md, instance)
   100  		awsGetNetwork(*cw, &md, instance)
   101  		awsGetDiskBytes(*cw, &md, instance)
   102  		awsGetDiskOps(*cw, &md, instance)
   103  		awsGetStatusChecks(*cw, &md, instance)
   104  	}
   105  	return md, nil
   106  }
   107  
   108  func awsGetInstances(ecc ec2.EC2) ([]*ec2.Instance, error) {
   109  	instancelist := []*ec2.Instance{}
   110  	resp, err := ecc.DescribeInstances(nil)
   111  	if err != nil {
   112  		return nil, fmt.Errorf("unable to describe EC2 Instances")
   113  	}
   114  	for _, reservation := range resp.Reservations {
   115  		for _, instance := range reservation.Instances {
   116  			instancelist = append(instancelist, instance)
   117  		}
   118  	}
   119  	return instancelist, nil
   120  }
   121  
   122  func awsGetLoadBalancers(lb elb.ELB) ([]*elb.LoadBalancerDescription, error) {
   123  	lbList := []*elb.LoadBalancerDescription{}
   124  	resp, err := lb.DescribeLoadBalancers(nil)
   125  	if err != nil {
   126  		return nil, fmt.Errorf("unable to describe ELB Balancers")
   127  	}
   128  	for _, loadBalancer := range resp.LoadBalancerDescriptions {
   129  		lbList = append(lbList, loadBalancer)
   130  	}
   131  	return lbList, nil
   132  }
   133  
   134  func awsGetCPU(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error {
   135  	search := cloudwatch.GetMetricStatisticsInput{
   136  		StartTime:  aws.Time(time.Now().UTC().Add(time.Second * -600)),
   137  		EndTime:    aws.Time(time.Now().UTC()),
   138  		MetricName: aws.String("CPUUtilization"),
   139  		Period:     &aws_period,
   140  		Statistics: []*string{aws.String("Average")},
   141  		Namespace:  aws.String("AWS/EC2"),
   142  		Unit:       aws.String("Percent"),
   143  		Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}},
   144  	}
   145  	resp, err := cw.GetMetricStatistics(&search)
   146  	if err != nil {
   147  		return err
   148  	}
   149  	tags := opentsdb.TagSet{
   150  		"instance": *instance.InstanceId,
   151  	}
   152  	for _, datapoint := range resp.Datapoints {
   153  		AddTS(md, awsCPU, datapoint.Timestamp.Unix(), *datapoint.Average, tags, metadata.Gauge, metadata.Pct, descAWSEC2CPU)
   154  	}
   155  	return nil
   156  }
   157  func awsGetNetwork(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error {
   158  	search := cloudwatch.GetMetricStatisticsInput{
   159  		StartTime:  aws.Time(time.Now().UTC().Add(time.Second * -600)),
   160  		EndTime:    aws.Time(time.Now().UTC()),
   161  		MetricName: aws.String("NetworkIn"),
   162  		Period:     &aws_period,
   163  		Statistics: []*string{aws.String("Average")},
   164  		Namespace:  aws.String("AWS/EC2"),
   165  		Unit:       aws.String("Bytes"),
   166  		Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}},
   167  	}
   168  	resp, err := cw.GetMetricStatistics(&search)
   169  	if err != nil {
   170  		return err
   171  	}
   172  	for _, datapoint := range resp.Datapoints {
   173  		AddTS(md, awsNetwork, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "direction": "in"}, metadata.Gauge, metadata.Bytes, descAWSEC2NetBytes)
   174  	}
   175  	search.MetricName = aws.String("NetworkOut")
   176  	resp, err = cw.GetMetricStatistics(&search)
   177  	if err != nil {
   178  		return err
   179  	}
   180  	for _, datapoint := range resp.Datapoints {
   181  		AddTS(md, awsNetwork, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "direction": "out"}, metadata.Gauge, metadata.Bytes, descAWSEC2NetBytes)
   182  	}
   183  	return nil
   184  }
   185  
   186  func awsGetDiskBytes(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error {
   187  	search := cloudwatch.GetMetricStatisticsInput{
   188  		StartTime:  aws.Time(time.Now().UTC().Add(time.Second * -600)),
   189  		EndTime:    aws.Time(time.Now().UTC()),
   190  		MetricName: aws.String("DiskReadBytes"),
   191  		Period:     &aws_period,
   192  		Statistics: []*string{aws.String("Average")},
   193  		Namespace:  aws.String("AWS/EC2"),
   194  		Unit:       aws.String("Bytes"),
   195  		Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}},
   196  	}
   197  	resp, err := cw.GetMetricStatistics(&search)
   198  	if err != nil {
   199  		return err
   200  	}
   201  	for _, datapoint := range resp.Datapoints {
   202  		AddTS(md, awsEC2DiskBytes, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "read"}, metadata.Gauge, metadata.Bytes, descAWSEC2DiskBytes)
   203  	}
   204  	search.MetricName = aws.String("DiskWriteBytes")
   205  	resp, err = cw.GetMetricStatistics(&search)
   206  	if err != nil {
   207  		return err
   208  	}
   209  	for _, datapoint := range resp.Datapoints {
   210  		AddTS(md, awsEC2DiskBytes, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "write"}, metadata.Gauge, metadata.Bytes, descAWSEC2DiskBytes)
   211  	}
   212  	return nil
   213  }
   214  
   215  func awsGetDiskOps(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error {
   216  	search := cloudwatch.GetMetricStatisticsInput{
   217  		StartTime:  aws.Time(time.Now().UTC().Add(time.Second * -600)),
   218  		EndTime:    aws.Time(time.Now().UTC()),
   219  		MetricName: aws.String("DiskReadOps"),
   220  		Period:     &aws_period,
   221  		Statistics: []*string{aws.String("Average")},
   222  		Namespace:  aws.String("AWS/EC2"),
   223  		Unit:       aws.String("Count"),
   224  		Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}},
   225  	}
   226  	resp, err := cw.GetMetricStatistics(&search)
   227  	if err != nil {
   228  		return err
   229  	}
   230  	for _, datapoint := range resp.Datapoints {
   231  		AddTS(md, awsEC2DiskOps, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "read"}, metadata.Gauge, metadata.Count, descAWSEC2DiskOps)
   232  	}
   233  	search.MetricName = aws.String("DiskWriteOps")
   234  	resp, err = cw.GetMetricStatistics(&search)
   235  	if err != nil {
   236  		return err
   237  	}
   238  	for _, datapoint := range resp.Datapoints {
   239  		AddTS(md, awsEC2DiskOps, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "write"}, metadata.Gauge, metadata.Count, descAWSEC2DiskOps)
   240  	}
   241  	return nil
   242  }
   243  
   244  func awsGetStatusChecks(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error {
   245  	period := int64(60)
   246  	search := cloudwatch.GetMetricStatisticsInput{
   247  		StartTime:  aws.Time(time.Now().UTC().Add(time.Second * -60)),
   248  		EndTime:    aws.Time(time.Now().UTC()),
   249  		MetricName: aws.String("StatusCheckFailed"),
   250  		Period:     &period,
   251  		Statistics: []*string{aws.String("Average")},
   252  		Namespace:  aws.String("AWS/EC2"),
   253  		Unit:       aws.String("Count"),
   254  		Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}},
   255  	}
   256  	resp, err := cw.GetMetricStatistics(&search)
   257  	if err != nil {
   258  		return err
   259  	}
   260  	for _, datapoint := range resp.Datapoints {
   261  		AddTS(md, awsStatusCheckFailed, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId}, metadata.Gauge, metadata.Count, descAWSEC2StatusCheck)
   262  	}
   263  	search.MetricName = aws.String("StatusCheckFailed_Instance")
   264  	resp, err = cw.GetMetricStatistics(&search)
   265  	if err != nil {
   266  		return err
   267  	}
   268  	for _, datapoint := range resp.Datapoints {
   269  		AddTS(md, awsStatusCheckFailed, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "category": "instance"}, metadata.Gauge, metadata.Count, descAWSEC2StatusCheck)
   270  	}
   271  	search.MetricName = aws.String("StatusCheckFailed_System")
   272  	resp, err = cw.GetMetricStatistics(&search)
   273  	if err != nil {
   274  		return err
   275  	}
   276  	for _, datapoint := range resp.Datapoints {
   277  		AddTS(md, awsStatusCheckFailed, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "category": "system"}, metadata.Gauge, metadata.Count, descAWSEC2StatusCheck)
   278  	}
   279  	return nil
   280  }
   281  
   282  func awsGetELBLatency(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, loadBalancer *elb.LoadBalancerDescription) error {
   283  	search := cloudwatch.GetMetricStatisticsInput{
   284  		StartTime:  aws.Time(time.Now().UTC().Add(time.Second * -4000)),
   285  		EndTime:    aws.Time(time.Now().UTC()),
   286  		MetricName: aws.String("Latency"),
   287  		Period:     &aws_period,
   288  		Statistics: []*string{aws.String("Average"), aws.String("Minimum"), aws.String("Maximum")},
   289  		Namespace:  aws.String("AWS/ELB"),
   290  		Unit:       aws.String("Seconds"),
   291  		Dimensions: []*cloudwatch.Dimension{{Name: aws.String("LoadBalancerName"), Value: loadBalancer.LoadBalancerName}},
   292  	}
   293  	resp, err := cw.GetMetricStatistics(&search)
   294  	if err != nil {
   295  		return err
   296  	}
   297  	for _, datapoint := range resp.Datapoints {
   298  		AddTS(md, awsELBLatencyMin, datapoint.Timestamp.Unix(), *datapoint.Minimum, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Second, descAWSELBLatency)
   299  		AddTS(md, awsELBLatencyMax, datapoint.Timestamp.Unix(), *datapoint.Maximum, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Second, descAWSELBLatency)
   300  		AddTS(md, awsELBLatencyAvg, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Second, descAWSELBLatency)
   301  	}
   302  	return nil
   303  }
   304  func awsGetELBHostCounts(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, loadBalancer *elb.LoadBalancerDescription) error {
   305  	search := cloudwatch.GetMetricStatisticsInput{
   306  		StartTime:  aws.Time(time.Now().UTC().Add(time.Second * -60)),
   307  		EndTime:    aws.Time(time.Now().UTC()),
   308  		MetricName: aws.String("HealthyHostCount"),
   309  		Period:     &aws_period,
   310  		Statistics: []*string{aws.String("Average")},
   311  		Namespace:  aws.String("AWS/ELB"),
   312  		Unit:       aws.String("Count"),
   313  		Dimensions: []*cloudwatch.Dimension{{Name: aws.String("LoadBalancerName"), Value: loadBalancer.LoadBalancerName}},
   314  	}
   315  	resp, err := cw.GetMetricStatistics(&search)
   316  	if err != nil {
   317  		return err
   318  	}
   319  	for _, datapoint := range resp.Datapoints {
   320  		AddTS(md, awsELBHostsHealthy, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Count, descAWSELBHostCount)
   321  	}
   322  	search.MetricName = aws.String("UnhealthyHostCount")
   323  	resp, err = cw.GetMetricStatistics(&search)
   324  	if err != nil {
   325  		return err
   326  	}
   327  	if resp.Datapoints == nil {
   328  		AddTS(md, awsELBHostsUnHealthy, time.Now().UTC().Unix(), 0, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Count, descAWSELBHostCount)
   329  	} else {
   330  		for _, datapoint := range resp.Datapoints {
   331  			AddTS(md, awsELBHostsUnHealthy, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Count, descAWSELBHostCount)
   332  		}
   333  	}
   334  	return nil
   335  }