bosun.org@v0.0.0-20250213104149-b8d3e981f37d/cmd/scollector/collectors/aws.go (about)

     1  package collectors
     2  
     3  import (
     4  	"fmt"
     5  	"time"
     6  
     7  	"bosun.org/metadata"
     8  	"bosun.org/opentsdb"
     9  	"bosun.org/slog"
    10  	"github.com/aws/aws-sdk-go/aws"
    11  	"github.com/aws/aws-sdk-go/aws/credentials"
    12  	"github.com/aws/aws-sdk-go/aws/session"
    13  	"github.com/aws/aws-sdk-go/service/cloudwatch"
    14  	"github.com/aws/aws-sdk-go/service/ec2"
    15  	"github.com/aws/aws-sdk-go/service/elb"
    16  )
    17  
// Metric names (aws*) and metric descriptions (desc*) used by the AWS
// collector functions in this file when they call AddTS.
const (
	// Metric names.
	awsCPU                = "aws.ec2.cpu"
	awsEC2DiskBytes       = "aws.ec2.disk.bytes"
	awsEC2DiskOps         = "aws.ec2.disk.ops"
	awsELBHostsHealthy    = "aws.elb.hosts.healthy"
	awsELBHostsUnHealthy  = "aws.elb.hosts.unhealthy"
	awsELBLatencyAvg      = "aws.elb.latency.average"
	awsELBLatencyMax      = "aws.elb.latency.maximum"
	awsELBLatencyMin      = "aws.elb.latency.minimum"
	awsNetwork            = "aws.ec2.net.bytes"
	awsStatusCheckFailed  = "aws.ec2.status.failed"
	// Metric descriptions.
	descAWSEC2CPU         = "The average CPU Utilization, gathered at a 60 second interval and averaged over five minutes."
	descAWSEC2DiskBytes   = "The average bytes written or read via disk, gathered at a 60 second interval and averaged over five minutes."
	descAWSEC2DiskOps     = "The average disk operations, either written or read, gathered at a 60 second interval and averaged over five minutes."
	descAWSEC2NetBytes    = "The average bytes transmitted or received via network, gathered at a 60 second interval and averaged over five minutes."
	descAWSEC2StatusCheck = "The EC2 Status Check, which includes both instance-level and system-level drill-down, gathered every 60 seconds."
	// descAWSELBHostCount is attached to both awsELBHostsHealthy and
	// awsELBHostsUnHealthy, although its wording only mentions healthy hosts.
	descAWSELBHostCount   = "The number of instances in what the Elastic Load Balancer considers a healthy state, gathered every 60 seconds."
	// descAWSELBLatency is shared by the minimum, maximum and average latency metrics.
	descAWSELBLatency     = "The minimum, maximum and average latency as reported by the load balancer, gathered at a 60 second interval and averaged over five minutes."
)
    37  
// aws_period holds the value (60) whose address is passed as the Period field
// of the GetMetricStatisticsInput requests built in this file. It is a
// variable rather than a constant because Period takes a pointer.
// awsGetStatusChecks uses its own local copy of the same value instead.
var aws_period = int64(60)
    39  
    40  func AWS(accessKey, secretKey, region, productCodes, bucketName, bucketPath string, purgeDays int) error {
    41  	if accessKey == "" || secretKey == "" || region == "" {
    42  		return fmt.Errorf("empty AccessKey, SecretKey, or Region in AWS")
    43  	}
    44  	//mhenderson: There are some alerts in the aws collector that we don't want to output in the event that
    45  	//billing only is enabled, as you might enable billing without having any EC3 or ELB instances.
    46  	billingEnabled := bucketName != "" && bucketPath != ""
    47  	collectors = append(collectors, &IntervalCollector{
    48  		F: func() (opentsdb.MultiDataPoint, error) {
    49  			return c_aws(accessKey, secretKey, region, billingEnabled)
    50  		},
    51  		Interval: 60 * time.Second,
    52  		name:     fmt.Sprintf("aws-%s", region),
    53  	})
    54  
    55  	if billingEnabled {
    56  		collectors = append(collectors, &IntervalCollector{
    57  			F: func() (opentsdb.MultiDataPoint, error) {
    58  				return c_awsBilling(accessKey, secretKey, region, productCodes, bucketName, bucketPath, purgeDays)
    59  			},
    60  			Interval: 1 * time.Hour,
    61  			name:     fmt.Sprintf("awsBilling-%s", region),
    62  		})
    63  	}
    64  	return nil
    65  }
    66  
    67  func c_aws(accessKey, secretKey, region string, billingEnabled bool) (opentsdb.MultiDataPoint, error) {
    68  	var md opentsdb.MultiDataPoint
    69  	creds := credentials.NewStaticCredentials(accessKey, secretKey, "")
    70  	conf := &aws.Config{
    71  		Credentials: creds,
    72  		Region:      &region,
    73  	}
    74  	ecc := ec2.New(session.New(), conf)
    75  	if ecc == nil {
    76  		return nil, fmt.Errorf("unable to login to EC2")
    77  	}
    78  	elb := elb.New(session.New(), conf)
    79  	if elb == nil {
    80  		return nil, fmt.Errorf("unable to login to ELB")
    81  	}
    82  	cw := cloudwatch.New(session.New(), conf)
    83  	if cw == nil {
    84  		return nil, fmt.Errorf("unable to login to CloudWatch")
    85  	}
    86  	instances, err := awsGetInstances(*ecc)
    87  	if err != nil && !billingEnabled {
    88  		slog.Warning("No EC2 Instances found.")
    89  	}
    90  	loadBalancers, err := awsGetLoadBalancers(*elb)
    91  	if err != nil && !billingEnabled {
    92  		slog.Warning("No ELB Load Balancers found.")
    93  	}
    94  	for _, loadBalancer := range loadBalancers {
    95  		awsGetELBLatency(*cw, &md, loadBalancer)
    96  		awsGetELBHostCounts(*cw, &md, loadBalancer)
    97  	}
    98  	for _, instance := range instances {
    99  		awsGetCPU(*cw, &md, instance)
   100  		awsGetNetwork(*cw, &md, instance)
   101  		awsGetDiskBytes(*cw, &md, instance)
   102  		awsGetDiskOps(*cw, &md, instance)
   103  		awsGetStatusChecks(*cw, &md, instance)
   104  	}
   105  	return md, nil
   106  }
   107  
   108  func awsGetInstances(ecc ec2.EC2) ([]*ec2.Instance, error) {
   109  	instancelist := []*ec2.Instance{}
   110  	resp, err := ecc.DescribeInstances(nil)
   111  	if err != nil {
   112  		return nil, fmt.Errorf("unable to describe EC2 Instances")
   113  	}
   114  	for _, reservation := range resp.Reservations {
   115  		instancelist = append(instancelist, reservation.Instances...)
   116  	}
   117  	return instancelist, nil
   118  }
   119  
   120  func awsGetLoadBalancers(lb elb.ELB) ([]*elb.LoadBalancerDescription, error) {
   121  	lbList := []*elb.LoadBalancerDescription{}
   122  	resp, err := lb.DescribeLoadBalancers(nil)
   123  	if err != nil {
   124  		return nil, fmt.Errorf("unable to describe ELB Balancers")
   125  	}
   126  	lbList = append(lbList, resp.LoadBalancerDescriptions...)
   127  	return lbList, nil
   128  }
   129  
   130  func awsGetCPU(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error {
   131  	search := cloudwatch.GetMetricStatisticsInput{
   132  		StartTime:  aws.Time(time.Now().UTC().Add(time.Second * -600)),
   133  		EndTime:    aws.Time(time.Now().UTC()),
   134  		MetricName: aws.String("CPUUtilization"),
   135  		Period:     &aws_period,
   136  		Statistics: []*string{aws.String("Average")},
   137  		Namespace:  aws.String("AWS/EC2"),
   138  		Unit:       aws.String("Percent"),
   139  		Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}},
   140  	}
   141  	resp, err := cw.GetMetricStatistics(&search)
   142  	if err != nil {
   143  		return err
   144  	}
   145  	tags := opentsdb.TagSet{
   146  		"instance": *instance.InstanceId,
   147  	}
   148  	for _, datapoint := range resp.Datapoints {
   149  		AddTS(md, awsCPU, datapoint.Timestamp.Unix(), *datapoint.Average, tags, metadata.Gauge, metadata.Pct, descAWSEC2CPU)
   150  	}
   151  	return nil
   152  }
   153  func awsGetNetwork(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error {
   154  	search := cloudwatch.GetMetricStatisticsInput{
   155  		StartTime:  aws.Time(time.Now().UTC().Add(time.Second * -600)),
   156  		EndTime:    aws.Time(time.Now().UTC()),
   157  		MetricName: aws.String("NetworkIn"),
   158  		Period:     &aws_period,
   159  		Statistics: []*string{aws.String("Average")},
   160  		Namespace:  aws.String("AWS/EC2"),
   161  		Unit:       aws.String("Bytes"),
   162  		Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}},
   163  	}
   164  	resp, err := cw.GetMetricStatistics(&search)
   165  	if err != nil {
   166  		return err
   167  	}
   168  	for _, datapoint := range resp.Datapoints {
   169  		AddTS(md, awsNetwork, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "direction": "in"}, metadata.Gauge, metadata.Bytes, descAWSEC2NetBytes)
   170  	}
   171  	search.MetricName = aws.String("NetworkOut")
   172  	resp, err = cw.GetMetricStatistics(&search)
   173  	if err != nil {
   174  		return err
   175  	}
   176  	for _, datapoint := range resp.Datapoints {
   177  		AddTS(md, awsNetwork, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "direction": "out"}, metadata.Gauge, metadata.Bytes, descAWSEC2NetBytes)
   178  	}
   179  	return nil
   180  }
   181  
   182  func awsGetDiskBytes(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error {
   183  	search := cloudwatch.GetMetricStatisticsInput{
   184  		StartTime:  aws.Time(time.Now().UTC().Add(time.Second * -600)),
   185  		EndTime:    aws.Time(time.Now().UTC()),
   186  		MetricName: aws.String("DiskReadBytes"),
   187  		Period:     &aws_period,
   188  		Statistics: []*string{aws.String("Average")},
   189  		Namespace:  aws.String("AWS/EC2"),
   190  		Unit:       aws.String("Bytes"),
   191  		Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}},
   192  	}
   193  	resp, err := cw.GetMetricStatistics(&search)
   194  	if err != nil {
   195  		return err
   196  	}
   197  	for _, datapoint := range resp.Datapoints {
   198  		AddTS(md, awsEC2DiskBytes, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "read"}, metadata.Gauge, metadata.Bytes, descAWSEC2DiskBytes)
   199  	}
   200  	search.MetricName = aws.String("DiskWriteBytes")
   201  	resp, err = cw.GetMetricStatistics(&search)
   202  	if err != nil {
   203  		return err
   204  	}
   205  	for _, datapoint := range resp.Datapoints {
   206  		AddTS(md, awsEC2DiskBytes, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "write"}, metadata.Gauge, metadata.Bytes, descAWSEC2DiskBytes)
   207  	}
   208  	return nil
   209  }
   210  
   211  func awsGetDiskOps(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error {
   212  	search := cloudwatch.GetMetricStatisticsInput{
   213  		StartTime:  aws.Time(time.Now().UTC().Add(time.Second * -600)),
   214  		EndTime:    aws.Time(time.Now().UTC()),
   215  		MetricName: aws.String("DiskReadOps"),
   216  		Period:     &aws_period,
   217  		Statistics: []*string{aws.String("Average")},
   218  		Namespace:  aws.String("AWS/EC2"),
   219  		Unit:       aws.String("Count"),
   220  		Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}},
   221  	}
   222  	resp, err := cw.GetMetricStatistics(&search)
   223  	if err != nil {
   224  		return err
   225  	}
   226  	for _, datapoint := range resp.Datapoints {
   227  		AddTS(md, awsEC2DiskOps, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "read"}, metadata.Gauge, metadata.Count, descAWSEC2DiskOps)
   228  	}
   229  	search.MetricName = aws.String("DiskWriteOps")
   230  	resp, err = cw.GetMetricStatistics(&search)
   231  	if err != nil {
   232  		return err
   233  	}
   234  	for _, datapoint := range resp.Datapoints {
   235  		AddTS(md, awsEC2DiskOps, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "write"}, metadata.Gauge, metadata.Count, descAWSEC2DiskOps)
   236  	}
   237  	return nil
   238  }
   239  
   240  func awsGetStatusChecks(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error {
   241  	period := int64(60)
   242  	search := cloudwatch.GetMetricStatisticsInput{
   243  		StartTime:  aws.Time(time.Now().UTC().Add(time.Second * -60)),
   244  		EndTime:    aws.Time(time.Now().UTC()),
   245  		MetricName: aws.String("StatusCheckFailed"),
   246  		Period:     &period,
   247  		Statistics: []*string{aws.String("Average")},
   248  		Namespace:  aws.String("AWS/EC2"),
   249  		Unit:       aws.String("Count"),
   250  		Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}},
   251  	}
   252  	resp, err := cw.GetMetricStatistics(&search)
   253  	if err != nil {
   254  		return err
   255  	}
   256  	for _, datapoint := range resp.Datapoints {
   257  		AddTS(md, awsStatusCheckFailed, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId}, metadata.Gauge, metadata.Count, descAWSEC2StatusCheck)
   258  	}
   259  	search.MetricName = aws.String("StatusCheckFailed_Instance")
   260  	resp, err = cw.GetMetricStatistics(&search)
   261  	if err != nil {
   262  		return err
   263  	}
   264  	for _, datapoint := range resp.Datapoints {
   265  		AddTS(md, awsStatusCheckFailed, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "category": "instance"}, metadata.Gauge, metadata.Count, descAWSEC2StatusCheck)
   266  	}
   267  	search.MetricName = aws.String("StatusCheckFailed_System")
   268  	resp, err = cw.GetMetricStatistics(&search)
   269  	if err != nil {
   270  		return err
   271  	}
   272  	for _, datapoint := range resp.Datapoints {
   273  		AddTS(md, awsStatusCheckFailed, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "category": "system"}, metadata.Gauge, metadata.Count, descAWSEC2StatusCheck)
   274  	}
   275  	return nil
   276  }
   277  
   278  func awsGetELBLatency(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, loadBalancer *elb.LoadBalancerDescription) error {
   279  	search := cloudwatch.GetMetricStatisticsInput{
   280  		StartTime:  aws.Time(time.Now().UTC().Add(time.Second * -4000)),
   281  		EndTime:    aws.Time(time.Now().UTC()),
   282  		MetricName: aws.String("Latency"),
   283  		Period:     &aws_period,
   284  		Statistics: []*string{aws.String("Average"), aws.String("Minimum"), aws.String("Maximum")},
   285  		Namespace:  aws.String("AWS/ELB"),
   286  		Unit:       aws.String("Seconds"),
   287  		Dimensions: []*cloudwatch.Dimension{{Name: aws.String("LoadBalancerName"), Value: loadBalancer.LoadBalancerName}},
   288  	}
   289  	resp, err := cw.GetMetricStatistics(&search)
   290  	if err != nil {
   291  		return err
   292  	}
   293  	for _, datapoint := range resp.Datapoints {
   294  		AddTS(md, awsELBLatencyMin, datapoint.Timestamp.Unix(), *datapoint.Minimum, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Second, descAWSELBLatency)
   295  		AddTS(md, awsELBLatencyMax, datapoint.Timestamp.Unix(), *datapoint.Maximum, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Second, descAWSELBLatency)
   296  		AddTS(md, awsELBLatencyAvg, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Second, descAWSELBLatency)
   297  	}
   298  	return nil
   299  }
   300  func awsGetELBHostCounts(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, loadBalancer *elb.LoadBalancerDescription) error {
   301  	search := cloudwatch.GetMetricStatisticsInput{
   302  		StartTime:  aws.Time(time.Now().UTC().Add(time.Second * -60)),
   303  		EndTime:    aws.Time(time.Now().UTC()),
   304  		MetricName: aws.String("HealthyHostCount"),
   305  		Period:     &aws_period,
   306  		Statistics: []*string{aws.String("Average")},
   307  		Namespace:  aws.String("AWS/ELB"),
   308  		Unit:       aws.String("Count"),
   309  		Dimensions: []*cloudwatch.Dimension{{Name: aws.String("LoadBalancerName"), Value: loadBalancer.LoadBalancerName}},
   310  	}
   311  	resp, err := cw.GetMetricStatistics(&search)
   312  	if err != nil {
   313  		return err
   314  	}
   315  	for _, datapoint := range resp.Datapoints {
   316  		AddTS(md, awsELBHostsHealthy, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Count, descAWSELBHostCount)
   317  	}
   318  	search.MetricName = aws.String("UnhealthyHostCount")
   319  	resp, err = cw.GetMetricStatistics(&search)
   320  	if err != nil {
   321  		return err
   322  	}
   323  	if resp.Datapoints == nil {
   324  		AddTS(md, awsELBHostsUnHealthy, time.Now().UTC().Unix(), 0, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Count, descAWSELBHostCount)
   325  	} else {
   326  		for _, datapoint := range resp.Datapoints {
   327  			AddTS(md, awsELBHostsUnHealthy, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Count, descAWSELBHostCount)
   328  		}
   329  	}
   330  	return nil
   331  }