bosun.org@v0.0.0-20250213104149-b8d3e981f37d/cmd/scollector/collectors/aws.go (about) 1 package collectors 2 3 import ( 4 "fmt" 5 "time" 6 7 "bosun.org/metadata" 8 "bosun.org/opentsdb" 9 "bosun.org/slog" 10 "github.com/aws/aws-sdk-go/aws" 11 "github.com/aws/aws-sdk-go/aws/credentials" 12 "github.com/aws/aws-sdk-go/aws/session" 13 "github.com/aws/aws-sdk-go/service/cloudwatch" 14 "github.com/aws/aws-sdk-go/service/ec2" 15 "github.com/aws/aws-sdk-go/service/elb" 16 ) 17 18 const ( 19 awsCPU = "aws.ec2.cpu" 20 awsEC2DiskBytes = "aws.ec2.disk.bytes" 21 awsEC2DiskOps = "aws.ec2.disk.ops" 22 awsELBHostsHealthy = "aws.elb.hosts.healthy" 23 awsELBHostsUnHealthy = "aws.elb.hosts.unhealthy" 24 awsELBLatencyAvg = "aws.elb.latency.average" 25 awsELBLatencyMax = "aws.elb.latency.maximum" 26 awsELBLatencyMin = "aws.elb.latency.minimum" 27 awsNetwork = "aws.ec2.net.bytes" 28 awsStatusCheckFailed = "aws.ec2.status.failed" 29 descAWSEC2CPU = "The average CPU Utilization, gathered at a 60 second interval and averaged over five minutes." 30 descAWSEC2DiskBytes = "The average bytes written or read via disk, gathered at a 60 second interval and averaged over five minutes." 31 descAWSEC2DiskOps = "The average disk operations, either written or read, gathered at a 60 second interval and averaged over five minutes." 32 descAWSEC2NetBytes = "The average bytes transmitted or received via network, gathered at a 60 second interval and averaged over five minutes." 33 descAWSEC2StatusCheck = "The EC2 Status Check, which includes both instance-level and system-level drill-down, gathered every 60 seconds." 34 descAWSELBHostCount = "The number of instances in what the Elastic Load Balancer considers a healthy state, gathered every 60 seconds." 35 descAWSELBLatency = "The minimum, maximum and average latency as reported by the load balancer, gathered at a 60 second interval and averaged over five minutes." 36 ) 37 38 var aws_period = int64(60) 39 40 func AWS(accessKey, secretKey, region, productCodes, bucketName, bucketPath string, purgeDays int) error { 41 if accessKey == "" || secretKey == "" || region == "" { 42 return fmt.Errorf("empty AccessKey, SecretKey, or Region in AWS") 43 } 44 //mhenderson: There are some alerts in the aws collector that we don't want to output in the event that 45 //billing only is enabled, as you might enable billing without having any EC3 or ELB instances. 46 billingEnabled := bucketName != "" && bucketPath != "" 47 collectors = append(collectors, &IntervalCollector{ 48 F: func() (opentsdb.MultiDataPoint, error) { 49 return c_aws(accessKey, secretKey, region, billingEnabled) 50 }, 51 Interval: 60 * time.Second, 52 name: fmt.Sprintf("aws-%s", region), 53 }) 54 55 if billingEnabled { 56 collectors = append(collectors, &IntervalCollector{ 57 F: func() (opentsdb.MultiDataPoint, error) { 58 return c_awsBilling(accessKey, secretKey, region, productCodes, bucketName, bucketPath, purgeDays) 59 }, 60 Interval: 1 * time.Hour, 61 name: fmt.Sprintf("awsBilling-%s", region), 62 }) 63 } 64 return nil 65 } 66 67 func c_aws(accessKey, secretKey, region string, billingEnabled bool) (opentsdb.MultiDataPoint, error) { 68 var md opentsdb.MultiDataPoint 69 creds := credentials.NewStaticCredentials(accessKey, secretKey, "") 70 conf := &aws.Config{ 71 Credentials: creds, 72 Region: ®ion, 73 } 74 ecc := ec2.New(session.New(), conf) 75 if ecc == nil { 76 return nil, fmt.Errorf("unable to login to EC2") 77 } 78 elb := elb.New(session.New(), conf) 79 if elb == nil { 80 return nil, fmt.Errorf("unable to login to ELB") 81 } 82 cw := cloudwatch.New(session.New(), conf) 83 if cw == nil { 84 return nil, fmt.Errorf("unable to login to CloudWatch") 85 } 86 instances, err := awsGetInstances(*ecc) 87 if err != nil && !billingEnabled { 88 slog.Warning("No EC2 Instances found.") 89 } 90 loadBalancers, err := awsGetLoadBalancers(*elb) 91 if err != nil && !billingEnabled { 92 slog.Warning("No ELB Load Balancers found.") 93 } 94 for _, loadBalancer := range loadBalancers { 95 awsGetELBLatency(*cw, &md, loadBalancer) 96 awsGetELBHostCounts(*cw, &md, loadBalancer) 97 } 98 for _, instance := range instances { 99 awsGetCPU(*cw, &md, instance) 100 awsGetNetwork(*cw, &md, instance) 101 awsGetDiskBytes(*cw, &md, instance) 102 awsGetDiskOps(*cw, &md, instance) 103 awsGetStatusChecks(*cw, &md, instance) 104 } 105 return md, nil 106 } 107 108 func awsGetInstances(ecc ec2.EC2) ([]*ec2.Instance, error) { 109 instancelist := []*ec2.Instance{} 110 resp, err := ecc.DescribeInstances(nil) 111 if err != nil { 112 return nil, fmt.Errorf("unable to describe EC2 Instances") 113 } 114 for _, reservation := range resp.Reservations { 115 instancelist = append(instancelist, reservation.Instances...) 116 } 117 return instancelist, nil 118 } 119 120 func awsGetLoadBalancers(lb elb.ELB) ([]*elb.LoadBalancerDescription, error) { 121 lbList := []*elb.LoadBalancerDescription{} 122 resp, err := lb.DescribeLoadBalancers(nil) 123 if err != nil { 124 return nil, fmt.Errorf("unable to describe ELB Balancers") 125 } 126 lbList = append(lbList, resp.LoadBalancerDescriptions...) 127 return lbList, nil 128 } 129 130 func awsGetCPU(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error { 131 search := cloudwatch.GetMetricStatisticsInput{ 132 StartTime: aws.Time(time.Now().UTC().Add(time.Second * -600)), 133 EndTime: aws.Time(time.Now().UTC()), 134 MetricName: aws.String("CPUUtilization"), 135 Period: &aws_period, 136 Statistics: []*string{aws.String("Average")}, 137 Namespace: aws.String("AWS/EC2"), 138 Unit: aws.String("Percent"), 139 Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}}, 140 } 141 resp, err := cw.GetMetricStatistics(&search) 142 if err != nil { 143 return err 144 } 145 tags := opentsdb.TagSet{ 146 "instance": *instance.InstanceId, 147 } 148 for _, datapoint := range resp.Datapoints { 149 AddTS(md, awsCPU, datapoint.Timestamp.Unix(), *datapoint.Average, tags, metadata.Gauge, metadata.Pct, descAWSEC2CPU) 150 } 151 return nil 152 } 153 func awsGetNetwork(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error { 154 search := cloudwatch.GetMetricStatisticsInput{ 155 StartTime: aws.Time(time.Now().UTC().Add(time.Second * -600)), 156 EndTime: aws.Time(time.Now().UTC()), 157 MetricName: aws.String("NetworkIn"), 158 Period: &aws_period, 159 Statistics: []*string{aws.String("Average")}, 160 Namespace: aws.String("AWS/EC2"), 161 Unit: aws.String("Bytes"), 162 Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}}, 163 } 164 resp, err := cw.GetMetricStatistics(&search) 165 if err != nil { 166 return err 167 } 168 for _, datapoint := range resp.Datapoints { 169 AddTS(md, awsNetwork, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "direction": "in"}, metadata.Gauge, metadata.Bytes, descAWSEC2NetBytes) 170 } 171 search.MetricName = aws.String("NetworkOut") 172 resp, err = cw.GetMetricStatistics(&search) 173 if err != nil { 174 return err 175 } 176 for _, datapoint := range resp.Datapoints { 177 AddTS(md, awsNetwork, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "direction": "out"}, metadata.Gauge, metadata.Bytes, descAWSEC2NetBytes) 178 } 179 return nil 180 } 181 182 func awsGetDiskBytes(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error { 183 search := cloudwatch.GetMetricStatisticsInput{ 184 StartTime: aws.Time(time.Now().UTC().Add(time.Second * -600)), 185 EndTime: aws.Time(time.Now().UTC()), 186 MetricName: aws.String("DiskReadBytes"), 187 Period: &aws_period, 188 Statistics: []*string{aws.String("Average")}, 189 Namespace: aws.String("AWS/EC2"), 190 Unit: aws.String("Bytes"), 191 Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}}, 192 } 193 resp, err := cw.GetMetricStatistics(&search) 194 if err != nil { 195 return err 196 } 197 for _, datapoint := range resp.Datapoints { 198 AddTS(md, awsEC2DiskBytes, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "read"}, metadata.Gauge, metadata.Bytes, descAWSEC2DiskBytes) 199 } 200 search.MetricName = aws.String("DiskWriteBytes") 201 resp, err = cw.GetMetricStatistics(&search) 202 if err != nil { 203 return err 204 } 205 for _, datapoint := range resp.Datapoints { 206 AddTS(md, awsEC2DiskBytes, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "write"}, metadata.Gauge, metadata.Bytes, descAWSEC2DiskBytes) 207 } 208 return nil 209 } 210 211 func awsGetDiskOps(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error { 212 search := cloudwatch.GetMetricStatisticsInput{ 213 StartTime: aws.Time(time.Now().UTC().Add(time.Second * -600)), 214 EndTime: aws.Time(time.Now().UTC()), 215 MetricName: aws.String("DiskReadOps"), 216 Period: &aws_period, 217 Statistics: []*string{aws.String("Average")}, 218 Namespace: aws.String("AWS/EC2"), 219 Unit: aws.String("Count"), 220 Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}}, 221 } 222 resp, err := cw.GetMetricStatistics(&search) 223 if err != nil { 224 return err 225 } 226 for _, datapoint := range resp.Datapoints { 227 AddTS(md, awsEC2DiskOps, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "read"}, metadata.Gauge, metadata.Count, descAWSEC2DiskOps) 228 } 229 search.MetricName = aws.String("DiskWriteOps") 230 resp, err = cw.GetMetricStatistics(&search) 231 if err != nil { 232 return err 233 } 234 for _, datapoint := range resp.Datapoints { 235 AddTS(md, awsEC2DiskOps, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "write"}, metadata.Gauge, metadata.Count, descAWSEC2DiskOps) 236 } 237 return nil 238 } 239 240 func awsGetStatusChecks(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error { 241 period := int64(60) 242 search := cloudwatch.GetMetricStatisticsInput{ 243 StartTime: aws.Time(time.Now().UTC().Add(time.Second * -60)), 244 EndTime: aws.Time(time.Now().UTC()), 245 MetricName: aws.String("StatusCheckFailed"), 246 Period: &period, 247 Statistics: []*string{aws.String("Average")}, 248 Namespace: aws.String("AWS/EC2"), 249 Unit: aws.String("Count"), 250 Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}}, 251 } 252 resp, err := cw.GetMetricStatistics(&search) 253 if err != nil { 254 return err 255 } 256 for _, datapoint := range resp.Datapoints { 257 AddTS(md, awsStatusCheckFailed, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId}, metadata.Gauge, metadata.Count, descAWSEC2StatusCheck) 258 } 259 search.MetricName = aws.String("StatusCheckFailed_Instance") 260 resp, err = cw.GetMetricStatistics(&search) 261 if err != nil { 262 return err 263 } 264 for _, datapoint := range resp.Datapoints { 265 AddTS(md, awsStatusCheckFailed, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "category": "instance"}, metadata.Gauge, metadata.Count, descAWSEC2StatusCheck) 266 } 267 search.MetricName = aws.String("StatusCheckFailed_System") 268 resp, err = cw.GetMetricStatistics(&search) 269 if err != nil { 270 return err 271 } 272 for _, datapoint := range resp.Datapoints { 273 AddTS(md, awsStatusCheckFailed, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "category": "system"}, metadata.Gauge, metadata.Count, descAWSEC2StatusCheck) 274 } 275 return nil 276 } 277 278 func awsGetELBLatency(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, loadBalancer *elb.LoadBalancerDescription) error { 279 search := cloudwatch.GetMetricStatisticsInput{ 280 StartTime: aws.Time(time.Now().UTC().Add(time.Second * -4000)), 281 EndTime: aws.Time(time.Now().UTC()), 282 MetricName: aws.String("Latency"), 283 Period: &aws_period, 284 Statistics: []*string{aws.String("Average"), aws.String("Minimum"), aws.String("Maximum")}, 285 Namespace: aws.String("AWS/ELB"), 286 Unit: aws.String("Seconds"), 287 Dimensions: []*cloudwatch.Dimension{{Name: aws.String("LoadBalancerName"), Value: loadBalancer.LoadBalancerName}}, 288 } 289 resp, err := cw.GetMetricStatistics(&search) 290 if err != nil { 291 return err 292 } 293 for _, datapoint := range resp.Datapoints { 294 AddTS(md, awsELBLatencyMin, datapoint.Timestamp.Unix(), *datapoint.Minimum, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Second, descAWSELBLatency) 295 AddTS(md, awsELBLatencyMax, datapoint.Timestamp.Unix(), *datapoint.Maximum, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Second, descAWSELBLatency) 296 AddTS(md, awsELBLatencyAvg, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Second, descAWSELBLatency) 297 } 298 return nil 299 } 300 func awsGetELBHostCounts(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, loadBalancer *elb.LoadBalancerDescription) error { 301 search := cloudwatch.GetMetricStatisticsInput{ 302 StartTime: aws.Time(time.Now().UTC().Add(time.Second * -60)), 303 EndTime: aws.Time(time.Now().UTC()), 304 MetricName: aws.String("HealthyHostCount"), 305 Period: &aws_period, 306 Statistics: []*string{aws.String("Average")}, 307 Namespace: aws.String("AWS/ELB"), 308 Unit: aws.String("Count"), 309 Dimensions: []*cloudwatch.Dimension{{Name: aws.String("LoadBalancerName"), Value: loadBalancer.LoadBalancerName}}, 310 } 311 resp, err := cw.GetMetricStatistics(&search) 312 if err != nil { 313 return err 314 } 315 for _, datapoint := range resp.Datapoints { 316 AddTS(md, awsELBHostsHealthy, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Count, descAWSELBHostCount) 317 } 318 search.MetricName = aws.String("UnhealthyHostCount") 319 resp, err = cw.GetMetricStatistics(&search) 320 if err != nil { 321 return err 322 } 323 if resp.Datapoints == nil { 324 AddTS(md, awsELBHostsUnHealthy, time.Now().UTC().Unix(), 0, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Count, descAWSELBHostCount) 325 } else { 326 for _, datapoint := range resp.Datapoints { 327 AddTS(md, awsELBHostsUnHealthy, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Count, descAWSELBHostCount) 328 } 329 } 330 return nil 331 }