bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/cmd/scollector/collectors/aws.go (about) 1 package collectors 2 3 import ( 4 "fmt" 5 "time" 6 7 "bosun.org/metadata" 8 "bosun.org/opentsdb" 9 "bosun.org/slog" 10 "github.com/aws/aws-sdk-go/aws" 11 "github.com/aws/aws-sdk-go/aws/credentials" 12 "github.com/aws/aws-sdk-go/aws/session" 13 "github.com/aws/aws-sdk-go/service/cloudwatch" 14 "github.com/aws/aws-sdk-go/service/ec2" 15 "github.com/aws/aws-sdk-go/service/elb" 16 ) 17 18 const ( 19 awsCPU = "aws.ec2.cpu" 20 awsEC2DiskBytes = "aws.ec2.disk.bytes" 21 awsEC2DiskOps = "aws.ec2.disk.ops" 22 awsELBHostsHealthy = "aws.elb.hosts.healthy" 23 awsELBHostsUnHealthy = "aws.elb.hosts.unhealthy" 24 awsELBLatencyAvg = "aws.elb.latency.average" 25 awsELBLatencyMax = "aws.elb.latency.maximum" 26 awsELBLatencyMin = "aws.elb.latency.minimum" 27 awsNetwork = "aws.ec2.net.bytes" 28 awsStatusCheckFailed = "aws.ec2.status.failed" 29 descAWSEC2CPU = "The average CPU Utilization, gathered at a 60 second interval and averaged over five minutes." 30 descAWSEC2DiskBytes = "The average bytes written or read via disk, gathered at a 60 second interval and averaged over five minutes." 31 descAWSEC2DiskOps = "The average disk operations, either written or read, gathered at a 60 second interval and averaged over five minutes." 32 descAWSEC2NetBytes = "The average bytes transmitted or received via network, gathered at a 60 second interval and averaged over five minutes." 33 descAWSEC2StatusCheck = "The EC2 Status Check, which includes both instance-level and system-level drill-down, gathered every 60 seconds." 34 descAWSELBHostCount = "The number of instances in what the Elastic Load Balancer considers a healthy state, gathered every 60 seconds." 35 descAWSELBLatency = "The minimum, maximum and average latency as reported by the load balancer, gathered at a 60 second interval and averaged over five minutes." 36 ) 37 38 var aws_period = int64(60) 39 40 func AWS(accessKey, secretKey, region, productCodes, bucketName, bucketPath string, purgeDays int) error { 41 if accessKey == "" || secretKey == "" || region == "" { 42 return fmt.Errorf("empty AccessKey, SecretKey, or Region in AWS") 43 } 44 //mhenderson: There are some alerts in the aws collector that we don't want to output in the event that 45 //billing only is enabled, as you might enable billing without having any EC3 or ELB instances. 46 billingEnabled := bucketName != "" && bucketPath != "" 47 collectors = append(collectors, &IntervalCollector{ 48 F: func() (opentsdb.MultiDataPoint, error) { 49 return c_aws(accessKey, secretKey, region, billingEnabled) 50 }, 51 Interval: 60 * time.Second, 52 name: fmt.Sprintf("aws-%s", region), 53 }) 54 55 if billingEnabled { 56 collectors = append(collectors, &IntervalCollector{ 57 F: func() (opentsdb.MultiDataPoint, error) { 58 return c_awsBilling(accessKey, secretKey, region, productCodes, bucketName, bucketPath, purgeDays) 59 }, 60 Interval: 1 * time.Hour, 61 name: fmt.Sprintf("awsBilling-%s", region), 62 }) 63 } 64 return nil 65 } 66 67 func c_aws(accessKey, secretKey, region string, billingEnabled bool) (opentsdb.MultiDataPoint, error) { 68 var md opentsdb.MultiDataPoint 69 creds := credentials.NewStaticCredentials(accessKey, secretKey, "") 70 conf := &aws.Config{ 71 Credentials: creds, 72 Region: ®ion, 73 } 74 ecc := ec2.New(session.New(), conf) 75 if ecc == nil { 76 return nil, fmt.Errorf("unable to login to EC2") 77 } 78 elb := elb.New(session.New(), conf) 79 if elb == nil { 80 return nil, fmt.Errorf("unable to login to ELB") 81 } 82 cw := cloudwatch.New(session.New(), conf) 83 if cw == nil { 84 return nil, fmt.Errorf("unable to login to CloudWatch") 85 } 86 instances, err := awsGetInstances(*ecc) 87 if err != nil && !billingEnabled { 88 slog.Warning("No EC2 Instances found.") 89 } 90 loadBalancers, err := awsGetLoadBalancers(*elb) 91 if err != nil && !billingEnabled { 92 slog.Warning("No ELB Load Balancers found.") 93 } 94 for _, loadBalancer := range loadBalancers { 95 awsGetELBLatency(*cw, &md, loadBalancer) 96 awsGetELBHostCounts(*cw, &md, loadBalancer) 97 } 98 for _, instance := range instances { 99 awsGetCPU(*cw, &md, instance) 100 awsGetNetwork(*cw, &md, instance) 101 awsGetDiskBytes(*cw, &md, instance) 102 awsGetDiskOps(*cw, &md, instance) 103 awsGetStatusChecks(*cw, &md, instance) 104 } 105 return md, nil 106 } 107 108 func awsGetInstances(ecc ec2.EC2) ([]*ec2.Instance, error) { 109 instancelist := []*ec2.Instance{} 110 resp, err := ecc.DescribeInstances(nil) 111 if err != nil { 112 return nil, fmt.Errorf("unable to describe EC2 Instances") 113 } 114 for _, reservation := range resp.Reservations { 115 for _, instance := range reservation.Instances { 116 instancelist = append(instancelist, instance) 117 } 118 } 119 return instancelist, nil 120 } 121 122 func awsGetLoadBalancers(lb elb.ELB) ([]*elb.LoadBalancerDescription, error) { 123 lbList := []*elb.LoadBalancerDescription{} 124 resp, err := lb.DescribeLoadBalancers(nil) 125 if err != nil { 126 return nil, fmt.Errorf("unable to describe ELB Balancers") 127 } 128 for _, loadBalancer := range resp.LoadBalancerDescriptions { 129 lbList = append(lbList, loadBalancer) 130 } 131 return lbList, nil 132 } 133 134 func awsGetCPU(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error { 135 search := cloudwatch.GetMetricStatisticsInput{ 136 StartTime: aws.Time(time.Now().UTC().Add(time.Second * -600)), 137 EndTime: aws.Time(time.Now().UTC()), 138 MetricName: aws.String("CPUUtilization"), 139 Period: &aws_period, 140 Statistics: []*string{aws.String("Average")}, 141 Namespace: aws.String("AWS/EC2"), 142 Unit: aws.String("Percent"), 143 Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}}, 144 } 145 resp, err := cw.GetMetricStatistics(&search) 146 if err != nil { 147 return err 148 } 149 tags := opentsdb.TagSet{ 150 "instance": *instance.InstanceId, 151 } 152 for _, datapoint := range resp.Datapoints { 153 AddTS(md, awsCPU, datapoint.Timestamp.Unix(), *datapoint.Average, tags, metadata.Gauge, metadata.Pct, descAWSEC2CPU) 154 } 155 return nil 156 } 157 func awsGetNetwork(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error { 158 search := cloudwatch.GetMetricStatisticsInput{ 159 StartTime: aws.Time(time.Now().UTC().Add(time.Second * -600)), 160 EndTime: aws.Time(time.Now().UTC()), 161 MetricName: aws.String("NetworkIn"), 162 Period: &aws_period, 163 Statistics: []*string{aws.String("Average")}, 164 Namespace: aws.String("AWS/EC2"), 165 Unit: aws.String("Bytes"), 166 Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}}, 167 } 168 resp, err := cw.GetMetricStatistics(&search) 169 if err != nil { 170 return err 171 } 172 for _, datapoint := range resp.Datapoints { 173 AddTS(md, awsNetwork, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "direction": "in"}, metadata.Gauge, metadata.Bytes, descAWSEC2NetBytes) 174 } 175 search.MetricName = aws.String("NetworkOut") 176 resp, err = cw.GetMetricStatistics(&search) 177 if err != nil { 178 return err 179 } 180 for _, datapoint := range resp.Datapoints { 181 AddTS(md, awsNetwork, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "direction": "out"}, metadata.Gauge, metadata.Bytes, descAWSEC2NetBytes) 182 } 183 return nil 184 } 185 186 func awsGetDiskBytes(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error { 187 search := cloudwatch.GetMetricStatisticsInput{ 188 StartTime: aws.Time(time.Now().UTC().Add(time.Second * -600)), 189 EndTime: aws.Time(time.Now().UTC()), 190 MetricName: aws.String("DiskReadBytes"), 191 Period: &aws_period, 192 Statistics: []*string{aws.String("Average")}, 193 Namespace: aws.String("AWS/EC2"), 194 Unit: aws.String("Bytes"), 195 Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}}, 196 } 197 resp, err := cw.GetMetricStatistics(&search) 198 if err != nil { 199 return err 200 } 201 for _, datapoint := range resp.Datapoints { 202 AddTS(md, awsEC2DiskBytes, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "read"}, metadata.Gauge, metadata.Bytes, descAWSEC2DiskBytes) 203 } 204 search.MetricName = aws.String("DiskWriteBytes") 205 resp, err = cw.GetMetricStatistics(&search) 206 if err != nil { 207 return err 208 } 209 for _, datapoint := range resp.Datapoints { 210 AddTS(md, awsEC2DiskBytes, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "write"}, metadata.Gauge, metadata.Bytes, descAWSEC2DiskBytes) 211 } 212 return nil 213 } 214 215 func awsGetDiskOps(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error { 216 search := cloudwatch.GetMetricStatisticsInput{ 217 StartTime: aws.Time(time.Now().UTC().Add(time.Second * -600)), 218 EndTime: aws.Time(time.Now().UTC()), 219 MetricName: aws.String("DiskReadOps"), 220 Period: &aws_period, 221 Statistics: []*string{aws.String("Average")}, 222 Namespace: aws.String("AWS/EC2"), 223 Unit: aws.String("Count"), 224 Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}}, 225 } 226 resp, err := cw.GetMetricStatistics(&search) 227 if err != nil { 228 return err 229 } 230 for _, datapoint := range resp.Datapoints { 231 AddTS(md, awsEC2DiskOps, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "read"}, metadata.Gauge, metadata.Count, descAWSEC2DiskOps) 232 } 233 search.MetricName = aws.String("DiskWriteOps") 234 resp, err = cw.GetMetricStatistics(&search) 235 if err != nil { 236 return err 237 } 238 for _, datapoint := range resp.Datapoints { 239 AddTS(md, awsEC2DiskOps, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "operation": "write"}, metadata.Gauge, metadata.Count, descAWSEC2DiskOps) 240 } 241 return nil 242 } 243 244 func awsGetStatusChecks(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, instance *ec2.Instance) error { 245 period := int64(60) 246 search := cloudwatch.GetMetricStatisticsInput{ 247 StartTime: aws.Time(time.Now().UTC().Add(time.Second * -60)), 248 EndTime: aws.Time(time.Now().UTC()), 249 MetricName: aws.String("StatusCheckFailed"), 250 Period: &period, 251 Statistics: []*string{aws.String("Average")}, 252 Namespace: aws.String("AWS/EC2"), 253 Unit: aws.String("Count"), 254 Dimensions: []*cloudwatch.Dimension{{Name: aws.String("InstanceId"), Value: instance.InstanceId}}, 255 } 256 resp, err := cw.GetMetricStatistics(&search) 257 if err != nil { 258 return err 259 } 260 for _, datapoint := range resp.Datapoints { 261 AddTS(md, awsStatusCheckFailed, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId}, metadata.Gauge, metadata.Count, descAWSEC2StatusCheck) 262 } 263 search.MetricName = aws.String("StatusCheckFailed_Instance") 264 resp, err = cw.GetMetricStatistics(&search) 265 if err != nil { 266 return err 267 } 268 for _, datapoint := range resp.Datapoints { 269 AddTS(md, awsStatusCheckFailed, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "category": "instance"}, metadata.Gauge, metadata.Count, descAWSEC2StatusCheck) 270 } 271 search.MetricName = aws.String("StatusCheckFailed_System") 272 resp, err = cw.GetMetricStatistics(&search) 273 if err != nil { 274 return err 275 } 276 for _, datapoint := range resp.Datapoints { 277 AddTS(md, awsStatusCheckFailed, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"instance": *instance.InstanceId, "category": "system"}, metadata.Gauge, metadata.Count, descAWSEC2StatusCheck) 278 } 279 return nil 280 } 281 282 func awsGetELBLatency(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, loadBalancer *elb.LoadBalancerDescription) error { 283 search := cloudwatch.GetMetricStatisticsInput{ 284 StartTime: aws.Time(time.Now().UTC().Add(time.Second * -4000)), 285 EndTime: aws.Time(time.Now().UTC()), 286 MetricName: aws.String("Latency"), 287 Period: &aws_period, 288 Statistics: []*string{aws.String("Average"), aws.String("Minimum"), aws.String("Maximum")}, 289 Namespace: aws.String("AWS/ELB"), 290 Unit: aws.String("Seconds"), 291 Dimensions: []*cloudwatch.Dimension{{Name: aws.String("LoadBalancerName"), Value: loadBalancer.LoadBalancerName}}, 292 } 293 resp, err := cw.GetMetricStatistics(&search) 294 if err != nil { 295 return err 296 } 297 for _, datapoint := range resp.Datapoints { 298 AddTS(md, awsELBLatencyMin, datapoint.Timestamp.Unix(), *datapoint.Minimum, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Second, descAWSELBLatency) 299 AddTS(md, awsELBLatencyMax, datapoint.Timestamp.Unix(), *datapoint.Maximum, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Second, descAWSELBLatency) 300 AddTS(md, awsELBLatencyAvg, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Second, descAWSELBLatency) 301 } 302 return nil 303 } 304 func awsGetELBHostCounts(cw cloudwatch.CloudWatch, md *opentsdb.MultiDataPoint, loadBalancer *elb.LoadBalancerDescription) error { 305 search := cloudwatch.GetMetricStatisticsInput{ 306 StartTime: aws.Time(time.Now().UTC().Add(time.Second * -60)), 307 EndTime: aws.Time(time.Now().UTC()), 308 MetricName: aws.String("HealthyHostCount"), 309 Period: &aws_period, 310 Statistics: []*string{aws.String("Average")}, 311 Namespace: aws.String("AWS/ELB"), 312 Unit: aws.String("Count"), 313 Dimensions: []*cloudwatch.Dimension{{Name: aws.String("LoadBalancerName"), Value: loadBalancer.LoadBalancerName}}, 314 } 315 resp, err := cw.GetMetricStatistics(&search) 316 if err != nil { 317 return err 318 } 319 for _, datapoint := range resp.Datapoints { 320 AddTS(md, awsELBHostsHealthy, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Count, descAWSELBHostCount) 321 } 322 search.MetricName = aws.String("UnhealthyHostCount") 323 resp, err = cw.GetMetricStatistics(&search) 324 if err != nil { 325 return err 326 } 327 if resp.Datapoints == nil { 328 AddTS(md, awsELBHostsUnHealthy, time.Now().UTC().Unix(), 0, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Count, descAWSELBHostCount) 329 } else { 330 for _, datapoint := range resp.Datapoints { 331 AddTS(md, awsELBHostsUnHealthy, datapoint.Timestamp.Unix(), *datapoint.Average, opentsdb.TagSet{"loadbalancer": *loadBalancer.LoadBalancerName}, metadata.Gauge, metadata.Count, descAWSELBHostCount) 332 } 333 } 334 return nil 335 }