// Source: github.com/galamsiva2020/kubernetes-heapster-monitoring@v0.0.0-20210823134957-3c1baa7c1e70/metrics/sinks/stackdriver/stackdriver.go

// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package stackdriver

import (
	"context"
	"encoding/json"
	"fmt"
	"math/rand"
	"net/url"
	"strconv"
	"strings"
	"time"

	gce "cloud.google.com/go/compute/metadata"
	sd_api "cloud.google.com/go/monitoring/apiv3"
	"github.com/golang/glog"
	google_proto "github.com/golang/protobuf/ptypes/timestamp"
	"github.com/prometheus/client_golang/prometheus"
	"google.golang.org/genproto/googleapis/api/metric"
	"google.golang.org/genproto/googleapis/api/monitoredres"
	monitoringpb "google.golang.org/genproto/googleapis/monitoring/v3"
	grpc_codes "google.golang.org/grpc/codes"
	grpc_status "google.golang.org/grpc/status"
	gce_util "k8s.io/heapster/common/gce"
	"k8s.io/heapster/metrics/core"
)

const (
	// maxTimeseriesPerRequest caps how many TimeSeries ExportData packs into a
	// single CreateTimeSeriesRequest before starting a new one.
	maxTimeseriesPerRequest = 200
	// 2 seconds on SD side, 1 extra for networking overhead
	sdRequestLatencySec = 3
)

// StackdriverSink is a Heapster data sink that exports metric batches to
// Stackdriver Monitoring.
type StackdriverSink struct {
	// project is passed to getReq and used as the "project_id" resource label.
	project string
	// clusterName is used as the "cluster_name" resource label.
	clusterName string
	// clusterLocation is used as the "location" label of the new resource model.
	clusterLocation string
	// heapsterZone is used as the "zone" label of the legacy resource model.
	heapsterZone string
	// stackdriverClient is the client used to issue CreateTimeSeries calls.
	stackdriverClient *sd_api.MetricClient
	// minInterval is the minimum time between two exported batches.
	minInterval time.Duration
	// lastExportTime is the timestamp of the most recently accepted batch.
	lastExportTime time.Time
	// batchExportTimeoutSec bounds (in seconds) how long one batch may take to send.
	batchExportTimeoutSec int
	// initialDelaySec is the upper bound (in seconds) of the random delay each
	// sender worker sleeps before it starts sending.
	initialDelaySec int
	// useOldResourceModel enables the legacy translation functions.
	useOldResourceModel bool
	// useNewResourceModel enables the new-resource-model translation functions.
	useNewResourceModel bool
}

type metricMetadata struct { 62 MetricKind metric.MetricDescriptor_MetricKind 63 ValueType metric.MetricDescriptor_ValueType 64 Name string 65 } 66 67 var ( 68 // Sink performance metrics 69 70 requestsSent = prometheus.NewCounterVec( 71 prometheus.CounterOpts{ 72 Namespace: "heapster", 73 Subsystem: "stackdriver", 74 Name: "requests_count", 75 Help: "Number of requests with return codes", 76 }, 77 []string{"code"}, 78 ) 79 80 timeseriesSent = prometheus.NewCounterVec( 81 prometheus.CounterOpts{ 82 Namespace: "heapster", 83 Subsystem: "stackdriver", 84 Name: "timeseries_count", 85 Help: "Number of Timeseries sent with return codes", 86 }, 87 []string{"code"}, 88 ) 89 requestLatency = prometheus.NewSummary( 90 prometheus.SummaryOpts{ 91 Namespace: "heapster", 92 Subsystem: "stackdriver", 93 Name: "request_latency_milliseconds", 94 Help: "Latency of requests to Stackdriver Monitoring API.", 95 }, 96 ) 97 ) 98 99 func (sink *StackdriverSink) Name() string { 100 return "Stackdriver Sink" 101 } 102 103 func (sink *StackdriverSink) Stop() { 104 } 105 106 func (sink *StackdriverSink) processMetrics(metricValues map[string]core.MetricValue, 107 timestamp time.Time, labels map[string]string, collectionStartTime time.Time, entityCreateTime time.Time) []*monitoringpb.TimeSeries { 108 var timeSeries []*monitoringpb.TimeSeries 109 if sink.useOldResourceModel { 110 for name, value := range metricValues { 111 if ts := sink.LegacyTranslateMetric(timestamp, labels, name, value, collectionStartTime); ts != nil { 112 timeSeries = append(timeSeries, ts) 113 } 114 } 115 } 116 if sink.useNewResourceModel { 117 for name, value := range metricValues { 118 if ts := sink.TranslateMetric(timestamp, labels, name, value, collectionStartTime, entityCreateTime); ts != nil { 119 timeSeries = append(timeSeries, ts) 120 } 121 } 122 } 123 return timeSeries 124 } 125 126 func (sink *StackdriverSink) ExportData(dataBatch *core.DataBatch) { 127 // Make sure we don't export metrics too often. 
128 if dataBatch.Timestamp.Before(sink.lastExportTime.Add(sink.minInterval)) { 129 glog.V(2).Infof("Skipping batch from %s because there hasn't passed %s from last export time %s", dataBatch.Timestamp, sink.minInterval, sink.lastExportTime) 130 return 131 } 132 sink.lastExportTime = dataBatch.Timestamp 133 134 requests := []*monitoringpb.CreateTimeSeriesRequest{} 135 req := getReq(sink.project) 136 for key, metricSet := range dataBatch.MetricSets { 137 switch metricSet.Labels["type"] { 138 case core.MetricSetTypeNode, core.MetricSetTypePod, core.MetricSetTypePodContainer, core.MetricSetTypeSystemContainer: 139 default: 140 continue 141 } 142 143 if metricSet.CollectionStartTime.IsZero() { 144 glog.V(2).Infof("Skipping incorrect metric set %s because collection start time is zero", key) 145 continue 146 } 147 148 // Hack used with legacy resource type "gke_container". It is used to represent three 149 // Kubernetes resources: container, pod or node. For pods container name is empty, for nodes it 150 // is set to artificial value "machine". Otherwise it stores actual container name. 151 // With new resource types, container_name is ignored for resources other than "k8s_container" 152 if sink.useOldResourceModel && metricSet.Labels["type"] == core.MetricSetTypeNode { 153 metricSet.Labels[core.LabelContainerName.Key] = "machine" 154 } 155 156 derivedMetrics := sink.computeDerivedMetrics(metricSet) 157 158 derivedTimeseries := sink.processMetrics(derivedMetrics.MetricValues, dataBatch.Timestamp, metricSet.Labels, metricSet.CollectionStartTime, metricSet.EntityCreateTime) 159 timeseries := sink.processMetrics(metricSet.MetricValues, dataBatch.Timestamp, metricSet.Labels, metricSet.CollectionStartTime, metricSet.EntityCreateTime) 160 161 timeseries = append(timeseries, derivedTimeseries...) 
162 163 for _, ts := range timeseries { 164 req.TimeSeries = append(req.TimeSeries, ts) 165 if len(req.TimeSeries) >= maxTimeseriesPerRequest { 166 requests = append(requests, req) 167 req = getReq(sink.project) 168 } 169 } 170 171 for _, metric := range metricSet.LabeledMetrics { 172 if sink.useOldResourceModel { 173 if point := sink.LegacyTranslateLabeledMetric(dataBatch.Timestamp, metricSet.Labels, metric, metricSet.CollectionStartTime); point != nil { 174 req.TimeSeries = append(req.TimeSeries, point) 175 } 176 177 if len(req.TimeSeries) >= maxTimeseriesPerRequest { 178 requests = append(requests, req) 179 req = getReq(sink.project) 180 } 181 } 182 if sink.useNewResourceModel { 183 point := sink.TranslateLabeledMetric(dataBatch.Timestamp, metricSet.Labels, metric, metricSet.CollectionStartTime) 184 if point != nil { 185 req.TimeSeries = append(req.TimeSeries, point) 186 } 187 188 if len(req.TimeSeries) >= maxTimeseriesPerRequest { 189 requests = append(requests, req) 190 req = getReq(sink.project) 191 } 192 } 193 } 194 } 195 196 if len(req.TimeSeries) > 0 { 197 requests = append(requests, req) 198 } 199 200 go sink.sendRequests(requests) 201 } 202 203 func (sink *StackdriverSink) sendRequests(requests []*monitoringpb.CreateTimeSeriesRequest) { 204 // Each worker can handle at least batchExportTimeout/sdRequestLatencySec requests within the specified period. 205 // 5 extra workers just in case. 
206 workers := 5 + len(requests)/(sink.batchExportTimeoutSec/sdRequestLatencySec) 207 requestQueue := make(chan *monitoringpb.CreateTimeSeriesRequest) 208 completedQueue := make(chan bool) 209 210 // Launch Go routines responsible for sending requests 211 for i := 0; i < workers; i++ { 212 go sink.requestSender(requestQueue, completedQueue) 213 } 214 215 timeout := time.Duration(sink.batchExportTimeoutSec) * time.Second 216 timeoutSending := time.After(timeout) 217 timeoutCompleted := time.After(timeout) 218 219 forloop: 220 for i, r := range requests { 221 select { 222 case requestQueue <- r: 223 // yet another request added to queue 224 case <-timeoutSending: 225 glog.Warningf("Timeout while exporting metrics to Stackdriver. Dropping %d out of %d requests.", len(requests)-i, len(requests)) 226 // TODO(piosz): consider cancelling requests in flight 227 // Report dropped requests in metrics. 228 for _, req := range requests[i:] { 229 requestsSent.WithLabelValues(grpc_codes.DeadlineExceeded.String()).Inc() 230 timeseriesSent. 231 WithLabelValues(grpc_codes.DeadlineExceeded.String()). 232 Add(float64(len(req.TimeSeries))) 233 } 234 break forloop 235 } 236 } 237 238 // Close the channel in order to cancel exporting routines. 239 close(requestQueue) 240 241 workersCompleted := 0 242 for { 243 select { 244 case <-completedQueue: 245 workersCompleted++ 246 if workersCompleted == workers { 247 glog.V(4).Infof("All %d workers successfully finished sending requests to SD.", workersCompleted) 248 return 249 } 250 case <-timeoutCompleted: 251 glog.Warningf("Only %d out of %d workers successfully finished sending requests to SD. 
Some metrics might be lost.", workersCompleted, workers) 252 return 253 } 254 } 255 } 256 257 func (sink *StackdriverSink) requestSender(reqQueue chan *monitoringpb.CreateTimeSeriesRequest, completedQueue chan bool) { 258 defer func() { 259 completedQueue <- true 260 }() 261 time.Sleep(time.Duration(rand.Intn(1000*sink.initialDelaySec)) * time.Millisecond) 262 for req := range reqQueue { 263 sink.sendOneRequest(req) 264 } 265 } 266 267 func marshalRequestAndLog(printer func([]byte), req *monitoringpb.CreateTimeSeriesRequest) { 268 reqJson, errJson := json.Marshal(req) 269 if errJson != nil { 270 glog.Errorf("Couldn't marshal Stackdriver request %v", errJson) 271 } else { 272 printer(reqJson) 273 } 274 } 275 276 func (sink *StackdriverSink) sendOneRequest(req *monitoringpb.CreateTimeSeriesRequest) { 277 startTime := time.Now() 278 err := sink.stackdriverClient.CreateTimeSeries(context.Background(), req) 279 280 var responseCode grpc_codes.Code 281 if err != nil { 282 glog.Warningf("Error while sending request to Stackdriver %v", err) 283 // Convert request to json and log it, but only if logging level is equal to 2 or more. 284 if glog.V(2) { 285 marshalRequestAndLog(func(reqJson []byte) { 286 glog.V(2).Infof("The request was: %s", reqJson) 287 }, req) 288 } 289 if status, ok := grpc_status.FromError(err); ok { 290 responseCode = status.Code() 291 } else { 292 responseCode = grpc_codes.Unknown 293 } 294 } else { 295 // Convert request to json and log it, but only if logging level is equal to 10 or more. 296 if glog.V(10) { 297 marshalRequestAndLog(func(reqJson []byte) { 298 glog.V(10).Infof("Stackdriver request sent: %s", reqJson) 299 }, req) 300 } 301 responseCode = grpc_codes.OK 302 } 303 304 requestsSent.WithLabelValues(responseCode.String()).Inc() 305 timeseriesSent. 306 WithLabelValues(responseCode.String()). 
307 Add(float64(len(req.TimeSeries))) 308 requestLatency.Observe(time.Since(startTime).Seconds() / time.Millisecond.Seconds()) 309 } 310 311 func CreateStackdriverSink(uri *url.URL) (core.DataSink, error) { 312 if len(uri.Scheme) > 0 { 313 return nil, fmt.Errorf("Scheme should not be set for Stackdriver sink") 314 } 315 if len(uri.Host) > 0 { 316 return nil, fmt.Errorf("Host should not be set for Stackdriver sink") 317 } 318 319 opts := uri.Query() 320 321 useOldResourceModel := true 322 if err := parseBoolFlag(opts, "use_old_resources", &useOldResourceModel); err != nil { 323 return nil, err 324 } 325 useNewResourceModel := false 326 if err := parseBoolFlag(opts, "use_new_resources", &useNewResourceModel); err != nil { 327 return nil, err 328 } 329 330 minInterval := time.Nanosecond 331 if len(opts["min_interval_sec"]) >= 1 { 332 if interval, err := strconv.Atoi(opts["min_interval_sec"][0]); err != nil { 333 return nil, fmt.Errorf("Min interval should be an integer, found: %v", opts["min_interval_sec"][0]) 334 } else { 335 minInterval = time.Duration(interval) * time.Second 336 } 337 } 338 339 batchExportTimeoutSec := 60 340 var err error 341 if len(opts["batch_export_timeout_sec"]) >= 1 { 342 if batchExportTimeoutSec, err = strconv.Atoi(opts["batch_export_timeout_sec"][0]); err != nil { 343 return nil, fmt.Errorf("Batch export timeout should be an integer, found: %v", opts["batch_export_timeout_sec"][0]) 344 } 345 } 346 347 initialDelaySec := sdRequestLatencySec 348 if len(opts["initial_delay_sec"]) >= 1 { 349 if initialDelaySec, err = strconv.Atoi(opts["initial_delay_sec"][0]); err != nil { 350 return nil, fmt.Errorf("Initial delay should be an integer, found: %v", opts["initial_delay_sec"][0]) 351 } 352 } 353 354 var projectId, heapsterZone string 355 // Cluster name and location are required when useNewResourceModel is true. 
356 var clusterName, clusterLocation string 357 358 if len(opts["cluster_name"]) >= 1 { 359 clusterName = opts["cluster_name"][0] 360 } 361 362 if len(opts["cluster_location"]) >= 1 { 363 clusterLocation = opts["cluster_location"][0] 364 } 365 366 if gce.OnGCE() { 367 // Detect project ID 368 projectId, err = gce.ProjectID() 369 if err != nil { 370 return nil, err 371 } 372 373 // Detect zone for old resource model 374 heapsterZone, err = gce.Zone() 375 if err != nil { 376 glog.Warningf("Zone could not be discovered using the GCE Metadata Server: %s", err) 377 378 if useOldResourceModel { 379 return nil, err 380 } 381 } 382 383 if useNewResourceModel { 384 if clusterName == "" { 385 glog.Info("An empty cluster name has been provided, checking the GCE Metadata Server to try to auto-detect.") 386 387 clusterName, err = gce.InstanceAttributeValue("cluster-name") 388 if err == nil { 389 glog.Infof("Discovered '%s' as the cluster name from the GCE Metadata Server.", clusterName) 390 } else { 391 glog.Warningf("Cluster name could not be discovered using the GCE Metadata Server: %s", err) 392 } 393 } 394 395 if clusterLocation == "" { 396 glog.Info("An empty cluster location has been provided, checking the GCE Metadata Server to try to auto-detect.") 397 398 clusterLocation, err = gce.InstanceAttributeValue("cluster-location") 399 if err == nil { 400 glog.Infof("Discovered '%s' as the cluster location from the GCE Metadata Server.", clusterLocation) 401 } else { 402 glog.Warningf("Cluster location could not be discovered using the GCE Metadata Server: %s", err) 403 } 404 } 405 } 406 } else { 407 // Detect project ID from the environment 408 projectId, err = gce_util.GetProjectId() 409 if err != nil { 410 return nil, err 411 } 412 413 heapsterZone = opts["zone"][0] 414 } 415 416 if useNewResourceModel { 417 if clusterName == "" { 418 glog.Warning("Cluster name required but not provided, using empty cluster name.") 419 } 420 421 if clusterLocation == "" { 422 
glog.Warning("Cluster location required with new resource model but not provided. Falling back to the zone where Heapster runs.") 423 clusterLocation = heapsterZone 424 } 425 } 426 427 // Create Metric Client 428 stackdriverClient, err := sd_api.NewMetricClient(context.Background()) 429 if err != nil { 430 return nil, err 431 } 432 433 sink := &StackdriverSink{ 434 project: projectId, 435 clusterName: clusterName, 436 clusterLocation: clusterLocation, 437 heapsterZone: heapsterZone, 438 stackdriverClient: stackdriverClient, 439 minInterval: minInterval, 440 batchExportTimeoutSec: batchExportTimeoutSec, 441 initialDelaySec: initialDelaySec, 442 useOldResourceModel: useOldResourceModel, 443 useNewResourceModel: useNewResourceModel, 444 } 445 446 // Register sink metrics 447 prometheus.MustRegister(requestsSent) 448 prometheus.MustRegister(timeseriesSent) 449 prometheus.MustRegister(requestLatency) 450 451 glog.Infof("Created Stackdriver sink") 452 453 return sink, nil 454 } 455 456 func parseBoolFlag(opts map[string][]string, name string, targetValue *bool) error { 457 if len(opts[name]) >= 1 { 458 var err error 459 *targetValue, err = strconv.ParseBool(opts[name][0]) 460 if err != nil { 461 return fmt.Errorf("%s = %s is not correct boolean value", name, opts[name][0]) 462 } 463 } 464 return nil 465 } 466 467 func (sink *StackdriverSink) computeDerivedMetrics(metricSet *core.MetricSet) *core.MetricSet { 468 newMetricSet := &core.MetricSet{MetricValues: map[string]core.MetricValue{}} 469 usage, usageOK := metricSet.MetricValues[core.MetricMemoryUsage.MetricDescriptor.Name] 470 workingSet, workingSetOK := metricSet.MetricValues[core.MetricMemoryWorkingSet.MetricDescriptor.Name] 471 472 if usageOK && workingSetOK { 473 newMetricSet.MetricValues["memory/bytes_used"] = core.MetricValue{ 474 IntValue: usage.IntValue - workingSet.IntValue, 475 } 476 } 477 478 memoryFaults, memoryFaultsOK := metricSet.MetricValues[core.MetricMemoryPageFaults.MetricDescriptor.Name] 479 
majorMemoryFaults, majorMemoryFaultsOK := metricSet.MetricValues[core.MetricMemoryMajorPageFaults.MetricDescriptor.Name] 480 if memoryFaultsOK && majorMemoryFaultsOK { 481 newMetricSet.MetricValues["memory/minor_page_faults"] = core.MetricValue{ 482 IntValue: memoryFaults.IntValue - majorMemoryFaults.IntValue, 483 } 484 } 485 486 return newMetricSet 487 } 488 489 func (sink *StackdriverSink) LegacyTranslateLabeledMetric(timestamp time.Time, labels map[string]string, metric core.LabeledMetric, collectionStartTime time.Time) *monitoringpb.TimeSeries { 490 resourceLabels := sink.legacyGetResourceLabels(labels) 491 switch metric.Name { 492 case core.MetricFilesystemUsage.MetricDescriptor.Name: 493 point := sink.intPoint(timestamp, timestamp, metric.IntValue) 494 ts := legacyCreateTimeSeries(resourceLabels, legacyDiskBytesUsedMD, point) 495 ts.Metric.Labels = map[string]string{ 496 "device_name": metric.Labels[core.LabelResourceID.Key], 497 } 498 return ts 499 case core.MetricFilesystemLimit.MetricDescriptor.Name: 500 point := sink.intPoint(timestamp, timestamp, metric.IntValue) 501 ts := legacyCreateTimeSeries(resourceLabels, legacyDiskBytesTotalMD, point) 502 ts.Metric.Labels = map[string]string{ 503 "device_name": metric.Labels[core.LabelResourceID.Key], 504 } 505 return ts 506 case core.MetricAcceleratorMemoryTotal.MetricDescriptor.Name: 507 point := sink.intPoint(timestamp, timestamp, metric.IntValue) 508 ts := legacyCreateTimeSeries(resourceLabels, legacyAcceleratorMemoryTotalMD, point) 509 ts.Metric.Labels = map[string]string{ 510 core.LabelAcceleratorMake.Key: metric.Labels[core.LabelAcceleratorMake.Key], 511 core.LabelAcceleratorModel.Key: metric.Labels[core.LabelAcceleratorModel.Key], 512 core.LabelAcceleratorID.Key: metric.Labels[core.LabelAcceleratorID.Key], 513 } 514 return ts 515 case core.MetricAcceleratorMemoryUsed.MetricDescriptor.Name: 516 point := sink.intPoint(timestamp, timestamp, metric.IntValue) 517 ts := legacyCreateTimeSeries(resourceLabels, 
legacyAcceleratorMemoryUsedMD, point) 518 ts.Metric.Labels = map[string]string{ 519 core.LabelAcceleratorMake.Key: metric.Labels[core.LabelAcceleratorMake.Key], 520 core.LabelAcceleratorModel.Key: metric.Labels[core.LabelAcceleratorModel.Key], 521 core.LabelAcceleratorID.Key: metric.Labels[core.LabelAcceleratorID.Key], 522 } 523 return ts 524 case core.MetricAcceleratorDutyCycle.MetricDescriptor.Name: 525 point := sink.intPoint(timestamp, timestamp, metric.IntValue) 526 ts := legacyCreateTimeSeries(resourceLabels, legacyAcceleratorDutyCycleMD, point) 527 ts.Metric.Labels = map[string]string{ 528 core.LabelAcceleratorMake.Key: metric.Labels[core.LabelAcceleratorMake.Key], 529 core.LabelAcceleratorModel.Key: metric.Labels[core.LabelAcceleratorModel.Key], 530 core.LabelAcceleratorID.Key: metric.Labels[core.LabelAcceleratorID.Key], 531 } 532 return ts 533 } 534 return nil 535 } 536 537 func (sink *StackdriverSink) LegacyTranslateMetric(timestamp time.Time, labels map[string]string, name string, value core.MetricValue, collectionStartTime time.Time) *monitoringpb.TimeSeries { 538 resourceLabels := sink.legacyGetResourceLabels(labels) 539 if !collectionStartTime.Before(timestamp) { 540 glog.V(4).Infof("Error translating metric %v for pod %v: batch timestamp %v earlier than pod create time %v", name, labels["pod_name"], timestamp, collectionStartTime) 541 return nil 542 } 543 switch name { 544 case core.MetricUptime.MetricDescriptor.Name: 545 doubleValue := float64(value.IntValue) / float64(time.Second/time.Millisecond) 546 point := sink.doublePoint(timestamp, collectionStartTime, doubleValue) 547 return legacyCreateTimeSeries(resourceLabels, legacyUptimeMD, point) 548 case core.MetricCpuLimit.MetricDescriptor.Name: 549 // converting from millicores to cores 550 point := sink.doublePoint(timestamp, timestamp, float64(value.IntValue)/1000) 551 return legacyCreateTimeSeries(resourceLabels, legacyCPUReservedCoresMD, point) 552 case core.MetricCpuUsage.MetricDescriptor.Name: 
553 point := sink.doublePoint(timestamp, collectionStartTime, float64(value.IntValue)/float64(time.Second/time.Nanosecond)) 554 return legacyCreateTimeSeries(resourceLabels, legacyCPUUsageTimeMD, point) 555 case core.MetricNetworkRx.MetricDescriptor.Name: 556 point := sink.intPoint(timestamp, collectionStartTime, value.IntValue) 557 return legacyCreateTimeSeries(resourceLabels, legacyNetworkRxMD, point) 558 case core.MetricNetworkTx.MetricDescriptor.Name: 559 point := sink.intPoint(timestamp, collectionStartTime, value.IntValue) 560 return legacyCreateTimeSeries(resourceLabels, legacyNetworkTxMD, point) 561 case core.MetricMemoryLimit.MetricDescriptor.Name: 562 // omit nodes, using memory/node_allocatable instead 563 if labels["type"] == core.MetricSetTypeNode { 564 return nil 565 } 566 point := sink.intPoint(timestamp, timestamp, value.IntValue) 567 return legacyCreateTimeSeries(resourceLabels, legacyMemoryLimitMD, point) 568 case core.MetricNodeMemoryAllocatable.MetricDescriptor.Name: 569 point := sink.intPoint(timestamp, timestamp, value.IntValue) 570 return legacyCreateTimeSeries(resourceLabels, legacyMemoryLimitMD, point) 571 case core.MetricMemoryMajorPageFaults.MetricDescriptor.Name: 572 point := sink.intPoint(timestamp, collectionStartTime, value.IntValue) 573 ts := legacyCreateTimeSeries(resourceLabels, legacyMemoryPageFaultsMD, point) 574 ts.Metric.Labels = map[string]string{ 575 "fault_type": "major", 576 } 577 return ts 578 case "memory/bytes_used": 579 point := sink.intPoint(timestamp, timestamp, value.IntValue) 580 ts := legacyCreateTimeSeries(resourceLabels, legacyMemoryBytesUsedMD, point) 581 ts.Metric.Labels = map[string]string{ 582 "memory_type": "evictable", 583 } 584 return ts 585 case "nvidia.com/gpu/request": 586 point := sink.intPoint(timestamp, timestamp, value.IntValue) 587 ts := legacyCreateTimeSeries(resourceLabels, legacyAcceleratorRequestMD, point) 588 ts.Metric.Labels = map[string]string{ 589 "resource_name": "nvidia.com/gpu", 590 } 
591 return ts 592 case core.MetricMemoryWorkingSet.MetricDescriptor.Name: 593 point := sink.intPoint(timestamp, timestamp, value.IntValue) 594 ts := legacyCreateTimeSeries(resourceLabels, legacyMemoryBytesUsedMD, point) 595 ts.Metric.Labels = map[string]string{ 596 "memory_type": "non-evictable", 597 } 598 return ts 599 case "memory/minor_page_faults": 600 point := sink.intPoint(timestamp, collectionStartTime, value.IntValue) 601 ts := legacyCreateTimeSeries(resourceLabels, legacyMemoryPageFaultsMD, point) 602 ts.Metric.Labels = map[string]string{ 603 "fault_type": "minor", 604 } 605 return ts 606 } 607 return nil 608 } 609 610 func (sink *StackdriverSink) TranslateLabeledMetric(timestamp time.Time, labels map[string]string, metric core.LabeledMetric, collectionStartTime time.Time) *monitoringpb.TimeSeries { 611 switch labels["type"] { 612 case core.MetricSetTypePod: 613 podLabels := sink.getPodResourceLabels(labels) 614 switch metric.Name { 615 case core.MetricFilesystemUsage.MetricDescriptor.Name: 616 point := sink.intPoint(timestamp, timestamp, metric.MetricValue.IntValue) 617 ts := createTimeSeries("k8s_pod", podLabels, volumeUsedBytesMD, point) 618 ts.Metric.Labels = map[string]string{ 619 core.LabelVolumeName.Key: strings.TrimPrefix(metric.Labels[core.LabelResourceID.Key], "Volume:"), 620 } 621 return ts 622 case core.MetricFilesystemLimit.MetricDescriptor.Name: 623 point := sink.intPoint(timestamp, timestamp, metric.MetricValue.IntValue) 624 ts := createTimeSeries("k8s_pod", podLabels, volumeTotalBytesMD, point) 625 ts.Metric.Labels = map[string]string{ 626 core.LabelVolumeName.Key: strings.TrimPrefix(metric.Labels[core.LabelResourceID.Key], "Volume:"), 627 } 628 return ts 629 } 630 case core.MetricSetTypePodContainer: 631 containerLabels := sink.getContainerResourceLabels(labels) 632 switch metric.Name { 633 case core.MetricAcceleratorMemoryTotal.MetricDescriptor.Name: 634 point := sink.intPoint(timestamp, timestamp, metric.IntValue) 635 ts := 
createTimeSeries("k8s_container", containerLabels, acceleratorMemoryTotalMD, point) 636 ts.Metric.Labels = map[string]string{ 637 core.LabelAcceleratorMake.Key: metric.Labels[core.LabelAcceleratorMake.Key], 638 core.LabelAcceleratorModel.Key: metric.Labels[core.LabelAcceleratorModel.Key], 639 core.LabelAcceleratorID.Key: metric.Labels[core.LabelAcceleratorID.Key], 640 } 641 return ts 642 case core.MetricAcceleratorMemoryUsed.MetricDescriptor.Name: 643 point := sink.intPoint(timestamp, timestamp, metric.IntValue) 644 ts := createTimeSeries("k8s_container", containerLabels, acceleratorMemoryUsedMD, point) 645 ts.Metric.Labels = map[string]string{ 646 core.LabelAcceleratorMake.Key: metric.Labels[core.LabelAcceleratorMake.Key], 647 core.LabelAcceleratorModel.Key: metric.Labels[core.LabelAcceleratorModel.Key], 648 core.LabelAcceleratorID.Key: metric.Labels[core.LabelAcceleratorID.Key], 649 } 650 return ts 651 case core.MetricAcceleratorDutyCycle.MetricDescriptor.Name: 652 point := sink.intPoint(timestamp, timestamp, metric.IntValue) 653 ts := createTimeSeries("k8s_container", containerLabels, acceleratorDutyCycleMD, point) 654 ts.Metric.Labels = map[string]string{ 655 core.LabelAcceleratorMake.Key: metric.Labels[core.LabelAcceleratorMake.Key], 656 core.LabelAcceleratorModel.Key: metric.Labels[core.LabelAcceleratorModel.Key], 657 core.LabelAcceleratorID.Key: metric.Labels[core.LabelAcceleratorID.Key], 658 } 659 return ts 660 } 661 } 662 return nil 663 } 664 665 func (sink *StackdriverSink) TranslateMetric(timestamp time.Time, labels map[string]string, name string, value core.MetricValue, collectionStartTime time.Time, entityCreateTime time.Time) *monitoringpb.TimeSeries { 666 if !collectionStartTime.Before(timestamp) { 667 glog.V(4).Infof("Error translating metric %v for pod %v: batch timestamp %v earlier than pod create time %v", name, labels["pod_name"], timestamp, collectionStartTime) 668 return nil 669 } 670 switch labels["type"] { 671 case 
core.MetricSetTypePodContainer: 672 containerLabels := sink.getContainerResourceLabels(labels) 673 switch name { 674 case core.MetricUptime.MetricDescriptor.Name: 675 doubleValue := float64(value.IntValue) / float64(time.Second/time.Millisecond) 676 point := sink.doublePoint(timestamp, timestamp, doubleValue) 677 return createTimeSeries("k8s_container", containerLabels, containerUptimeMD, point) 678 case core.MetricCpuLimit.MetricDescriptor.Name: 679 point := sink.doublePoint(timestamp, timestamp, float64(value.IntValue)/1000) 680 return createTimeSeries("k8s_container", containerLabels, cpuLimitCoresMD, point) 681 case core.MetricCpuRequest.MetricDescriptor.Name: 682 point := sink.doublePoint(timestamp, timestamp, float64(value.IntValue)/1000) 683 return createTimeSeries("k8s_container", containerLabels, cpuRequestedCoresMD, point) 684 case core.MetricCpuUsage.MetricDescriptor.Name: 685 point := sink.doublePoint(timestamp, collectionStartTime, float64(value.IntValue)/float64(time.Second/time.Nanosecond)) 686 return createTimeSeries("k8s_container", containerLabels, cpuContainerCoreUsageTimeMD, point) 687 case core.MetricMemoryLimit.MetricDescriptor.Name: 688 point := sink.intPoint(timestamp, timestamp, value.IntValue) 689 return createTimeSeries("k8s_container", containerLabels, memoryLimitBytesMD, point) 690 case "memory/bytes_used": 691 point := sink.intPoint(timestamp, timestamp, value.IntValue) 692 ts := createTimeSeries("k8s_container", containerLabels, memoryContainerUsedBytesMD, point) 693 ts.Metric.Labels = map[string]string{ 694 "memory_type": "evictable", 695 } 696 return ts 697 case "nvidia.com/gpu/request": 698 point := sink.intPoint(timestamp, timestamp, value.IntValue) 699 ts := createTimeSeries("k8s_container", containerLabels, acceleratorRequestedMD, point) 700 ts.Metric.Labels = map[string]string{ 701 "resource_name": "nvidia.com/gpu", 702 } 703 return ts 704 case core.MetricMemoryWorkingSet.MetricDescriptor.Name: 705 point := 
sink.intPoint(timestamp, timestamp, value.IntValue) 706 ts := createTimeSeries("k8s_container", containerLabels, memoryContainerUsedBytesMD, point) 707 ts.Metric.Labels = map[string]string{ 708 "memory_type": "non-evictable", 709 } 710 return ts 711 case core.MetricMemoryRequest.MetricDescriptor.Name: 712 point := sink.intPoint(timestamp, timestamp, value.IntValue) 713 return createTimeSeries("k8s_container", containerLabels, memoryRequestedBytesMD, point) 714 case core.MetricEphemeralStorageLimit.MetricDescriptor.Name: 715 point := sink.intPoint(timestamp, timestamp, value.IntValue) 716 return createTimeSeries("k8s_container", containerLabels, ephemeralstorageLimitBytesMD, point) 717 case core.MetricEphemeralStorageRequest.MetricDescriptor.Name: 718 point := sink.intPoint(timestamp, timestamp, value.IntValue) 719 return createTimeSeries("k8s_container", containerLabels, ephemeralstorageRequestedBytesMD, point) 720 case core.MetricEphemeralStorageUsage.MetricDescriptor.Name: 721 point := sink.intPoint(timestamp, timestamp, value.IntValue) 722 return createTimeSeries("k8s_container", containerLabels, ephemeralstorageContainerUsedBytesMD, point) 723 724 case core.MetricRestartCount.MetricDescriptor.Name: 725 if entityCreateTime.IsZero() { 726 glog.V(2).Infof("Skipping restart_count metric for container %s because entity create time is zero", core.PodContainerKey(containerLabels["namespace_name"], containerLabels["pod_name"], containerLabels["container_name"])) 727 return nil 728 } 729 point := sink.intPoint(timestamp, entityCreateTime, value.IntValue) 730 return createTimeSeries("k8s_container", containerLabels, restartCountMD, point) 731 } 732 case core.MetricSetTypePod: 733 podLabels := sink.getPodResourceLabels(labels) 734 switch name { 735 case core.MetricNetworkRx.MetricDescriptor.Name: 736 point := sink.intPoint(timestamp, collectionStartTime, value.IntValue) 737 return createTimeSeries("k8s_pod", podLabels, networkPodReceivedBytesMD, point) 738 case 
core.MetricNetworkTx.MetricDescriptor.Name: 739 point := sink.intPoint(timestamp, collectionStartTime, value.IntValue) 740 return createTimeSeries("k8s_pod", podLabels, networkPodSentBytesMD, point) 741 } 742 case core.MetricSetTypeNode: 743 nodeLabels := sink.getNodeResourceLabels(labels) 744 switch name { 745 case core.MetricNodeCpuCapacity.MetricDescriptor.Name: 746 point := sink.doublePoint(timestamp, timestamp, float64(value.FloatValue)/1000) 747 return createTimeSeries("k8s_node", nodeLabels, cpuTotalCoresMD, point) 748 case core.MetricNodeCpuAllocatable.MetricDescriptor.Name: 749 point := sink.doublePoint(timestamp, timestamp, float64(value.FloatValue)/1000) 750 return createTimeSeries("k8s_node", nodeLabels, cpuAllocatableCoresMD, point) 751 case core.MetricCpuUsage.MetricDescriptor.Name: 752 point := sink.doublePoint(timestamp, collectionStartTime, float64(value.IntValue)/float64(time.Second/time.Nanosecond)) 753 return createTimeSeries("k8s_node", nodeLabels, cpuNodeCoreUsageTimeMD, point) 754 case core.MetricNodeMemoryCapacity.MetricDescriptor.Name: 755 point := sink.intPoint(timestamp, timestamp, int64(value.FloatValue)) 756 return createTimeSeries("k8s_node", nodeLabels, memoryTotalBytesMD, point) 757 case core.MetricNodeMemoryAllocatable.MetricDescriptor.Name: 758 point := sink.intPoint(timestamp, timestamp, int64(value.FloatValue)) 759 return createTimeSeries("k8s_node", nodeLabels, memoryAllocatableBytesMD, point) 760 case "memory/bytes_used": 761 point := sink.intPoint(timestamp, timestamp, value.IntValue) 762 ts := createTimeSeries("k8s_node", nodeLabels, memoryNodeUsedBytesMD, point) 763 ts.Metric.Labels = map[string]string{ 764 "memory_type": "evictable", 765 } 766 return ts 767 case core.MetricMemoryWorkingSet.MetricDescriptor.Name: 768 point := sink.intPoint(timestamp, timestamp, value.IntValue) 769 ts := createTimeSeries("k8s_node", nodeLabels, memoryNodeUsedBytesMD, point) 770 ts.Metric.Labels = map[string]string{ 771 "memory_type": 
"non-evictable", 772 } 773 return ts 774 case core.MetricNetworkRx.MetricDescriptor.Name: 775 point := sink.intPoint(timestamp, collectionStartTime, value.IntValue) 776 return createTimeSeries("k8s_node", nodeLabels, networkNodeReceivedBytesMD, point) 777 case core.MetricNetworkTx.MetricDescriptor.Name: 778 point := sink.intPoint(timestamp, collectionStartTime, value.IntValue) 779 return createTimeSeries("k8s_node", nodeLabels, networkNodeSentBytesMD, point) 780 case core.MetricNodeEphemeralStorageCapacity.MetricDescriptor.Name: 781 point := sink.intPoint(timestamp, timestamp, value.IntValue) 782 return createTimeSeries("k8s_node", nodeLabels, ephemeralstorageTotalBytesMD, point) 783 784 } 785 case core.MetricSetTypeSystemContainer: 786 nodeLabels := sink.getNodeResourceLabels(labels) 787 switch name { 788 case "memory/bytes_used": 789 point := sink.intPoint(timestamp, timestamp, value.IntValue) 790 ts := createTimeSeries("k8s_node", nodeLabels, memoryNodeDaemonUsedBytesMD, point) 791 ts.Metric.Labels = map[string]string{ 792 "component": labels[core.LabelContainerName.Key], 793 "memory_type": "evictable", 794 } 795 return ts 796 case core.MetricMemoryWorkingSet.MetricDescriptor.Name: 797 point := sink.intPoint(timestamp, timestamp, value.IntValue) 798 ts := createTimeSeries("k8s_node", nodeLabels, memoryNodeDaemonUsedBytesMD, point) 799 ts.Metric.Labels = map[string]string{ 800 "component": labels[core.LabelContainerName.Key], 801 "memory_type": "non-evictable", 802 } 803 return ts 804 case core.MetricCpuUsage.MetricDescriptor.Name: 805 point := sink.doublePoint(timestamp, collectionStartTime, float64(value.IntValue)/float64(time.Second/time.Nanosecond)) 806 ts := createTimeSeries("k8s_node", nodeLabels, cpuNodeDaemonCoreUsageTimeMD, point) 807 ts.Metric.Labels = map[string]string{ 808 "component": labels[core.LabelContainerName.Key], 809 } 810 return ts 811 } 812 } 813 return nil 814 } 815 816 func (sink *StackdriverSink) legacyGetResourceLabels(labels 
map[string]string) map[string]string { 817 return map[string]string{ 818 "project_id": sink.project, 819 "cluster_name": sink.clusterName, 820 "zone": sink.heapsterZone, 821 "instance_id": labels[core.LabelHostID.Key], 822 "namespace_id": labels[core.LabelPodNamespaceUID.Key], 823 "pod_id": labels[core.LabelPodId.Key], 824 "container_name": labels[core.LabelContainerName.Key], 825 } 826 } 827 828 func (sink *StackdriverSink) getContainerResourceLabels(labels map[string]string) map[string]string { 829 return map[string]string{ 830 "project_id": sink.project, 831 "location": sink.clusterLocation, 832 "cluster_name": sink.clusterName, 833 "namespace_name": labels[core.LabelNamespaceName.Key], 834 "pod_name": labels[core.LabelPodName.Key], 835 "container_name": labels[core.LabelContainerName.Key], 836 } 837 } 838 839 func (sink *StackdriverSink) getPodResourceLabels(labels map[string]string) map[string]string { 840 return map[string]string{ 841 "project_id": sink.project, 842 "location": sink.clusterLocation, 843 "cluster_name": sink.clusterName, 844 "namespace_name": labels[core.LabelNamespaceName.Key], 845 "pod_name": labels[core.LabelPodName.Key], 846 } 847 } 848 849 func (sink *StackdriverSink) getNodeResourceLabels(labels map[string]string) map[string]string { 850 return map[string]string{ 851 "project_id": sink.project, 852 "location": sink.clusterLocation, 853 "cluster_name": sink.clusterName, 854 "node_name": labels[core.LabelNodename.Key], 855 } 856 } 857 858 func legacyCreateTimeSeries(resourceLabels map[string]string, metadata *metricMetadata, point *monitoringpb.Point) *monitoringpb.TimeSeries { 859 return createTimeSeries("gke_container", resourceLabels, metadata, point) 860 } 861 862 func createTimeSeries(resource string, resourceLabels map[string]string, metadata *metricMetadata, point *monitoringpb.Point) *monitoringpb.TimeSeries { 863 return &monitoringpb.TimeSeries{ 864 Metric: &metric.Metric{ 865 Type: metadata.Name, 866 }, 867 MetricKind: 
metadata.MetricKind, 868 ValueType: metadata.ValueType, 869 Resource: &monitoredres.MonitoredResource{ 870 Labels: resourceLabels, 871 Type: resource, 872 }, 873 Points: []*monitoringpb.Point{point}, 874 } 875 } 876 877 func (sink *StackdriverSink) doublePoint(endTime time.Time, startTime time.Time, value float64) *monitoringpb.Point { 878 return &monitoringpb.Point{ 879 Interval: &monitoringpb.TimeInterval{ 880 EndTime: &google_proto.Timestamp{Seconds: endTime.Unix(), Nanos: int32(endTime.Nanosecond())}, 881 StartTime: &google_proto.Timestamp{Seconds: startTime.Unix(), Nanos: int32(startTime.Nanosecond())}, 882 }, 883 Value: &monitoringpb.TypedValue{ 884 Value: &monitoringpb.TypedValue_DoubleValue{ 885 DoubleValue: value, 886 }, 887 }, 888 } 889 890 } 891 892 func (sink *StackdriverSink) intPoint(endTime time.Time, startTime time.Time, value int64) *monitoringpb.Point { 893 return &monitoringpb.Point{ 894 Interval: &monitoringpb.TimeInterval{ 895 EndTime: &google_proto.Timestamp{Seconds: endTime.Unix(), Nanos: int32(endTime.Nanosecond())}, 896 StartTime: &google_proto.Timestamp{Seconds: startTime.Unix(), Nanos: int32(startTime.Nanosecond())}, 897 }, 898 Value: &monitoringpb.TypedValue{ 899 Value: &monitoringpb.TypedValue_Int64Value{ 900 Int64Value: value, 901 }, 902 }, 903 } 904 } 905 906 func fullProjectName(name string) string { 907 return fmt.Sprintf("projects/%s", name) 908 } 909 910 func getReq(project string) *monitoringpb.CreateTimeSeriesRequest { 911 return &monitoringpb.CreateTimeSeriesRequest{ 912 TimeSeries: nil, 913 Name: fullProjectName(project), 914 } 915 }