// Copyright 2017-2021 The Cloudprober Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/*
Package stackdriver implements the Stackdriver version of the Surfacer
object. This package allows users to create an initialized Stack Driver
Surfacer and use it to write custom metrics data.
*/
package stackdriver

import (
	"context"
	"fmt"
	"math/rand"
	"regexp"
	"strings"
	"time"

	"cloud.google.com/go/compute/metadata"
	"github.com/google/cloudprober/logger"
	"golang.org/x/oauth2/google"
	monitoring "google.golang.org/api/monitoring/v3"

	"github.com/google/cloudprober/metrics"
	"github.com/google/cloudprober/surfacers/common/options"
	configpb "github.com/google/cloudprober/surfacers/stackdriver/proto"
)

// batchSize is the maximum number of time series we put in a single
// CreateTimeSeries request; writeBatch splits larger caches into multiple
// API calls of at most this many entries.
const batchSize = 200

//-----------------------------------------------------------------------------
// Stack Driver Surfacer Specific Code
//-----------------------------------------------------------------------------

// SDSurfacer is the Stackdriver implementation of the surfacer. It holds an
// authenticated client for making Stackdriver API calls and a registry
// (knownMetrics) that keeps track of which metric descriptors have already
// been created.
type SDSurfacer struct {
	c    *configpb.SurfacerConf
	opts *options.Options

	// Compiled from the deprecated allowed_metrics_regex config option; nil
	// when that option is unset. When non-nil, only matching metric names
	// are exported (see ignoreMetric).
	allowedMetricsRegex *regexp.Regexp

	// Internal cache for saving metric data until a batch is sent. Keyed by
	// "<metricName>,<labelsKey>" so distinct label sets don't overwrite each
	// other (see recordTimeSeries).
	cache        map[string]*monitoring.TimeSeries
	knownMetrics map[string]bool

	// Channel for writing the data without blocking; Write drops data if
	// this channel is full.
	writeChan chan *metrics.EventMetrics

	// VM Information, populated only when running on GCE.
	onGCE       bool
	projectName string
	resource    *monitoring.MonitoredResource

	// Time when stackdriver module was initialized. This is used as start time
	// for cumulative metrics.
	startTime time.Time

	// Cloud logger
	l *logger.Logger
	// Number of failed TimeSeries Create calls.
	failCnt int64

	// Monitoring client
	client *monitoring.Service
}

// New initializes a SDSurfacer for Stack Driver with all its necessary internal
// variables for call references (project and instances variables) as well
// as provisioning it with clients for making the necessary API calls. New
// requires you to pass in a valid stackdriver surfacer configuration. It
// also starts the background goroutine (writeBatch) that drains the write
// channel; that goroutine stops when ctx is canceled.
func New(ctx context.Context, config *configpb.SurfacerConf, opts *options.Options, l *logger.Logger) (*SDSurfacer, error) {
	// Create a cache, which is used for batching write requests together,
	// and a channel for writing data.
	s := SDSurfacer{
		cache:        make(map[string]*monitoring.TimeSeries),
		knownMetrics: make(map[string]bool),
		writeChan:    make(chan *metrics.EventMetrics, config.GetMetricsBufferSize()),
		c:            config,
		opts:         opts,
		projectName:  config.GetProject(),
		startTime:    time.Now(),
		l:            l,
	}

	if s.c.GetAllowedMetricsRegex() != "" {
		l.Warning("allowed_metrics_regex is now deprecated. Please use the common surfacer options: allow_metrics, ignore_metrics.")
		r, err := regexp.Compile(s.c.GetAllowedMetricsRegex())
		if err != nil {
			return nil, err
		}
		s.allowedMetricsRegex = r
	}

	// Find all the necessary information for writing metrics to Stack
	// Driver.
	var err error

	if metadata.OnGCE() {
		s.onGCE = true

		// Project from the config takes precedence; fall back to the GCE
		// metadata server only when it's unset.
		if s.projectName == "" {
			if s.projectName, err = metadata.ProjectID(); err != nil {
				return nil, fmt.Errorf("unable to retrieve project name: %v", err)
			}
		}

		mr, err := monitoredResourceOnGCE(s.projectName)
		if err != nil {
			return nil, fmt.Errorf("error initializing monitored resource for stackdriver on GCE: %v", err)
		}

		s.resource = mr

	}

	// Create monitoring client.
	// NOTE(review): monitoring.New is deprecated in newer versions of the
	// API client in favor of monitoring.NewService(ctx, ...) — confirm
	// before upgrading the google.golang.org/api dependency.
	httpClient, err := google.DefaultClient(ctx, monitoring.CloudPlatformScope)
	if err != nil {
		return nil, err
	}
	s.client, err = monitoring.New(httpClient)
	if err != nil {
		return nil, err
	}

	// Start either the writeAsync or the writeBatch, depending on if we are
	// batching or not.
	go s.writeBatch(ctx)

	s.l.Info("Created a new stackdriver surfacer")
	return &s, nil
}

// Write queues a message to be written to stackdriver. It never blocks: if
// the internal buffer (sized by metrics_buffer_size) is full, the new
// EventMetrics is dropped with an error log.
func (s *SDSurfacer) Write(_ context.Context, em *metrics.EventMetrics) {
	// Write inserts the data to be written into channel. This channel is
	// watched by writeBatch and will make the necessary calls to the Stackdriver
	// API to write the data from the channel.
	select {
	case s.writeChan <- em:
	default:
		s.l.Errorf("SDSurfacer's write channel is full, dropping new data.")
	}
}
// createMetricDescriptor creates a metric descriptor for the given timeseries.
// We create metric descriptors explicitly, instead of relying on auto-
// creation by creating timeseries, because auto-creation doesn't add units to
// the metric.
func (s *SDSurfacer) createMetricDescriptor(ts *monitoring.TimeSeries) error {
	// All cloudprober labels carry string values.
	var labels []*monitoring.LabelDescriptor
	for k := range ts.Metric.Labels {
		labels = append(labels, &monitoring.LabelDescriptor{
			Key:       k,
			ValueType: "STRING",
		})
	}

	_, err := s.client.Projects.MetricDescriptors.Create("projects/"+s.projectName, &monitoring.MetricDescriptor{
		Name:       "projects/" + s.projectName + "/metricDescriptors/" + ts.Metric.Type,
		Type:       ts.Metric.Type,
		MetricKind: ts.MetricKind,
		Labels:     labels,
		Unit:       ts.Unit,
		ValueType:  ts.ValueType,
	}).Do()

	return err
}

// writeBatch polls the writeChan and the sendChan waiting for either a new
// write packet or a new context. If data comes in on the writeChan, then
// the data is pulled off and put into the cache (if there is already an
// entry into the cache for the same metric, it updates the metric to the
// new data). If ticker fires, then the metrics in the cache
// are batched together. The Stackdriver API has a limit on the maximum number
// of metrics that can be sent in a single request, so we may have to make
// multiple requests to the Stackdriver API to send the full cache of metrics.
//
// writeBatch is set up to run as an infinite goroutine call in the New function
// to allow it to write asynchronously to Stack Driver. It exits when ctx is
// canceled.
func (s *SDSurfacer) writeBatch(ctx context.Context) {
	// Introduce a random delay (in [0, batch_timer_sec)) before starting the
	// loop, so multiple cloudprober instances don't all flush at the same
	// instant.
	rand.Seed(time.Now().UnixNano())
	randomDelay := time.Duration(rand.Int63n(int64(s.c.GetBatchTimerSec()))) * time.Second
	time.Sleep(randomDelay)

	batchTicker := time.NewTicker(time.Duration(s.c.GetBatchTimerSec()) * time.Second)
	for {
		select {
		case <-ctx.Done():
			s.l.Infof("Context canceled, stopping the input processing loop.")
			batchTicker.Stop()
			return
		case em := <-s.writeChan:
			// Process EventMetrics to build timeseries using them and cache the timeseries
			// objects.
			s.recordEventMetrics(em)
		case <-batchTicker.C:
			// Empty time series writes cause an error to be returned, so
			// we skip any calls that write but wouldn't set any data.
			// NOTE: this break exits only the select, not the for loop —
			// i.e. it just skips this flush cycle.
			if len(s.cache) == 0 {
				break
			}

			var ts []*monitoring.TimeSeries
			for _, v := range s.cache {
				// Create the metric descriptor once per metric type (only
				// for metrics that carry a unit) before writing data, so the
				// unit gets attached; metrics without a unit rely on
				// descriptor auto-creation by the timeseries write below.
				if !s.knownMetrics[v.Metric.Type] && v.Unit != "" {
					if err := s.createMetricDescriptor(v); err != nil {
						// On failure this timeseries is skipped for the
						// current cycle (and its cached value is lost when
						// the cache is flushed below); descriptor creation
						// is retried on the next batch containing it.
						s.l.Warningf("Error creating metric descriptor for: %s, err: %v", v.Metric.Type, err)
						continue
					}
					s.knownMetrics[v.Metric.Type] = true
				}
				ts = append(ts, v)
			}

			// We batch the time series into appropriately-sized sets
			// and write them
			for i := 0; i < len(ts); i += batchSize {
				endIndex := min(len(ts), i+batchSize)

				s.l.Infof("Sending entries %d through %d of %d", i, endIndex, len(ts))

				// Now that we've created the new metric, we can write the data. Making
				// a time series create call will automatically register a new metric
				// with the correct information if it does not already exist.
				// Ref: https://cloud.google.com/monitoring/custom-metrics/creating-metrics#auto-creation
				requestBody := monitoring.CreateTimeSeriesRequest{
					TimeSeries: ts[i:endIndex],
				}
				if _, err := s.client.Projects.TimeSeries.Create("projects/"+s.projectName, &requestBody).Do(); err != nil {
					s.failCnt++
					s.l.Warningf("Unable to fulfill TimeSeries Create call. Err: %v", err)
				}
			}

			// Flush the cache after we've finished writing so we don't accidentally
			// re-write metric values that haven't been written over several write
			// cycles.
			for k := range s.cache {
				delete(s.cache, k)
			}
		}
	}

}

//-----------------------------------------------------------------------------
// StackDriver Object Creation and Helper Functions
//-----------------------------------------------------------------------------

// recordTimeSeries forms a timeseries object from the given arguments, records
// it in the cache if batch processing is enabled, and returns it.
//
// More information on the object and specific fields can be found here:
// https://cloud.google.com/monitoring/api/ref_v3/rest/v3/TimeSeries
func (s *SDSurfacer) recordTimeSeries(metricKind, metricName, msgType string, labels map[string]string, timestamp time.Time, tv *monitoring.TypedValue, unit, cacheKey string) *monitoring.TimeSeries {
	// Cumulative metrics use the surfacer's start time as the interval start;
	// gauges use a point-in-time interval (start == end).
	startTime := s.startTime.Format(time.RFC3339Nano)
	if metricKind == "GAUGE" {
		startTime = timestamp.Format(time.RFC3339Nano)
	}

	ts := &monitoring.TimeSeries{
		// The URL address for our custom metric, must match the
		// name we used in the MetricDescriptor.
		Metric: &monitoring.Metric{
			Type:   s.c.GetMonitoringUrl() + metricName,
			Labels: labels,
		},

		// Must match the MetricKind and ValueType of the MetricDescriptor.
		MetricKind: metricKind,
		ValueType:  msgType,
		Unit:       unit,

		// Create a single data point, this could be utilized to create
		// a batch of points instead of a single point if the write
		// rate is too high.
		Points: []*monitoring.Point{
			{
				Interval: &monitoring.TimeInterval{
					StartTime: startTime,
					EndTime:   timestamp.Format(time.RFC3339Nano),
				},
				Value: tv,
			},
		},
	}

	if s.resource != nil {
		ts.Resource = s.resource
	}

	// We create a key that is a composite of both the name and the
	// labels so we can make sure that the cache holds all distinct
	// values and not just the ones with different names.
	s.cache[metricName+","+cacheKey] = ts

	return ts

}

// sdKind converts EventMetrics kind to StackDriver kind string. It returns
// "" for kinds other than GAUGE and CUMULATIVE.
func (s *SDSurfacer) sdKind(kind metrics.Kind) string {
	switch kind {
	case metrics.GAUGE:
		return "GAUGE"
	case metrics.CUMULATIVE:
		return "CUMULATIVE"
	default:
		return ""
	}
}
334 func processLabels(em *metrics.EventMetrics) (labels map[string]string, labelsKey, metricPrefix string) { 335 labels = make(map[string]string) 336 var sortedLabels []string // we use this for cache key below 337 var ptype, probe string 338 for _, k := range em.LabelsKeys() { 339 if k == "ptype" { 340 ptype = em.Label(k) 341 continue 342 } 343 if k == "probe" { 344 probe = em.Label(k) 345 continue 346 } 347 labels[k] = em.Label(k) 348 sortedLabels = append(sortedLabels, k+"="+labels[k]) 349 } 350 labelsKey = strings.Join(sortedLabels, ",") 351 352 if ptype != "" { 353 metricPrefix += ptype + "/" 354 } 355 if probe != "" { 356 metricPrefix += probe + "/" 357 } 358 return 359 } 360 361 func (s *SDSurfacer) ignoreMetric(name string) bool { 362 if s.allowedMetricsRegex != nil { 363 if !s.allowedMetricsRegex.MatchString(name) { 364 return true 365 } 366 } 367 368 if !validMetricLength(name, s.c.GetMonitoringUrl()) { 369 s.l.Warningf("Message name %q is greater than the 100 character limit, skipping write", name) 370 return true 371 } 372 373 return false 374 } 375 376 // recordEventMetrics processes the incoming EventMetrics objects and builds 377 // TimeSeries from it. 378 // 379 // Since stackdriver doesn't support metrics.String and metrics.Map value types, 380 // it converts them to a numerical types (stackdriver type Double) with 381 // additional labels. See the inline comments for this conversion is done. 382 func (s *SDSurfacer) recordEventMetrics(em *metrics.EventMetrics) (ts []*monitoring.TimeSeries) { 383 metricKind := s.sdKind(em.Kind) 384 if metricKind == "" { 385 s.l.Warningf("Unknown event metrics type (not CUMULATIVE or GAUGE): %v", em.Kind) 386 return 387 } 388 389 emLabels, cacheKey, metricPrefix := processLabels(em) 390 391 for _, k := range em.MetricsKeys() { 392 if !s.opts.AllowMetric(k) { 393 continue 394 } 395 396 // Create a copy of emLabels for use in timeseries object. 
397 mLabels := make(map[string]string) 398 for k, v := range emLabels { 399 mLabels[k] = v 400 } 401 name := metricPrefix + k 402 403 if s.ignoreMetric(name) { 404 continue 405 } 406 407 // Create the correct TimeSeries object based on the incoming data 408 val := em.Metric(k) 409 410 unit := "1" // "1" is the default unit for numbers. 411 if k == "latency" { 412 unit = map[time.Duration]string{ 413 time.Second: "s", 414 time.Millisecond: "ms", 415 time.Microsecond: "us", 416 time.Nanosecond: "ns", 417 }[em.LatencyUnit] 418 } 419 420 // If metric value is of type numerical value. 421 if v, ok := val.(metrics.NumValue); ok { 422 f := float64(v.Int64()) 423 ts = append(ts, s.recordTimeSeries(metricKind, name, "DOUBLE", mLabels, em.Timestamp, &monitoring.TypedValue{DoubleValue: &f}, unit, cacheKey)) 424 continue 425 } 426 427 // If metric value is of type String. 428 if v, ok := val.(metrics.String); ok { 429 // Since StackDriver doesn't support string value type for custom metrics, 430 // we convert string metrics into a numeric metric with an additional label 431 // val="string-val". 432 // 433 // metrics.String stringer wraps string values in a single "". Remove those 434 // for stackdriver. 435 mLabels["val"] = strings.Trim(v.String(), "\"") 436 f := float64(1) 437 ts = append(ts, s.recordTimeSeries(metricKind, name, "DOUBLE", mLabels, em.Timestamp, &monitoring.TypedValue{DoubleValue: &f}, unit, cacheKey)) 438 continue 439 } 440 441 // If metric value is of type Map. 442 if mapValue, ok := val.(*metrics.Map); ok { 443 // Since StackDriver doesn't support Map value type, we convert Map values 444 // to multiple timeseries with map's KeyName and key as labels. 
445 for _, mapKey := range mapValue.Keys() { 446 mmLabels := make(map[string]string) 447 for lk, lv := range mLabels { 448 mmLabels[lk] = lv 449 } 450 mmLabels[mapValue.MapName] = mapKey 451 f := float64(mapValue.GetKey(mapKey).Int64()) 452 ts = append(ts, s.recordTimeSeries(metricKind, name, "DOUBLE", mmLabels, em.Timestamp, &monitoring.TypedValue{DoubleValue: &f}, unit, cacheKey)) 453 } 454 continue 455 } 456 457 // If metric value is of type Distribution. 458 if distValue, ok := val.(*metrics.Distribution); ok { 459 ts = append(ts, s.recordTimeSeries(metricKind, name, "DISTRIBUTION", mLabels, em.Timestamp, distValue.StackdriverTypedValue(), unit, cacheKey)) 460 continue 461 } 462 463 // We'll reach here only if encounter an unsupported value type. 464 s.l.Warningf("Unsupported value type: %v", val) 465 } 466 return ts 467 } 468 469 //----------------------------------------------------------------------------- 470 // Non-stackdriver Helper Functions 471 //----------------------------------------------------------------------------- 472 473 // checkMetricLength checks if the combination of the metricName and the url 474 // prefix are longer than 100 characters, which is illegal in a Stackdriver 475 // call. Stack Driver doesn't allow custom metrics with more than 100 character 476 // names, so we have a check to see if we are going over the limit. 477 // Ref: https://cloud.google.com/monitoring/api/v3/metrics#metric_names 478 func validMetricLength(metricName string, monitoringURL string) bool { 479 if len(metricName)+len(monitoringURL) > 100 { 480 return false 481 } 482 return true 483 } 484 485 // Function to return the min of two integers 486 func min(a, b int) int { 487 if a < b { 488 return a 489 } 490 return b 491 }