github.com/rudderlabs/rudder-go-kit@v0.30.0/stats/otel.go (about) 1 package stats 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "net/http" 8 "runtime" 9 "strings" 10 "sync" 11 "time" 12 13 "github.com/prometheus/client_golang/prometheus" 14 "github.com/prometheus/client_golang/prometheus/promhttp" 15 "github.com/spf13/cast" 16 "go.opentelemetry.io/otel/attribute" 17 "go.opentelemetry.io/otel/metric" 18 noopMetric "go.opentelemetry.io/otel/metric/noop" 19 "go.opentelemetry.io/otel/propagation" 20 semconv "go.opentelemetry.io/otel/semconv/v1.24.0" 21 "go.opentelemetry.io/otel/trace" 22 23 "github.com/rudderlabs/rudder-go-kit/logger" 24 "github.com/rudderlabs/rudder-go-kit/stats/internal/otel" 25 ) 26 27 const ( 28 defaultMeterName = "" 29 ) 30 31 // otelStats is an OTel-specific adapter that follows the Stats contract 32 type otelStats struct { 33 config statsConfig 34 otelConfig otelStatsConfig 35 resourceAttrs map[string]struct{} 36 37 tracerProvider trace.TracerProvider 38 traceBaseAttributes []attribute.KeyValue 39 tracerMap map[string]Tracer 40 tracerMapMu sync.Mutex 41 42 meter metric.Meter 43 noopMeter metric.Meter 44 counters map[string]metric.Int64Counter 45 countersMu sync.Mutex 46 gauges map[string]*otelGauge 47 gaugesMu sync.Mutex 48 timers map[string]metric.Float64Histogram 49 timersMu sync.Mutex 50 histograms map[string]metric.Float64Histogram 51 histogramsMu sync.Mutex 52 53 otelManager otel.Manager 54 runtimeStatsCollector runtimeStatsCollector 55 metricsStatsCollector metricStatsCollector 56 stopBackgroundCollection func() 57 logger logger.Logger 58 59 httpServer *http.Server 60 httpServerShutdownComplete chan struct{} 61 prometheusRegisterer prometheus.Registerer 62 prometheusGatherer prometheus.Gatherer 63 } 64 65 func (s *otelStats) Start(ctx context.Context, goFactory GoRoutineFactory) error { 66 if !s.config.enabled.Load() { 67 return nil 68 } 69 70 // Starting OpenTelemetry setup 71 var attrs []attribute.KeyValue 72 s.resourceAttrs = make(map[string]struct{}) 73 if s.config.instanceName != "" { 74 sanitized := sanitizeTagKey("instanceName") 75 attrs = append(attrs, attribute.String(sanitized, s.config.instanceName)) 76 s.resourceAttrs[sanitized] = struct{}{} 77 } 78 if s.config.namespaceIdentifier != "" { 79 sanitized := sanitizeTagKey("namespace") 80 attrs = append(attrs, attribute.String(sanitized, s.config.namespaceIdentifier)) 81 s.resourceAttrs[sanitized] = struct{}{} 82 } 83 res, err := otel.NewResource(s.config.serviceName, s.config.serviceVersion, attrs...) 84 if err != nil { 85 return fmt.Errorf("failed to create open telemetry resource: %w", err) 86 } 87 88 options := []otel.Option{otel.WithInsecure(), otel.WithLogger(s.logger)} 89 if s.otelConfig.tracesEndpoint != "" { 90 s.traceBaseAttributes = attrs 91 tpOpts := []otel.TracerProviderOption{ 92 otel.WithTracingSamplingRate(s.otelConfig.tracingSamplingRate), 93 } 94 if s.otelConfig.withTracingSyncer { 95 tpOpts = append(tpOpts, otel.WithTracingSyncer()) 96 } 97 if s.otelConfig.withZipkin { 98 tpOpts = append(tpOpts, otel.WithZipkin()) 99 } 100 options = append(options, 101 otel.WithTracerProvider(s.otelConfig.tracesEndpoint, tpOpts...), 102 otel.WithTextMapPropagator( 103 propagation.NewCompositeTextMapPropagator(propagation.TraceContext{}, propagation.Baggage{}), 104 ), 105 ) 106 } 107 108 meterProviderOptions := []otel.MeterProviderOption{ 109 otel.WithMeterProviderExportsInterval(s.otelConfig.metricsExportInterval), 110 } 111 if len(s.config.defaultHistogramBuckets) > 0 { 112 meterProviderOptions = append(meterProviderOptions, 113 otel.WithDefaultHistogramBucketBoundaries(s.config.defaultHistogramBuckets), 114 ) 115 } 116 if len(s.config.histogramBuckets) > 0 { 117 for histogramName, buckets := range s.config.histogramBuckets { 118 meterProviderOptions = append(meterProviderOptions, 119 otel.WithHistogramBucketBoundaries(histogramName, defaultMeterName, buckets), 120 ) 121 } 122 } 123 if s.otelConfig.metricsEndpoint != "" { 124 options = append(options, otel.WithMeterProvider(append(meterProviderOptions, 125 otel.WithGRPCMeterProvider(s.otelConfig.metricsEndpoint), 126 )...)) 127 } else if s.otelConfig.enablePrometheusExporter { 128 options = append(options, otel.WithMeterProvider(append(meterProviderOptions, 129 otel.WithPrometheusExporter(s.prometheusRegisterer), 130 )...)) 131 } 132 133 tp, mp, err := s.otelManager.Setup(ctx, res, options...) 134 if err != nil { 135 return fmt.Errorf("failed to setup open telemetry: %w", err) 136 } 137 138 if tp != nil { 139 s.tracerProvider = tp 140 } 141 142 s.noopMeter = noopMetric.NewMeterProvider().Meter(defaultMeterName) 143 if mp != nil { 144 s.meter = mp.Meter(defaultMeterName) 145 } else { 146 s.meter = s.noopMeter 147 } 148 149 if s.otelConfig.enablePrometheusExporter && s.otelConfig.prometheusMetricsPort > 0 { 150 s.httpServerShutdownComplete = make(chan struct{}) 151 s.httpServer = &http.Server{ 152 Addr: fmt.Sprintf(":%d", s.otelConfig.prometheusMetricsPort), 153 Handler: promhttp.InstrumentMetricHandler( 154 s.prometheusRegisterer, promhttp.HandlerFor(s.prometheusGatherer, promhttp.HandlerOpts{ 155 ErrorLog: &prometheusLogger{l: s.logger}, 156 }), 157 ), 158 } 159 goFactory.Go(func() { 160 defer close(s.httpServerShutdownComplete) 161 if err := s.httpServer.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) { 162 s.logger.Fatalf("Prometheus exporter failed: %v", err) 163 } 164 }) 165 } 166 167 // Starting background collection 168 var backgroundCollectionCtx context.Context 169 backgroundCollectionCtx, s.stopBackgroundCollection = context.WithCancel(context.Background()) 170 171 gaugeFunc := func(key string, val uint64) { 172 s.getMeasurement("runtime_"+key, GaugeType, nil).Gauge(val) 173 } 174 s.metricsStatsCollector = newMetricStatsCollector(s, s.config.periodicStatsConfig.metricManager) 175 goFactory.Go(func() { 176 s.metricsStatsCollector.run(backgroundCollectionCtx) 177 }) 178 179 if s.config.periodicStatsConfig.enabled { 180 s.runtimeStatsCollector = newRuntimeStatsCollector(gaugeFunc) 181 s.runtimeStatsCollector.PauseDur = time.Duration(s.config.periodicStatsConfig.statsCollectionInterval) * time.Second 182 s.runtimeStatsCollector.EnableCPU = s.config.periodicStatsConfig.enableCPUStats 183 s.runtimeStatsCollector.EnableMem = s.config.periodicStatsConfig.enableMemStats 184 s.runtimeStatsCollector.EnableGC = s.config.periodicStatsConfig.enableGCStats 185 goFactory.Go(func() { 186 s.runtimeStatsCollector.run(backgroundCollectionCtx) 187 }) 188 } 189 190 if s.otelConfig.enablePrometheusExporter { 191 s.logger.Infof("Stats started in Prometheus mode on :%d", s.otelConfig.prometheusMetricsPort) 192 } else { 193 s.logger.Infof("Stats started in OpenTelemetry mode with metrics endpoint %q and traces endpoint %q", 194 s.otelConfig.metricsEndpoint, s.otelConfig.tracesEndpoint, 195 ) 196 } 197 198 return nil 199 } 200 201 func (s *otelStats) Stop() { 202 if !s.config.enabled.Load() { 203 return 204 } 205 206 ctx, cancel := context.WithTimeout(context.TODO(), 10*time.Second) 207 defer cancel() 208 209 if err := s.otelManager.Shutdown(ctx); err != nil { 210 s.logger.Errorf("failed to shutdown open telemetry: %v", err) 211 } 212 213 s.stopBackgroundCollection() 214 if s.metricsStatsCollector.done != nil { 215 <-s.metricsStatsCollector.done 216 } 217 if s.config.periodicStatsConfig.enabled && s.runtimeStatsCollector.done != nil { 218 <-s.runtimeStatsCollector.done 219 } 220 221 if s.httpServer != nil && s.httpServerShutdownComplete != nil { 222 if err := s.httpServer.Shutdown(ctx); err != nil { 223 s.logger.Errorf("failed to shutdown prometheus exporter: %v", err) 224 } 225 <-s.httpServerShutdownComplete 226 } 227 } 228 229 // NewTracer allows you to create a tracer for creating spans 230 func (s *otelStats) NewTracer(name string) Tracer { 231 s.tracerMapMu.Lock() 232 defer s.tracerMapMu.Unlock() 233 234 if s.tracerMap == nil { 235 s.tracerMap = make(map[string]Tracer) 236 } else if t, ok := s.tracerMap[name]; ok { 237 return t 238 } 239 240 var attrs []attribute.KeyValue 241 if len(s.traceBaseAttributes) > 0 { 242 attrs = append(attrs, s.traceBaseAttributes...) 243 } 244 if s.config.serviceName != "" { 245 attrs = append(attrs, semconv.ServiceNameKey.String(s.config.serviceName)) 246 } 247 248 opts := []trace.TracerOption{ 249 trace.WithInstrumentationVersion(s.config.serviceVersion), 250 } 251 if len(attrs) > 0 { 252 opts = append(opts, trace.WithInstrumentationAttributes(attrs...)) 253 } 254 255 s.tracerMap[name] = &tracer{ 256 tracer: s.tracerProvider.Tracer(name, opts...), 257 } 258 return s.tracerMap[name] 259 } 260 261 // NewStat creates a new Measurement with provided Name and Type 262 func (s *otelStats) NewStat(name, statType string) (m Measurement) { 263 return s.getMeasurement(name, statType, nil) 264 } 265 266 // NewTaggedStat creates a new Measurement with provided Name, Type and Tags 267 func (s *otelStats) NewTaggedStat(name, statType string, tags Tags) (m Measurement) { 268 return s.getMeasurement(name, statType, tags) 269 } 270 271 // NewSampledTaggedStat creates a new Measurement with provided Name, Type and Tags 272 // Deprecated: use NewTaggedStat instead 273 func (s *otelStats) NewSampledTaggedStat(name, statType string, tags Tags) (m Measurement) { 274 return s.NewTaggedStat(name, statType, tags) 275 } 276 277 func (*otelStats) getNoOpMeasurement(statType string) Measurement { 278 om := &otelMeasurement{ 279 genericMeasurement: genericMeasurement{statType: statType}, 280 disabled: true, 281 } 282 switch statType { 283 case CountType: 284 return &otelCounter{otelMeasurement: om} 285 case GaugeType: 286 return &otelGauge{otelMeasurement: om} 287 case TimerType: 288 return &otelTimer{otelMeasurement: om} 289 case HistogramType: 290 return &otelHistogram{otelMeasurement: om} 291 } 292 panic(fmt.Errorf("unsupported measurement type %s", statType)) 293 } 294 295 func (s *otelStats) getMeasurement(name, statType string, tags Tags) Measurement { 296 if !s.config.enabled.Load() { 297 return s.getNoOpMeasurement(statType) 298 } 299 300 if strings.Trim(name, " ") == "" { 301 byteArr := make([]byte, 2048) 302 n := runtime.Stack(byteArr, false) 303 stackTrace := string(byteArr[:n]) 304 s.logger.Warnf("detected missing stat measurement name, using 'novalue':\n%v", stackTrace) 305 name = "novalue" 306 } 307 308 // Clean up tags based on deployment type. No need to send workspace id tag for free tier customers. 309 newTags := make(Tags) 310 for k, v := range tags { 311 if strings.Trim(k, " ") == "" { 312 s.logger.Warnf("removing empty tag key with value %q for measurement %q", v, name) 313 continue 314 } 315 if _, ok := s.config.excludedTags[k]; ok { 316 continue 317 } 318 sanitizedKey := sanitizeTagKey(k) 319 if _, ok := s.config.excludedTags[sanitizedKey]; ok { 320 continue 321 } 322 if _, ok := s.resourceAttrs[sanitizedKey]; ok { 323 s.logger.Warnf("removing tag %q for measurement %q since it is a resource attribute", k, name) 324 continue 325 } 326 newTags[sanitizedKey] = v 327 } 328 329 om := &otelMeasurement{ 330 genericMeasurement: genericMeasurement{statType: statType}, 331 attributes: newTags.otelAttributes(), 332 } 333 334 switch statType { 335 case CountType: 336 instr := buildOTelInstrument(s.meter, s.noopMeter, name, s.counters, &s.countersMu, s.logger) 337 return &otelCounter{counter: instr, otelMeasurement: om} 338 case GaugeType: 339 return s.getGauge(name, om.attributes, newTags.String()) 340 case TimerType: 341 instr := buildOTelInstrument(s.meter, s.noopMeter, name, s.timers, &s.timersMu, s.logger) 342 return &otelTimer{timer: instr, otelMeasurement: om} 343 case HistogramType: 344 instr := buildOTelInstrument(s.meter, s.noopMeter, name, s.histograms, &s.histogramsMu, s.logger) 345 return &otelHistogram{histogram: instr, otelMeasurement: om} 346 default: 347 panic(fmt.Errorf("unsupported measurement type %s", statType)) 348 } 349 } 350 351 func (s *otelStats) getGauge( 352 name string, attributes []attribute.KeyValue, tagsKey string, 353 ) *otelGauge { 354 var ( 355 ok bool 356 og *otelGauge 357 mapKey = name + "|" + tagsKey 358 ) 359 360 s.gaugesMu.Lock() 361 defer s.gaugesMu.Unlock() 362 363 if s.gauges == nil { 364 s.gauges = make(map[string]*otelGauge) 365 } else { 366 og, ok = s.gauges[mapKey] 367 } 368 369 if !ok { 370 og = &otelGauge{otelMeasurement: &otelMeasurement{ 371 genericMeasurement: genericMeasurement{statType: GaugeType}, 372 attributes: attributes, 373 }} 374 375 g, err := s.meter.Float64ObservableGauge(name) 376 if err != nil { 377 s.logger.Warnf("failed to create gauge %s: %v", name, err) 378 g, _ = s.noopMeter.Float64ObservableGauge(name) 379 } else { 380 _, err = s.meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { 381 if value := og.getValue(); value != nil { 382 o.ObserveFloat64(g, cast.ToFloat64(value), metric.WithAttributes(attributes...)) 383 } 384 return nil 385 }, g) 386 if err != nil { 387 panic(fmt.Errorf("failed to register callback for gauge %s: %w", name, err)) 388 } 389 } 390 391 s.gauges[mapKey] = og 392 } 393 394 return og 395 } 396 397 func buildOTelInstrument[T any]( 398 meter, noopMeter metric.Meter, 399 name string, m map[string]T, mu *sync.Mutex, 400 l logger.Logger, 401 ) T { 402 var ( 403 ok bool 404 instr T 405 ) 406 407 mu.Lock() 408 defer mu.Unlock() 409 if m == nil { 410 m = make(map[string]T) 411 } else { 412 instr, ok = m[name] 413 } 414 415 if !ok { 416 var err error 417 var value interface{} 418 switch any(m).(type) { 419 case map[string]metric.Int64Counter: 420 if value, err = meter.Int64Counter(name); err != nil { 421 value, _ = noopMeter.Int64Counter(name) 422 } 423 case map[string]metric.Float64Histogram: 424 if value, err = meter.Float64Histogram(name); err != nil { 425 value, _ = noopMeter.Float64Histogram(name) 426 } 427 default: 428 panic(fmt.Errorf("unknown instrument type %T", instr)) 429 } 430 if err != nil { 431 l.Warnf("failed to create instrument %T(%s): %v", instr, name, err) 432 } 433 instr = value.(T) 434 m[name] = instr 435 } 436 437 return instr 438 } 439 440 type otelStatsConfig struct { 441 tracesEndpoint string 442 tracingSamplingRate float64 443 withTracingSyncer bool 444 withZipkin bool 445 metricsEndpoint string 446 metricsExportInterval time.Duration 447 enablePrometheusExporter bool 448 prometheusMetricsPort int 449 } 450 451 type prometheusLogger struct{ l logger.Logger } 452 453 func (p *prometheusLogger) Println(v ...interface{}) { p.l.Error(v...) }