github.com/rudderlabs/rudder-go-kit@v0.30.0/stats/otel.go (about)

     1  package stats
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"net/http"
     8  	"runtime"
     9  	"strings"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/prometheus/client_golang/prometheus"
    14  	"github.com/prometheus/client_golang/prometheus/promhttp"
    15  	"github.com/spf13/cast"
    16  	"go.opentelemetry.io/otel/attribute"
    17  	"go.opentelemetry.io/otel/metric"
    18  	noopMetric "go.opentelemetry.io/otel/metric/noop"
    19  	"go.opentelemetry.io/otel/propagation"
    20  	semconv "go.opentelemetry.io/otel/semconv/v1.24.0"
    21  	"go.opentelemetry.io/otel/trace"
    22  
    23  	"github.com/rudderlabs/rudder-go-kit/logger"
    24  	"github.com/rudderlabs/rudder-go-kit/stats/internal/otel"
    25  )
    26  
    27  const (
    28  	defaultMeterName = ""
    29  )
    30  
// otelStats is an OTel-specific adapter that follows the Stats contract.
//
// Instruments and tracers are cached in the maps below; each map has a
// dedicated mutex that must be held while the map is read or written.
type otelStats struct {
	// config holds the backend-agnostic settings (enabled flag, service and
	// instance identity, excluded tags, histogram buckets, periodic stats).
	config statsConfig
	// otelConfig holds the OpenTelemetry/Prometheus specific settings.
	otelConfig otelStatsConfig
	// resourceAttrs is the set of sanitized attribute keys attached to the
	// OTel resource by Start; getMeasurement drops tags using the same keys.
	resourceAttrs map[string]struct{}

	// tracerProvider is replaced in Start when the OTel setup returns one.
	tracerProvider trace.TracerProvider
	// traceBaseAttributes are added to every tracer created by NewTracer.
	traceBaseAttributes []attribute.KeyValue
	// tracerMap caches tracers by name; guarded by tracerMapMu.
	tracerMap   map[string]Tracer
	tracerMapMu sync.Mutex

	// meter creates the real instruments; Start sets it to the provider's
	// meter, or to noopMeter when no meter provider was configured.
	meter metric.Meter
	// noopMeter supplies fallback instruments when creation via meter fails.
	noopMeter metric.Meter
	// counters caches counter instruments by measurement name.
	counters     map[string]metric.Int64Counter
	countersMu   sync.Mutex
	// gauges caches gauges by measurement name + "|" + tags string.
	gauges       map[string]*otelGauge
	gaugesMu     sync.Mutex
	// timers caches timer instruments (histograms) by measurement name.
	timers       map[string]metric.Float64Histogram
	timersMu     sync.Mutex
	// histograms caches histogram instruments by measurement name.
	histograms   map[string]metric.Float64Histogram
	histogramsMu sync.Mutex

	// otelManager sets up and shuts down the OTel providers.
	otelManager otel.Manager
	// runtimeStatsCollector is only started when periodic stats are enabled.
	runtimeStatsCollector runtimeStatsCollector
	metricsStatsCollector metricStatsCollector
	// stopBackgroundCollection cancels the context the collectors run on;
	// it is assigned by Start.
	stopBackgroundCollection func()
	logger                   logger.Logger

	// httpServer serves the Prometheus metrics endpoint; it is only created
	// when the Prometheus exporter is enabled with a positive port.
	httpServer *http.Server
	// httpServerShutdownComplete is closed once the serving goroutine exits.
	httpServerShutdownComplete chan struct{}
	prometheusRegisterer       prometheus.Registerer
	prometheusGatherer         prometheus.Gatherer
}
    64  
    65  func (s *otelStats) Start(ctx context.Context, goFactory GoRoutineFactory) error {
    66  	if !s.config.enabled.Load() {
    67  		return nil
    68  	}
    69  
    70  	// Starting OpenTelemetry setup
    71  	var attrs []attribute.KeyValue
    72  	s.resourceAttrs = make(map[string]struct{})
    73  	if s.config.instanceName != "" {
    74  		sanitized := sanitizeTagKey("instanceName")
    75  		attrs = append(attrs, attribute.String(sanitized, s.config.instanceName))
    76  		s.resourceAttrs[sanitized] = struct{}{}
    77  	}
    78  	if s.config.namespaceIdentifier != "" {
    79  		sanitized := sanitizeTagKey("namespace")
    80  		attrs = append(attrs, attribute.String(sanitized, s.config.namespaceIdentifier))
    81  		s.resourceAttrs[sanitized] = struct{}{}
    82  	}
    83  	res, err := otel.NewResource(s.config.serviceName, s.config.serviceVersion, attrs...)
    84  	if err != nil {
    85  		return fmt.Errorf("failed to create open telemetry resource: %w", err)
    86  	}
    87  
    88  	options := []otel.Option{otel.WithInsecure(), otel.WithLogger(s.logger)}
    89  	if s.otelConfig.tracesEndpoint != "" {
    90  		s.traceBaseAttributes = attrs
    91  		tpOpts := []otel.TracerProviderOption{
    92  			otel.WithTracingSamplingRate(s.otelConfig.tracingSamplingRate),
    93  		}
    94  		if s.otelConfig.withTracingSyncer {
    95  			tpOpts = append(tpOpts, otel.WithTracingSyncer())
    96  		}
    97  		if s.otelConfig.withZipkin {
    98  			tpOpts = append(tpOpts, otel.WithZipkin())
    99  		}
   100  		options = append(options,
   101  			otel.WithTracerProvider(s.otelConfig.tracesEndpoint, tpOpts...),
   102  			otel.WithTextMapPropagator(
   103  				propagation.NewCompositeTextMapPropagator(propagation.TraceContext{}, propagation.Baggage{}),
   104  			),
   105  		)
   106  	}
   107  
   108  	meterProviderOptions := []otel.MeterProviderOption{
   109  		otel.WithMeterProviderExportsInterval(s.otelConfig.metricsExportInterval),
   110  	}
   111  	if len(s.config.defaultHistogramBuckets) > 0 {
   112  		meterProviderOptions = append(meterProviderOptions,
   113  			otel.WithDefaultHistogramBucketBoundaries(s.config.defaultHistogramBuckets),
   114  		)
   115  	}
   116  	if len(s.config.histogramBuckets) > 0 {
   117  		for histogramName, buckets := range s.config.histogramBuckets {
   118  			meterProviderOptions = append(meterProviderOptions,
   119  				otel.WithHistogramBucketBoundaries(histogramName, defaultMeterName, buckets),
   120  			)
   121  		}
   122  	}
   123  	if s.otelConfig.metricsEndpoint != "" {
   124  		options = append(options, otel.WithMeterProvider(append(meterProviderOptions,
   125  			otel.WithGRPCMeterProvider(s.otelConfig.metricsEndpoint),
   126  		)...))
   127  	} else if s.otelConfig.enablePrometheusExporter {
   128  		options = append(options, otel.WithMeterProvider(append(meterProviderOptions,
   129  			otel.WithPrometheusExporter(s.prometheusRegisterer),
   130  		)...))
   131  	}
   132  
   133  	tp, mp, err := s.otelManager.Setup(ctx, res, options...)
   134  	if err != nil {
   135  		return fmt.Errorf("failed to setup open telemetry: %w", err)
   136  	}
   137  
   138  	if tp != nil {
   139  		s.tracerProvider = tp
   140  	}
   141  
   142  	s.noopMeter = noopMetric.NewMeterProvider().Meter(defaultMeterName)
   143  	if mp != nil {
   144  		s.meter = mp.Meter(defaultMeterName)
   145  	} else {
   146  		s.meter = s.noopMeter
   147  	}
   148  
   149  	if s.otelConfig.enablePrometheusExporter && s.otelConfig.prometheusMetricsPort > 0 {
   150  		s.httpServerShutdownComplete = make(chan struct{})
   151  		s.httpServer = &http.Server{
   152  			Addr: fmt.Sprintf(":%d", s.otelConfig.prometheusMetricsPort),
   153  			Handler: promhttp.InstrumentMetricHandler(
   154  				s.prometheusRegisterer, promhttp.HandlerFor(s.prometheusGatherer, promhttp.HandlerOpts{
   155  					ErrorLog: &prometheusLogger{l: s.logger},
   156  				}),
   157  			),
   158  		}
   159  		goFactory.Go(func() {
   160  			defer close(s.httpServerShutdownComplete)
   161  			if err := s.httpServer.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
   162  				s.logger.Fatalf("Prometheus exporter failed: %v", err)
   163  			}
   164  		})
   165  	}
   166  
   167  	// Starting background collection
   168  	var backgroundCollectionCtx context.Context
   169  	backgroundCollectionCtx, s.stopBackgroundCollection = context.WithCancel(context.Background())
   170  
   171  	gaugeFunc := func(key string, val uint64) {
   172  		s.getMeasurement("runtime_"+key, GaugeType, nil).Gauge(val)
   173  	}
   174  	s.metricsStatsCollector = newMetricStatsCollector(s, s.config.periodicStatsConfig.metricManager)
   175  	goFactory.Go(func() {
   176  		s.metricsStatsCollector.run(backgroundCollectionCtx)
   177  	})
   178  
   179  	if s.config.periodicStatsConfig.enabled {
   180  		s.runtimeStatsCollector = newRuntimeStatsCollector(gaugeFunc)
   181  		s.runtimeStatsCollector.PauseDur = time.Duration(s.config.periodicStatsConfig.statsCollectionInterval) * time.Second
   182  		s.runtimeStatsCollector.EnableCPU = s.config.periodicStatsConfig.enableCPUStats
   183  		s.runtimeStatsCollector.EnableMem = s.config.periodicStatsConfig.enableMemStats
   184  		s.runtimeStatsCollector.EnableGC = s.config.periodicStatsConfig.enableGCStats
   185  		goFactory.Go(func() {
   186  			s.runtimeStatsCollector.run(backgroundCollectionCtx)
   187  		})
   188  	}
   189  
   190  	if s.otelConfig.enablePrometheusExporter {
   191  		s.logger.Infof("Stats started in Prometheus mode on :%d", s.otelConfig.prometheusMetricsPort)
   192  	} else {
   193  		s.logger.Infof("Stats started in OpenTelemetry mode with metrics endpoint %q and traces endpoint %q",
   194  			s.otelConfig.metricsEndpoint, s.otelConfig.tracesEndpoint,
   195  		)
   196  	}
   197  
   198  	return nil
   199  }
   200  
   201  func (s *otelStats) Stop() {
   202  	if !s.config.enabled.Load() {
   203  		return
   204  	}
   205  
   206  	ctx, cancel := context.WithTimeout(context.TODO(), 10*time.Second)
   207  	defer cancel()
   208  
   209  	if err := s.otelManager.Shutdown(ctx); err != nil {
   210  		s.logger.Errorf("failed to shutdown open telemetry: %v", err)
   211  	}
   212  
   213  	s.stopBackgroundCollection()
   214  	if s.metricsStatsCollector.done != nil {
   215  		<-s.metricsStatsCollector.done
   216  	}
   217  	if s.config.periodicStatsConfig.enabled && s.runtimeStatsCollector.done != nil {
   218  		<-s.runtimeStatsCollector.done
   219  	}
   220  
   221  	if s.httpServer != nil && s.httpServerShutdownComplete != nil {
   222  		if err := s.httpServer.Shutdown(ctx); err != nil {
   223  			s.logger.Errorf("failed to shutdown prometheus exporter: %v", err)
   224  		}
   225  		<-s.httpServerShutdownComplete
   226  	}
   227  }
   228  
   229  // NewTracer allows you to create a tracer for creating spans
   230  func (s *otelStats) NewTracer(name string) Tracer {
   231  	s.tracerMapMu.Lock()
   232  	defer s.tracerMapMu.Unlock()
   233  
   234  	if s.tracerMap == nil {
   235  		s.tracerMap = make(map[string]Tracer)
   236  	} else if t, ok := s.tracerMap[name]; ok {
   237  		return t
   238  	}
   239  
   240  	var attrs []attribute.KeyValue
   241  	if len(s.traceBaseAttributes) > 0 {
   242  		attrs = append(attrs, s.traceBaseAttributes...)
   243  	}
   244  	if s.config.serviceName != "" {
   245  		attrs = append(attrs, semconv.ServiceNameKey.String(s.config.serviceName))
   246  	}
   247  
   248  	opts := []trace.TracerOption{
   249  		trace.WithInstrumentationVersion(s.config.serviceVersion),
   250  	}
   251  	if len(attrs) > 0 {
   252  		opts = append(opts, trace.WithInstrumentationAttributes(attrs...))
   253  	}
   254  
   255  	s.tracerMap[name] = &tracer{
   256  		tracer: s.tracerProvider.Tracer(name, opts...),
   257  	}
   258  	return s.tracerMap[name]
   259  }
   260  
// NewStat creates a new Measurement with the provided name and type and no
// tags. It delegates to getMeasurement, which panics on an unsupported
// statType.
func (s *otelStats) NewStat(name, statType string) (m Measurement) {
	return s.getMeasurement(name, statType, nil)
}
   265  
// NewTaggedStat creates a new Measurement with the provided name, type and
// tags. It delegates to getMeasurement, which filters the tags (empty keys,
// excluded tags, resource attributes) and panics on an unsupported statType.
func (s *otelStats) NewTaggedStat(name, statType string, tags Tags) (m Measurement) {
	return s.getMeasurement(name, statType, tags)
}
   270  
// NewSampledTaggedStat creates a new Measurement with the provided name, type
// and tags. It is a plain alias of NewTaggedStat.
//
// Deprecated: use NewTaggedStat instead.
func (s *otelStats) NewSampledTaggedStat(name, statType string, tags Tags) (m Measurement) {
	return s.NewTaggedStat(name, statType, tags)
}
   276  
   277  func (*otelStats) getNoOpMeasurement(statType string) Measurement {
   278  	om := &otelMeasurement{
   279  		genericMeasurement: genericMeasurement{statType: statType},
   280  		disabled:           true,
   281  	}
   282  	switch statType {
   283  	case CountType:
   284  		return &otelCounter{otelMeasurement: om}
   285  	case GaugeType:
   286  		return &otelGauge{otelMeasurement: om}
   287  	case TimerType:
   288  		return &otelTimer{otelMeasurement: om}
   289  	case HistogramType:
   290  		return &otelHistogram{otelMeasurement: om}
   291  	}
   292  	panic(fmt.Errorf("unsupported measurement type %s", statType))
   293  }
   294  
   295  func (s *otelStats) getMeasurement(name, statType string, tags Tags) Measurement {
   296  	if !s.config.enabled.Load() {
   297  		return s.getNoOpMeasurement(statType)
   298  	}
   299  
   300  	if strings.Trim(name, " ") == "" {
   301  		byteArr := make([]byte, 2048)
   302  		n := runtime.Stack(byteArr, false)
   303  		stackTrace := string(byteArr[:n])
   304  		s.logger.Warnf("detected missing stat measurement name, using 'novalue':\n%v", stackTrace)
   305  		name = "novalue"
   306  	}
   307  
   308  	// Clean up tags based on deployment type. No need to send workspace id tag for free tier customers.
   309  	newTags := make(Tags)
   310  	for k, v := range tags {
   311  		if strings.Trim(k, " ") == "" {
   312  			s.logger.Warnf("removing empty tag key with value %q for measurement %q", v, name)
   313  			continue
   314  		}
   315  		if _, ok := s.config.excludedTags[k]; ok {
   316  			continue
   317  		}
   318  		sanitizedKey := sanitizeTagKey(k)
   319  		if _, ok := s.config.excludedTags[sanitizedKey]; ok {
   320  			continue
   321  		}
   322  		if _, ok := s.resourceAttrs[sanitizedKey]; ok {
   323  			s.logger.Warnf("removing tag %q for measurement %q since it is a resource attribute", k, name)
   324  			continue
   325  		}
   326  		newTags[sanitizedKey] = v
   327  	}
   328  
   329  	om := &otelMeasurement{
   330  		genericMeasurement: genericMeasurement{statType: statType},
   331  		attributes:         newTags.otelAttributes(),
   332  	}
   333  
   334  	switch statType {
   335  	case CountType:
   336  		instr := buildOTelInstrument(s.meter, s.noopMeter, name, s.counters, &s.countersMu, s.logger)
   337  		return &otelCounter{counter: instr, otelMeasurement: om}
   338  	case GaugeType:
   339  		return s.getGauge(name, om.attributes, newTags.String())
   340  	case TimerType:
   341  		instr := buildOTelInstrument(s.meter, s.noopMeter, name, s.timers, &s.timersMu, s.logger)
   342  		return &otelTimer{timer: instr, otelMeasurement: om}
   343  	case HistogramType:
   344  		instr := buildOTelInstrument(s.meter, s.noopMeter, name, s.histograms, &s.histogramsMu, s.logger)
   345  		return &otelHistogram{histogram: instr, otelMeasurement: om}
   346  	default:
   347  		panic(fmt.Errorf("unsupported measurement type %s", statType))
   348  	}
   349  }
   350  
   351  func (s *otelStats) getGauge(
   352  	name string, attributes []attribute.KeyValue, tagsKey string,
   353  ) *otelGauge {
   354  	var (
   355  		ok     bool
   356  		og     *otelGauge
   357  		mapKey = name + "|" + tagsKey
   358  	)
   359  
   360  	s.gaugesMu.Lock()
   361  	defer s.gaugesMu.Unlock()
   362  
   363  	if s.gauges == nil {
   364  		s.gauges = make(map[string]*otelGauge)
   365  	} else {
   366  		og, ok = s.gauges[mapKey]
   367  	}
   368  
   369  	if !ok {
   370  		og = &otelGauge{otelMeasurement: &otelMeasurement{
   371  			genericMeasurement: genericMeasurement{statType: GaugeType},
   372  			attributes:         attributes,
   373  		}}
   374  
   375  		g, err := s.meter.Float64ObservableGauge(name)
   376  		if err != nil {
   377  			s.logger.Warnf("failed to create gauge %s: %v", name, err)
   378  			g, _ = s.noopMeter.Float64ObservableGauge(name)
   379  		} else {
   380  			_, err = s.meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error {
   381  				if value := og.getValue(); value != nil {
   382  					o.ObserveFloat64(g, cast.ToFloat64(value), metric.WithAttributes(attributes...))
   383  				}
   384  				return nil
   385  			}, g)
   386  			if err != nil {
   387  				panic(fmt.Errorf("failed to register callback for gauge %s: %w", name, err))
   388  			}
   389  		}
   390  
   391  		s.gauges[mapKey] = og
   392  	}
   393  
   394  	return og
   395  }
   396  
   397  func buildOTelInstrument[T any](
   398  	meter, noopMeter metric.Meter,
   399  	name string, m map[string]T, mu *sync.Mutex,
   400  	l logger.Logger,
   401  ) T {
   402  	var (
   403  		ok    bool
   404  		instr T
   405  	)
   406  
   407  	mu.Lock()
   408  	defer mu.Unlock()
   409  	if m == nil {
   410  		m = make(map[string]T)
   411  	} else {
   412  		instr, ok = m[name]
   413  	}
   414  
   415  	if !ok {
   416  		var err error
   417  		var value interface{}
   418  		switch any(m).(type) {
   419  		case map[string]metric.Int64Counter:
   420  			if value, err = meter.Int64Counter(name); err != nil {
   421  				value, _ = noopMeter.Int64Counter(name)
   422  			}
   423  		case map[string]metric.Float64Histogram:
   424  			if value, err = meter.Float64Histogram(name); err != nil {
   425  				value, _ = noopMeter.Float64Histogram(name)
   426  			}
   427  		default:
   428  			panic(fmt.Errorf("unknown instrument type %T", instr))
   429  		}
   430  		if err != nil {
   431  			l.Warnf("failed to create instrument %T(%s): %v", instr, name, err)
   432  		}
   433  		instr = value.(T)
   434  		m[name] = instr
   435  	}
   436  
   437  	return instr
   438  }
   439  
// otelStatsConfig groups the OpenTelemetry and Prometheus specific settings
// read by otelStats.Start.
type otelStatsConfig struct {
	// tracesEndpoint enables tracing when non-empty.
	tracesEndpoint string
	// tracingSamplingRate is forwarded to the tracer provider options.
	tracingSamplingRate float64
	// withTracingSyncer adds the tracing syncer option to the tracer provider.
	withTracingSyncer bool
	// withZipkin adds the Zipkin option to the tracer provider.
	withZipkin bool
	// metricsEndpoint selects the gRPC meter provider when non-empty; it takes
	// precedence over enablePrometheusExporter.
	metricsEndpoint string
	// metricsExportInterval is forwarded to the meter provider options.
	metricsExportInterval time.Duration
	// enablePrometheusExporter selects the Prometheus exporter when no
	// metricsEndpoint is set.
	enablePrometheusExporter bool
	// prometheusMetricsPort is the HTTP port for the Prometheus endpoint; the
	// server is only started for values greater than zero.
	prometheusMetricsPort int
}
   450  
   451  type prometheusLogger struct{ l logger.Logger }
   452  
   453  func (p *prometheusLogger) Println(v ...interface{}) { p.l.Error(v...) }