github.com/rudderlabs/rudder-go-kit@v0.30.0/stats/internal/otel/otel.go (about)

     1  package otel
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"time"
     7  
     8  	promClient "github.com/prometheus/client_golang/prometheus"
     9  	"go.opentelemetry.io/otel"
    10  	"go.opentelemetry.io/otel/attribute"
    11  	"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
    12  	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
    13  	"go.opentelemetry.io/otel/exporters/zipkin"
    14  	"go.opentelemetry.io/otel/propagation"
    15  	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
    16  	"go.opentelemetry.io/otel/sdk/resource"
    17  	sdktrace "go.opentelemetry.io/otel/sdk/trace"
    18  	semconv "go.opentelemetry.io/otel/semconv/v1.24.0"
    19  	"golang.org/x/sync/errgroup"
    20  
    21  	"github.com/rudderlabs/rudder-go-kit/stats/internal/otel/prometheus"
    22  )
    23  
    24  // DefaultRetryConfig is the retry configuration that Manager.Setup falls back to
        // when the supplied options leave the retry configuration unset.
    25  var DefaultRetryConfig = RetryConfig{
    26  	Enabled:         true,
    27  	InitialInterval: 5 * time.Second,
    28  	MaxInterval:     30 * time.Second,
    29  	MaxElapsedTime:  time.Minute,
    30  }
    31  
        // Manager keeps hold of the tracer and meter providers created by Setup so
        // that Shutdown can later shut both of them down.
    32  type Manager struct {
        	// tp is assigned by Setup when the tracer provider is enabled; nil otherwise.
    33  	tp *sdktrace.TracerProvider
        	// mp is assigned by Setup when the meter provider is enabled; nil otherwise.
    34  	mp *sdkmetric.MeterProvider
    35  }
    36  
    37  // Setup simplifies the creation of tracer and meter providers with GRPC
    38  func (m *Manager) Setup(
    39  	ctx context.Context, res *resource.Resource, opts ...Option,
    40  ) (
    41  	*sdktrace.TracerProvider,
    42  	*sdkmetric.MeterProvider,
    43  	error,
    44  ) {
    45  	var c config
    46  	for _, opt := range opts {
    47  		opt(&c)
    48  	}
    49  	if c.retryConfig == nil {
    50  		c.retryConfig = &DefaultRetryConfig
    51  	}
    52  	if c.logger == nil {
    53  		c.logger = nopLogger{}
    54  	}
    55  
    56  	if !c.tracerProviderConfig.enabled && !c.meterProviderConfig.enabled {
    57  		return nil, nil, fmt.Errorf("no trace provider or meter provider to initialize")
    58  	}
    59  
    60  	if c.tracerProviderConfig.enabled {
    61  		if c.tracerProviderConfig.customSpanExporter != nil {
    62  			m.tp = sdktrace.NewTracerProvider(m.buildTracerProviderOptions(
    63  				&c, res, c.tracerProviderConfig.customSpanExporter)...,
    64  			)
    65  		} else if c.tracerProviderConfig.withZipkin {
    66  			traceExporter, err := zipkin.New(c.tracesEndpoint)
    67  			if err != nil {
    68  				return nil, nil, fmt.Errorf("failed to create zipkin trace exporter: %w", err)
    69  			}
    70  
    71  			m.tp = sdktrace.NewTracerProvider(m.buildTracerProviderOptions(&c, res, traceExporter)...)
    72  		} else {
    73  			tracerProviderOptions := []otlptracegrpc.Option{
    74  				otlptracegrpc.WithEndpoint(c.tracesEndpoint),
    75  				otlptracegrpc.WithRetry(otlptracegrpc.RetryConfig{
    76  					Enabled:         c.retryConfig.Enabled,
    77  					InitialInterval: c.retryConfig.InitialInterval,
    78  					MaxInterval:     c.retryConfig.MaxInterval,
    79  					MaxElapsedTime:  c.retryConfig.MaxElapsedTime,
    80  				}),
    81  			}
    82  			if c.withInsecure {
    83  				tracerProviderOptions = append(tracerProviderOptions, otlptracegrpc.WithInsecure())
    84  			}
    85  			traceExporter, err := otlptracegrpc.New(ctx, tracerProviderOptions...)
    86  			if err != nil {
    87  				return nil, nil, fmt.Errorf("failed to create trace exporter: %w", err)
    88  			}
    89  
    90  			m.tp = sdktrace.NewTracerProvider(m.buildTracerProviderOptions(&c, res, traceExporter)...)
    91  		}
    92  
    93  		if c.tracerProviderConfig.textMapPropagator != nil {
    94  			otel.SetTextMapPropagator(c.tracerProviderConfig.textMapPropagator)
    95  		}
    96  
    97  		if c.tracerProviderConfig.global {
    98  			otel.SetTracerProvider(m.tp)
    99  		}
   100  	}
   101  
   102  	if c.meterProviderConfig.enabled {
   103  		var err error
   104  		m.mp, err = m.buildMeterProvider(ctx, c, res)
   105  		if err != nil {
   106  			return nil, nil, err
   107  		}
   108  		if c.meterProviderConfig.global {
   109  			otel.SetMeterProvider(m.mp)
   110  		}
   111  	}
   112  
   113  	return m.tp, m.mp, nil
   114  }
   115  
   116  func (m *Manager) buildTracerProviderOptions(
   117  	c *config,
   118  	res *resource.Resource, exp sdktrace.SpanExporter,
   119  ) []sdktrace.TracerProviderOption {
   120  	opts := []sdktrace.TracerProviderOption{
   121  		sdktrace.WithResource(res),
   122  		sdktrace.WithSampler(sdktrace.TraceIDRatioBased(c.tracerProviderConfig.samplingRate)),
   123  	}
   124  
   125  	if c.tracerProviderConfig.withSyncer {
   126  		opts = append(opts, sdktrace.WithSyncer(exp))
   127  	} else {
   128  		opts = append(opts, sdktrace.WithSpanProcessor(sdktrace.NewBatchSpanProcessor(exp)))
   129  	}
   130  
   131  	return opts
   132  }
   133  
   134  func (m *Manager) buildMeterProvider(
   135  	ctx context.Context, c config, res *resource.Resource,
   136  ) (*sdkmetric.MeterProvider, error) {
   137  	if c.meterProviderConfig.grpcEndpoint == nil && c.meterProviderConfig.prometheusRegisterer == nil {
   138  		return nil, fmt.Errorf("no grpc endpoint or prometheus registerer to initialize meter provider")
   139  	}
   140  	if c.meterProviderConfig.grpcEndpoint != nil && c.meterProviderConfig.prometheusRegisterer != nil {
   141  		return nil, fmt.Errorf("cannot initialize meter provider with both grpc endpoint and prometheus registerer")
   142  	}
   143  	if c.meterProviderConfig.prometheusRegisterer != nil {
   144  		return m.buildPrometheusMeterProvider(c, res)
   145  	}
   146  	return m.buildOTLPMeterProvider(ctx, c, res)
   147  }
   148  
   149  func (m *Manager) buildPrometheusMeterProvider(c config, res *resource.Resource) (*sdkmetric.MeterProvider, error) {
   150  	exporterOptions := []prometheus.Option{
   151  		prometheus.WithRegisterer(c.meterProviderConfig.prometheusRegisterer),
   152  		prometheus.WithLogger(c.logger),
   153  	}
   154  	exp, err := prometheus.New(exporterOptions...)
   155  	if err != nil {
   156  		return nil, fmt.Errorf("prometheus: failed to create metric exporter: %w", err)
   157  	}
   158  
   159  	return sdkmetric.NewMeterProvider(m.getMeterProviderOptions(c, res, exp)...), nil
   160  }
   161  
   162  func (m *Manager) buildOTLPMeterProvider(
   163  	ctx context.Context, c config, res *resource.Resource,
   164  ) (*sdkmetric.MeterProvider, error) {
   165  	opts := []otlpmetricgrpc.Option{
   166  		otlpmetricgrpc.WithEndpoint(*c.meterProviderConfig.grpcEndpoint),
   167  		otlpmetricgrpc.WithRetry(otlpmetricgrpc.RetryConfig{
   168  			Enabled:         c.retryConfig.Enabled,
   169  			InitialInterval: c.retryConfig.InitialInterval,
   170  			MaxInterval:     c.retryConfig.MaxInterval,
   171  			MaxElapsedTime:  c.retryConfig.MaxElapsedTime,
   172  		}),
   173  	}
   174  	if c.withInsecure {
   175  		opts = append(opts, otlpmetricgrpc.WithInsecure())
   176  	}
   177  	if len(c.meterProviderConfig.otlpMetricGRPCOptions) > 0 {
   178  		opts = append(opts, c.meterProviderConfig.otlpMetricGRPCOptions...)
   179  	}
   180  	exp, err := otlpmetricgrpc.New(ctx, opts...)
   181  	if err != nil {
   182  		return nil, fmt.Errorf("otlp: failed to create metric exporter: %w", err)
   183  	}
   184  
   185  	reader := sdkmetric.NewPeriodicReader(
   186  		exp,
   187  		sdkmetric.WithInterval(c.meterProviderConfig.exportsInterval),
   188  	)
   189  
   190  	return sdkmetric.NewMeterProvider(m.getMeterProviderOptions(c, res, reader)...), nil
   191  }
   192  
   193  func (m *Manager) getMeterProviderOptions(c config, res *resource.Resource, r sdkmetric.Reader) []sdkmetric.Option {
   194  	opts := []sdkmetric.Option{
   195  		sdkmetric.WithResource(res),
   196  		sdkmetric.WithReader(r),
   197  	}
   198  	var views []sdkmetric.View
   199  	if len(c.meterProviderConfig.views) > 0 {
   200  		views = append(views, c.meterProviderConfig.views...)
   201  	}
   202  	if c.meterProviderConfig.defaultHistogramBuckets != nil {
   203  		views = append(views, c.meterProviderConfig.defaultHistogramBuckets)
   204  	}
   205  	if len(views) > 0 {
   206  		opts = append(opts, sdkmetric.WithView(views...))
   207  	}
   208  	return opts
   209  }
   210  
   211  // Shutdown allows you to gracefully clean up after the OTel manager (e.g. close underlying gRPC connection)
   212  func (m *Manager) Shutdown(ctx context.Context) error {
   213  	var g errgroup.Group
   214  	if m.tp != nil {
   215  		g.Go(func() error {
   216  			return m.tp.Shutdown(ctx)
   217  		})
   218  	}
   219  	if m.mp != nil {
   220  		g.Go(func() error {
   221  			return m.mp.Shutdown(ctx)
   222  		})
   223  	}
   224  
   225  	done := make(chan error)
   226  	go func() {
   227  		done <- g.Wait()
   228  		close(done)
   229  	}()
   230  
   231  	select {
   232  	case <-ctx.Done():
   233  		return ctx.Err()
   234  	case err := <-done:
   235  		return err
   236  	}
   237  }
   238  
   239  // NewResource allows the creation of an OpenTelemetry resource
   240  // https://opentelemetry.io/docs/concepts/glossary/#resource
   241  func NewResource(svcName, svcVersion string, attrs ...attribute.KeyValue) (*resource.Resource, error) {
   242  	defaultAttrs := []attribute.KeyValue{
   243  		semconv.ServiceNameKey.String(svcName),
   244  		semconv.ServiceVersionKey.String(svcVersion),
   245  	}
   246  	return resource.Merge(
   247  		resource.Default(),
   248  		resource.NewWithAttributes(semconv.SchemaURL, append(defaultAttrs, attrs...)...),
   249  	)
   250  }
   251  
   252  // RetryConfig defines configuration for retrying batches in case of export failure
   253  // using an exponential backoff. Its values are copied into the retry
        // configuration of the OTLP gRPC trace and metric exporters.
   254  type RetryConfig struct {
   255  	// Enabled indicates whether to retry sending batches in case of
   256  	// export failure.
   257  	Enabled bool
   258  	// InitialInterval is the time to wait after the first failure before
   259  	// retrying.
   260  	InitialInterval time.Duration
   261  	// MaxInterval is the upper bound on backoff interval. Once this value is
   262  	// reached the delay between consecutive retries will always be
   263  	// `MaxInterval`.
   264  	MaxInterval time.Duration
   265  	// MaxElapsedTime is the maximum amount of time (including retries) spent
   266  	// trying to send a request/batch.  Once this value is reached, the data
   267  	// is discarded.
   268  	MaxElapsedTime time.Duration
   269  }
   270  
        // config collects the settings applied by the Option values passed to
        // Manager.Setup.
   271  type config struct {
        	// retryConfig holds the retry settings handed to the OTLP gRPC exporters.
        	// Setup replaces a nil value with DefaultRetryConfig.
   272  	retryConfig  *RetryConfig
        	// withInsecure adds the WithInsecure option to the OTLP gRPC exporters.
   273  	withInsecure bool
   274  
        	// tracesEndpoint is the endpoint given to the Zipkin or OTLP gRPC trace
        	// exporter.
   275  	tracesEndpoint       string
   276  	tracerProviderConfig tracerProviderConfig
   277  	meterProviderConfig  meterProviderConfig
   278  
        	// logger is passed to the Prometheus exporter. Setup replaces a nil value
        	// with nopLogger.
   279  	logger logger
   280  }
   281  
        // tracerProviderConfig holds the tracer provider settings read by Setup and
        // buildTracerProviderOptions.
   282  type tracerProviderConfig struct {
        	// enabled makes Setup create a tracer provider.
   283  	enabled            bool
        	// global makes Setup register the provider via otel.SetTracerProvider.
   284  	global             bool
        	// samplingRate is the fraction passed to sdktrace.TraceIDRatioBased.
   285  	samplingRate       float64
        	// textMapPropagator, when non-nil, is installed via otel.SetTextMapPropagator.
   286  	textMapPropagator  propagation.TextMapPropagator
        	// customSpanExporter, when non-nil, takes precedence over the Zipkin and
        	// OTLP gRPC exporters.
   287  	customSpanExporter SpanExporter
        	// withSyncer selects sdktrace.WithSyncer instead of a batch span processor.
   288  	withSyncer         bool
        	// withZipkin selects the Zipkin exporter instead of the OTLP gRPC one.
   289  	withZipkin         bool
   290  }
   291  
        // meterProviderConfig holds the meter provider settings read by Setup and the
        // meter provider builders.
   292  type meterProviderConfig struct {
        	// enabled makes Setup create a meter provider.
   293  	enabled         bool
        	// global makes Setup register the provider via otel.SetMeterProvider.
   294  	global          bool
        	// exportsInterval is the interval of the periodic reader used with the
        	// OTLP gRPC exporter.
   295  	exportsInterval time.Duration
   296  	views           []sdkmetric.View
   297  	// defaultHistogramBuckets is not part of the above "views" because the order
   298  	// in which we add views matters. We have to add the default view last because the
   299  	// views criteria are applied in order and the default one is the most generic.
   300  	// Thus, if we put it first it will be applied to all histogram instruments removing
   301  	// the ability to customize the buckets of specific histograms.
   302  	defaultHistogramBuckets sdkmetric.View
        	// grpcEndpoint and prometheusRegisterer are mutually exclusive: exactly one
        	// of them must be set for buildMeterProvider to succeed.
   303  	grpcEndpoint            *string
   304  	prometheusRegisterer    promClient.Registerer
        	// otlpMetricGRPCOptions are appended after the built-in OTLP gRPC exporter
        	// options.
   305  	otlpMetricGRPCOptions   []otlpmetricgrpc.Option
   306  }
   307  
        // logger is the minimal logging interface required by this package; the
        // configured value is handed to the Prometheus exporter.
   308  type logger interface {
   309  	Info(...interface{})
   310  	Error(...interface{})
   311  }
   312  
        // nopLogger is a logger that discards everything. Setup uses it when no
        // logger has been configured.
   313  type nopLogger struct{}
   314  
        // Info does nothing.
   315  func (nopLogger) Info(...interface{})  {}
        // Error does nothing.
   316  func (nopLogger) Error(...interface{}) {}