github.com/letsencrypt/boulder@v0.20251208.0/test/integration/otel_test.go (about)

     1  //go:build integration
     2  
     3  package integration
     4  
     5  import (
     6  	"context"
     7  	"crypto/ecdsa"
     8  	"crypto/elliptic"
     9  	"crypto/rand"
    10  	"encoding/json"
    11  	"fmt"
    12  	"io"
    13  	"net/http"
    14  	"os"
    15  	"strings"
    16  	"testing"
    17  	"time"
    18  
    19  	"github.com/eggsampler/acme/v3"
    20  	"go.opentelemetry.io/otel"
    21  	"go.opentelemetry.io/otel/propagation"
    22  	sdktrace "go.opentelemetry.io/otel/sdk/trace"
    23  	"go.opentelemetry.io/otel/trace"
    24  
    25  	"github.com/letsencrypt/boulder/cmd"
    26  	blog "github.com/letsencrypt/boulder/log"
    27  	"github.com/letsencrypt/boulder/test"
    28  )
    29  
    30  // TraceResponse is the list of traces returned from Jaeger's trace search API
    31  // We always search for a single trace by ID, so this should be length 1.
    32  // This is a specialization of Jaeger's structuredResponse type which
    33  // uses []interface{} upstream.
    34  type TraceResponse struct {
    35  	Data []Trace
    36  }
    37  
    38  // Trace represents a single trace in Jaeger's API
    39  // See https://pkg.go.dev/github.com/jaegertracing/jaeger/model/json#Trace
    40  type Trace struct {
    41  	TraceID   string
    42  	Spans     []Span
    43  	Processes map[string]struct {
    44  		ServiceName string
    45  	}
    46  	Warnings []string
    47  }
    48  
    49  // Span represents a single span in Jaeger's API
    50  // See https://pkg.go.dev/github.com/jaegertracing/jaeger/model/json#Span
    51  type Span struct {
    52  	SpanID        string
    53  	OperationName string
    54  	Warnings      []string
    55  	ProcessID     string
    56  	References    []struct {
    57  		RefType string
    58  		TraceID string
    59  		SpanID  string
    60  	}
    61  }
    62  
    63  func getTraceFromJaeger(t *testing.T, traceID trace.TraceID) Trace {
    64  	t.Helper()
    65  	traceURL := "http://bjaeger:16686/api/traces/" + traceID.String()
    66  	resp, err := http.Get(traceURL)
    67  	test.AssertNotError(t, err, "failed to trace from jaeger: "+traceID.String())
    68  	if resp.StatusCode == http.StatusNotFound {
    69  		t.Fatalf("jaeger returned 404 for trace %s", traceID)
    70  	}
    71  	test.AssertEquals(t, resp.StatusCode, http.StatusOK)
    72  
    73  	body, err := io.ReadAll(resp.Body)
    74  	test.AssertNotError(t, err, "failed to read trace body")
    75  
    76  	var parsed TraceResponse
    77  	err = json.Unmarshal(body, &parsed)
    78  	test.AssertNotError(t, err, "failed to decode traces body")
    79  
    80  	if len(parsed.Data) != 1 {
    81  		t.Fatalf("expected to get exactly one trace from jaeger for %s: %v", traceID, parsed)
    82  	}
    83  
    84  	return parsed.Data[0]
    85  }
    86  
    87  type expectedSpans struct {
    88  	Operation string
    89  	Service   string
    90  	Children  []expectedSpans
    91  }
    92  
    93  // isParent returns true if the given span has a parent of ParentID
    94  // The empty string means no ParentID
    95  func isParent(parentID string, span Span) bool {
    96  	if len(span.References) == 0 {
    97  		return parentID == ""
    98  	}
    99  	for _, ref := range span.References {
   100  		// In OpenTelemetry, CHILD_OF is the only reference, but Jaeger supports other systems.
   101  		if ref.RefType == "CHILD_OF" {
   102  			return ref.SpanID == parentID
   103  		}
   104  	}
   105  	return false
   106  }
   107  
   108  func missingChildren(trace Trace, spanID string, children []expectedSpans) bool {
   109  	for _, child := range children {
   110  		if !findSpans(trace, spanID, child) {
   111  			// Missing Child
   112  			return true
   113  		}
   114  	}
   115  	return false
   116  }
   117  
   118  // findSpans checks if the expectedSpan and its expected children are found in trace
   119  func findSpans(trace Trace, parentSpan string, expectedSpan expectedSpans) bool {
   120  	for _, span := range trace.Spans {
   121  		if !isParent(parentSpan, span) {
   122  			continue
   123  		}
   124  		if trace.Processes[span.ProcessID].ServiceName != expectedSpan.Service {
   125  			continue
   126  		}
   127  		if span.OperationName != expectedSpan.Operation {
   128  			continue
   129  		}
   130  		if missingChildren(trace, span.SpanID, expectedSpan.Children) {
   131  			continue
   132  		}
   133  
   134  		// This span has the correct parent, service, operation, and children
   135  		return true
   136  	}
   137  	fmt.Printf("did not find span %s::%s with parent '%s'\n", expectedSpan.Service, expectedSpan.Operation, parentSpan)
   138  	return false
   139  }
   140  
   141  // ContextInjectingRoundTripper holds a context that is added to every request
   142  // sent through this RoundTripper, propagating the OpenTelemetry trace through
   143  // the requests made with it.
   144  //
   145  // This is useful for tracing HTTP clients which don't pass through a context,
   146  // notably including the eggsampler ACME client used in this test.
   147  //
   148  // This test uses a trace started in the test to connect all the outgoing
   149  // requests into a trace that is retrieved from Jaeger's API to make assertions
   150  // about the spans from Boulder.
   151  type ContextInjectingRoundTripper struct {
   152  	ctx context.Context
   153  }
   154  
   155  // RoundTrip implements http.RoundTripper, injecting c.ctx and the OpenTelemetry
   156  // propagation headers into the request. This ensures all requests are traced.
   157  func (c *ContextInjectingRoundTripper) RoundTrip(request *http.Request) (*http.Response, error) {
   158  	// RoundTrip is not permitted to modify the request, so we clone with this context
   159  	r := request.Clone(c.ctx)
   160  	// Inject the otel propagation headers
   161  	otel.GetTextMapPropagator().Inject(c.ctx, propagation.HeaderCarrier(r.Header))
   162  	return http.DefaultTransport.RoundTrip(r)
   163  }
   164  
   165  // rpcSpan is a helper for constructing an RPC span where we have both a client and server rpc operation
   166  func rpcSpan(op, client, server string, children ...expectedSpans) expectedSpans {
   167  	return expectedSpans{
   168  		Operation: op,
   169  		Service:   client,
   170  		Children: []expectedSpans{
   171  			{
   172  				Operation: op,
   173  				Service:   server,
   174  				Children:  children,
   175  			},
   176  		},
   177  	}
   178  }
   179  
   180  func httpSpan(endpoint string, children ...expectedSpans) expectedSpans {
   181  	return expectedSpans{
   182  		Operation: endpoint,
   183  		Service:   "boulder-wfe2",
   184  		Children: append(children,
   185  			rpcSpan("nonce.NonceService/Nonce", "boulder-wfe2", "nonce-service"),
   186  			rpcSpan("nonce.NonceService/Redeem", "boulder-wfe2", "nonce-service"),
   187  		),
   188  	}
   189  }
   190  
   191  func redisPipelineSpan(op, service string, children ...expectedSpans) expectedSpans {
   192  	return expectedSpans{
   193  		Operation: "redis.pipeline " + op,
   194  		Service:   service,
   195  		Children:  children,
   196  	}
   197  }
   198  
   199  // TestTraces tests that all the expected spans are present and properly connected
   200  func TestTraces(t *testing.T) {
   201  	t.Parallel()
   202  	if !strings.Contains(os.Getenv("BOULDER_CONFIG_DIR"), "test/config-next") {
   203  		t.Skip("OpenTelemetry is only configured in config-next")
   204  	}
   205  
   206  	traceID := traceIssuingTestCert(t)
   207  
   208  	wfe := "boulder-wfe2"
   209  	ra := "boulder-ra"
   210  	ca := "boulder-ca"
   211  
   212  	// A very stripped-down version of the expected call graph of a full issuance
   213  	// flow: just enough to ensure that our otel tracing is working without
   214  	// asserting too much about the exact set of RPCs we use under the hood.
   215  	expectedSpans := expectedSpans{
   216  		Operation: "TraceTest",
   217  		Service:   "integration.test",
   218  		Children: []expectedSpans{
   219  			{Operation: "/directory", Service: wfe},
   220  			{Operation: "/acme/new-nonce", Service: wfe, Children: []expectedSpans{
   221  				rpcSpan("nonce.NonceService/Nonce", wfe, "nonce-service")}},
   222  			httpSpan("/acme/new-acct",
   223  				redisPipelineSpan("get", wfe)),
   224  			httpSpan("/acme/new-order"),
   225  			httpSpan("/acme/authz/"),
   226  			httpSpan("/acme/chall/"),
   227  			httpSpan("/acme/finalize/",
   228  				rpcSpan("ra.RegistrationAuthority/FinalizeOrder", wfe, ra,
   229  					rpcSpan("ca.CertificateAuthority/IssueCertificate", ra, ca))),
   230  		},
   231  	}
   232  
   233  	// Retry checking for spans. Span submission is batched asynchronously, so we
   234  	// may have to wait for the DefaultScheduleDelay (5 seconds) for results to
   235  	// be available. Rather than always waiting, we retry a few times.
   236  	// Empirically, this test passes on the second or third try.
   237  	var trace Trace
   238  	found := false
   239  	const retries = 10
   240  	for range retries {
   241  		trace := getTraceFromJaeger(t, traceID)
   242  		if findSpans(trace, "", expectedSpans) {
   243  			found = true
   244  			break
   245  		}
   246  		time.Sleep(sdktrace.DefaultScheduleDelay / 5 * time.Millisecond)
   247  	}
   248  	test.Assert(t, found, fmt.Sprintf("Failed to find expected spans in Jaeger for trace %s", traceID))
   249  
   250  	test.AssertEquals(t, len(trace.Warnings), 0)
   251  	for _, span := range trace.Spans {
   252  		for _, warning := range span.Warnings {
   253  			if strings.Contains(warning, "clock skew adjustment disabled; not applying calculated delta") {
   254  				continue
   255  			}
   256  			t.Errorf("Span %s (%s) warning: %v", span.SpanID, span.OperationName, warning)
   257  		}
   258  	}
   259  }
   260  
   261  func traceIssuingTestCert(t *testing.T) trace.TraceID {
   262  	// Configure this integration test to trace to jaeger:4317 like Boulder will
   263  	shutdown := cmd.NewOpenTelemetry(cmd.OpenTelemetryConfig{
   264  		Endpoint:    "bjaeger:4317",
   265  		SampleRatio: 1,
   266  	}, blog.Get())
   267  	defer shutdown(context.Background())
   268  
   269  	tracer := otel.GetTracerProvider().Tracer("TraceTest")
   270  	ctx, span := tracer.Start(context.Background(), "TraceTest")
   271  	defer span.End()
   272  
   273  	// Provide an HTTP client with otel spans.
   274  	// The acme client doesn't pass contexts through, so we inject one.
   275  	option := acme.WithHTTPClient(&http.Client{
   276  		Timeout:   60 * time.Second,
   277  		Transport: &ContextInjectingRoundTripper{ctx},
   278  	})
   279  
   280  	c, err := acme.NewClient("http://boulder.service.consul:4001/directory", option)
   281  	test.AssertNotError(t, err, "acme.NewClient failed")
   282  
   283  	privKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
   284  	test.AssertNotError(t, err, "Generating ECDSA key failed")
   285  
   286  	account, err := c.NewAccount(privKey, false, true)
   287  	test.AssertNotError(t, err, "newAccount failed")
   288  
   289  	_, err = authAndIssue(&client{account, c}, nil, []acme.Identifier{{Type: "dns", Value: random_domain()}}, true, "")
   290  	test.AssertNotError(t, err, "authAndIssue failed")
   291  
   292  	return span.SpanContext().TraceID()
   293  }