github.com/letsencrypt/boulder@v0.20251208.0/test/integration/otel_test.go (about) 1 //go:build integration 2 3 package integration 4 5 import ( 6 "context" 7 "crypto/ecdsa" 8 "crypto/elliptic" 9 "crypto/rand" 10 "encoding/json" 11 "fmt" 12 "io" 13 "net/http" 14 "os" 15 "strings" 16 "testing" 17 "time" 18 19 "github.com/eggsampler/acme/v3" 20 "go.opentelemetry.io/otel" 21 "go.opentelemetry.io/otel/propagation" 22 sdktrace "go.opentelemetry.io/otel/sdk/trace" 23 "go.opentelemetry.io/otel/trace" 24 25 "github.com/letsencrypt/boulder/cmd" 26 blog "github.com/letsencrypt/boulder/log" 27 "github.com/letsencrypt/boulder/test" 28 ) 29 30 // TraceResponse is the list of traces returned from Jaeger's trace search API 31 // We always search for a single trace by ID, so this should be length 1. 32 // This is a specialization of Jaeger's structuredResponse type which 33 // uses []interface{} upstream. 34 type TraceResponse struct { 35 Data []Trace 36 } 37 38 // Trace represents a single trace in Jaeger's API 39 // See https://pkg.go.dev/github.com/jaegertracing/jaeger/model/json#Trace 40 type Trace struct { 41 TraceID string 42 Spans []Span 43 Processes map[string]struct { 44 ServiceName string 45 } 46 Warnings []string 47 } 48 49 // Span represents a single span in Jaeger's API 50 // See https://pkg.go.dev/github.com/jaegertracing/jaeger/model/json#Span 51 type Span struct { 52 SpanID string 53 OperationName string 54 Warnings []string 55 ProcessID string 56 References []struct { 57 RefType string 58 TraceID string 59 SpanID string 60 } 61 } 62 63 func getTraceFromJaeger(t *testing.T, traceID trace.TraceID) Trace { 64 t.Helper() 65 traceURL := "http://bjaeger:16686/api/traces/" + traceID.String() 66 resp, err := http.Get(traceURL) 67 test.AssertNotError(t, err, "failed to trace from jaeger: "+traceID.String()) 68 if resp.StatusCode == http.StatusNotFound { 69 t.Fatalf("jaeger returned 404 for trace %s", traceID) 70 } 71 test.AssertEquals(t, resp.StatusCode, http.StatusOK) 72 73 body, err := io.ReadAll(resp.Body) 74 test.AssertNotError(t, err, "failed to read trace body") 75 76 var parsed TraceResponse 77 err = json.Unmarshal(body, &parsed) 78 test.AssertNotError(t, err, "failed to decode traces body") 79 80 if len(parsed.Data) != 1 { 81 t.Fatalf("expected to get exactly one trace from jaeger for %s: %v", traceID, parsed) 82 } 83 84 return parsed.Data[0] 85 } 86 87 type expectedSpans struct { 88 Operation string 89 Service string 90 Children []expectedSpans 91 } 92 93 // isParent returns true if the given span has a parent of ParentID 94 // The empty string means no ParentID 95 func isParent(parentID string, span Span) bool { 96 if len(span.References) == 0 { 97 return parentID == "" 98 } 99 for _, ref := range span.References { 100 // In OpenTelemetry, CHILD_OF is the only reference, but Jaeger supports other systems. 101 if ref.RefType == "CHILD_OF" { 102 return ref.SpanID == parentID 103 } 104 } 105 return false 106 } 107 108 func missingChildren(trace Trace, spanID string, children []expectedSpans) bool { 109 for _, child := range children { 110 if !findSpans(trace, spanID, child) { 111 // Missing Child 112 return true 113 } 114 } 115 return false 116 } 117 118 // findSpans checks if the expectedSpan and its expected children are found in trace 119 func findSpans(trace Trace, parentSpan string, expectedSpan expectedSpans) bool { 120 for _, span := range trace.Spans { 121 if !isParent(parentSpan, span) { 122 continue 123 } 124 if trace.Processes[span.ProcessID].ServiceName != expectedSpan.Service { 125 continue 126 } 127 if span.OperationName != expectedSpan.Operation { 128 continue 129 } 130 if missingChildren(trace, span.SpanID, expectedSpan.Children) { 131 continue 132 } 133 134 // This span has the correct parent, service, operation, and children 135 return true 136 } 137 fmt.Printf("did not find span %s::%s with parent '%s'\n", expectedSpan.Service, expectedSpan.Operation, parentSpan) 138 return false 139 } 140 141 // ContextInjectingRoundTripper holds a context that is added to every request 142 // sent through this RoundTripper, propagating the OpenTelemetry trace through 143 // the requests made with it. 144 // 145 // This is useful for tracing HTTP clients which don't pass through a context, 146 // notably including the eggsampler ACME client used in this test. 147 // 148 // This test uses a trace started in the test to connect all the outgoing 149 // requests into a trace that is retrieved from Jaeger's API to make assertions 150 // about the spans from Boulder. 151 type ContextInjectingRoundTripper struct { 152 ctx context.Context 153 } 154 155 // RoundTrip implements http.RoundTripper, injecting c.ctx and the OpenTelemetry 156 // propagation headers into the request. This ensures all requests are traced. 157 func (c *ContextInjectingRoundTripper) RoundTrip(request *http.Request) (*http.Response, error) { 158 // RoundTrip is not permitted to modify the request, so we clone with this context 159 r := request.Clone(c.ctx) 160 // Inject the otel propagation headers 161 otel.GetTextMapPropagator().Inject(c.ctx, propagation.HeaderCarrier(r.Header)) 162 return http.DefaultTransport.RoundTrip(r) 163 } 164 165 // rpcSpan is a helper for constructing an RPC span where we have both a client and server rpc operation 166 func rpcSpan(op, client, server string, children ...expectedSpans) expectedSpans { 167 return expectedSpans{ 168 Operation: op, 169 Service: client, 170 Children: []expectedSpans{ 171 { 172 Operation: op, 173 Service: server, 174 Children: children, 175 }, 176 }, 177 } 178 } 179 180 func httpSpan(endpoint string, children ...expectedSpans) expectedSpans { 181 return expectedSpans{ 182 Operation: endpoint, 183 Service: "boulder-wfe2", 184 Children: append(children, 185 rpcSpan("nonce.NonceService/Nonce", "boulder-wfe2", "nonce-service"), 186 rpcSpan("nonce.NonceService/Redeem", "boulder-wfe2", "nonce-service"), 187 ), 188 } 189 } 190 191 func redisPipelineSpan(op, service string, children ...expectedSpans) expectedSpans { 192 return expectedSpans{ 193 Operation: "redis.pipeline " + op, 194 Service: service, 195 Children: children, 196 } 197 } 198 199 // TestTraces tests that all the expected spans are present and properly connected 200 func TestTraces(t *testing.T) { 201 t.Parallel() 202 if !strings.Contains(os.Getenv("BOULDER_CONFIG_DIR"), "test/config-next") { 203 t.Skip("OpenTelemetry is only configured in config-next") 204 } 205 206 traceID := traceIssuingTestCert(t) 207 208 wfe := "boulder-wfe2" 209 ra := "boulder-ra" 210 ca := "boulder-ca" 211 212 // A very stripped-down version of the expected call graph of a full issuance 213 // flow: just enough to ensure that our otel tracing is working without 214 // asserting too much about the exact set of RPCs we use under the hood. 215 expectedSpans := expectedSpans{ 216 Operation: "TraceTest", 217 Service: "integration.test", 218 Children: []expectedSpans{ 219 {Operation: "/directory", Service: wfe}, 220 {Operation: "/acme/new-nonce", Service: wfe, Children: []expectedSpans{ 221 rpcSpan("nonce.NonceService/Nonce", wfe, "nonce-service")}}, 222 httpSpan("/acme/new-acct", 223 redisPipelineSpan("get", wfe)), 224 httpSpan("/acme/new-order"), 225 httpSpan("/acme/authz/"), 226 httpSpan("/acme/chall/"), 227 httpSpan("/acme/finalize/", 228 rpcSpan("ra.RegistrationAuthority/FinalizeOrder", wfe, ra, 229 rpcSpan("ca.CertificateAuthority/IssueCertificate", ra, ca))), 230 }, 231 } 232 233 // Retry checking for spans. Span submission is batched asynchronously, so we 234 // may have to wait for the DefaultScheduleDelay (5 seconds) for results to 235 // be available. Rather than always waiting, we retry a few times. 236 // Empirically, this test passes on the second or third try. 237 var trace Trace 238 found := false 239 const retries = 10 240 for range retries { 241 trace := getTraceFromJaeger(t, traceID) 242 if findSpans(trace, "", expectedSpans) { 243 found = true 244 break 245 } 246 time.Sleep(sdktrace.DefaultScheduleDelay / 5 * time.Millisecond) 247 } 248 test.Assert(t, found, fmt.Sprintf("Failed to find expected spans in Jaeger for trace %s", traceID)) 249 250 test.AssertEquals(t, len(trace.Warnings), 0) 251 for _, span := range trace.Spans { 252 for _, warning := range span.Warnings { 253 if strings.Contains(warning, "clock skew adjustment disabled; not applying calculated delta") { 254 continue 255 } 256 t.Errorf("Span %s (%s) warning: %v", span.SpanID, span.OperationName, warning) 257 } 258 } 259 } 260 261 func traceIssuingTestCert(t *testing.T) trace.TraceID { 262 // Configure this integration test to trace to jaeger:4317 like Boulder will 263 shutdown := cmd.NewOpenTelemetry(cmd.OpenTelemetryConfig{ 264 Endpoint: "bjaeger:4317", 265 SampleRatio: 1, 266 }, blog.Get()) 267 defer shutdown(context.Background()) 268 269 tracer := otel.GetTracerProvider().Tracer("TraceTest") 270 ctx, span := tracer.Start(context.Background(), "TraceTest") 271 defer span.End() 272 273 // Provide an HTTP client with otel spans. 274 // The acme client doesn't pass contexts through, so we inject one. 275 option := acme.WithHTTPClient(&http.Client{ 276 Timeout: 60 * time.Second, 277 Transport: &ContextInjectingRoundTripper{ctx}, 278 }) 279 280 c, err := acme.NewClient("http://boulder.service.consul:4001/directory", option) 281 test.AssertNotError(t, err, "acme.NewClient failed") 282 283 privKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) 284 test.AssertNotError(t, err, "Generating ECDSA key failed") 285 286 account, err := c.NewAccount(privKey, false, true) 287 test.AssertNotError(t, err, "newAccount failed") 288 289 _, err = authAndIssue(&client{account, c}, nil, []acme.Identifier{{Type: "dns", Value: random_domain()}}, true, "") 290 test.AssertNotError(t, err, "authAndIssue failed") 291 292 return span.SpanContext().TraceID() 293 }