github.com/grafana/pyroscope@v1.18.0/cmd/profilecli/canary_exporter.go (about) 1 // Provenance-includes-location: https://github.com/prometheus/blackbox_exporter/blob/9d3e8e8ab443772aefb4ba2c3010329fd6d9be84/prober/http.go 2 // Provenance-includes-license: Apache-2.0 3 // Provenance-includes-copyright: The Prometheus Authors. 4 5 // This has been mostly adapted to our use case from the blackbox exporter 6 7 package main 8 9 import ( 10 "context" 11 "crypto/sha256" 12 "crypto/tls" 13 "encoding/hex" 14 "fmt" 15 "io" 16 "net/http" 17 "net/http/httptrace" 18 "os" 19 "strconv" 20 "strings" 21 "sync" 22 "time" 23 24 "github.com/go-kit/log/level" 25 "github.com/grafana/dskit/multierror" 26 "github.com/prometheus/client_golang/prometheus" 27 "github.com/prometheus/client_golang/prometheus/promauto" 28 "github.com/prometheus/client_golang/prometheus/promhttp" 29 "go.uber.org/atomic" 30 ) 31 32 type canaryExporterParams struct { 33 *phlareClient 34 ListenAddress string 35 TestFrequency time.Duration 36 TestDelay time.Duration 37 QueryProbeSet string 38 } 39 40 func addCanaryExporterParams(ceCmd commander) *canaryExporterParams { 41 var ( 42 params = &canaryExporterParams{} 43 ) 44 ceCmd.Flag("listen-address", "Listen address for the canary exporter.").Default(":4101").StringVar(¶ms.ListenAddress) 45 ceCmd.Flag("test-frequency", "How often the specified Pyroscope cell should be tested.").Default("15s").DurationVar(¶ms.TestFrequency) 46 ceCmd.Flag("test-delay", "The delay between ingest and query requests.").Default("2s").DurationVar(¶ms.TestDelay) 47 ceCmd.Flag("query-probe-set", "Which set of probes to use for query requests. Available sets are \"default\" and \"all\".").Default("default").EnumVar(¶ms.QueryProbeSet, "default", "all") 48 params.phlareClient = addPhlareClient(ceCmd) 49 50 return params 51 } 52 53 type queryProbe struct { 54 name string 55 f func(ctx context.Context, now time.Time) error 56 } 57 58 type canaryExporter struct { 59 params *canaryExporterParams 60 reg *prometheus.Registry 61 mux *http.ServeMux 62 63 defaultTransport http.RoundTripper 64 metrics *canaryExporterMetrics 65 66 queryProbes []*queryProbe 67 68 hostname string 69 } 70 71 type canaryExporterMetrics struct { 72 success *prometheus.GaugeVec 73 duration *prometheus.HistogramVec 74 contentLength *prometheus.GaugeVec 75 bodyUncompressedLength *prometheus.GaugeVec 76 statusCode *prometheus.GaugeVec 77 isSSL prometheus.Gauge 78 probeSSLEarliestCertExpiry prometheus.Gauge 79 probeSSLLastChainExpiryTimestampSeconds prometheus.Gauge 80 probeTLSVersion *prometheus.GaugeVec 81 probeSSLLastInformation *prometheus.GaugeVec 82 probeHTTPVersion *prometheus.GaugeVec 83 } 84 85 func newCanaryExporterMetrics(reg prometheus.Registerer) *canaryExporterMetrics { 86 return &canaryExporterMetrics{ 87 success: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ 88 Name: "probe_success", 89 Help: "Displays whether or not the probe was a success", 90 }, []string{"name"}), 91 duration: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ 92 Name: "probe_http_duration_seconds", 93 Help: "Duration of http request by phase, summed over all redirects", 94 Buckets: prometheus.ExponentialBuckets(0.00025, 4, 10), 95 }, []string{"name", "phase"}), 96 97 contentLength: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ 98 Name: "probe_http_content_length", 99 Help: "Length of http content response", 100 }, []string{"name"}), 101 bodyUncompressedLength: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ 102 Name: "probe_http_uncompressed_body_length", 103 Help: "Length of uncompressed response body", 104 }, []string{"name"}), 105 statusCode: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ 106 Name: "probe_http_status_code", 107 Help: "Response HTTP status code", 108 }, []string{"name"}), 109 isSSL: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ 110 Name: "probe_http_ssl", 111 Help: "Indicates if SSL was used for the final redirect", 112 }), 113 probeSSLEarliestCertExpiry: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ 114 Name: "probe_ssl_earliest_cert_expiry", 115 Help: "Returns last SSL chain expiry in unixtime", 116 }), 117 probeSSLLastChainExpiryTimestampSeconds: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ 118 Name: "probe_ssl_last_chain_expiry_timestamp_seconds", 119 Help: "Returns last SSL chain expiry in timestamp", 120 }), 121 probeTLSVersion: promauto.With(reg).NewGaugeVec( 122 prometheus.GaugeOpts{ 123 Name: "probe_tls_version_info", 124 Help: "Returns the TLS version used or NaN when unknown", 125 }, 126 []string{"version"}, 127 ), 128 probeSSLLastInformation: promauto.With(reg).NewGaugeVec( 129 prometheus.GaugeOpts{ 130 Name: "probe_ssl_last_chain_info", 131 Help: "Contains SSL leaf certificate information", 132 }, 133 []string{"fingerprint_sha256", "subject", "issuer", "subjectalternative"}, 134 ), 135 probeHTTPVersion: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ 136 Name: "probe_http_version", 137 Help: "Returns the version of HTTP of the probe response", 138 }, []string{"name"}), 139 } 140 } 141 142 func newCanaryExporter(params *canaryExporterParams) *canaryExporter { 143 // Disable keepalives messing with probes 144 defaultTransport := http.DefaultTransport.(*http.Transport) 145 defaultTransport.DisableKeepAlives = true 146 params.defaultTransport = defaultTransport 147 148 reg := prometheus.NewRegistry() 149 ce := &canaryExporter{ 150 reg: reg, 151 mux: http.NewServeMux(), 152 params: params, 153 154 hostname: "unknown", 155 defaultTransport: params.httpClient().Transport, 156 157 metrics: newCanaryExporterMetrics(reg), 158 159 queryProbes: make([]*queryProbe, 0), 160 } 161 162 ce.queryProbes = append(ce.queryProbes, &queryProbe{name: "query-select-merge-profile", f: ce.testSelectMergeProfile}) 163 ce.queryProbes = append(ce.queryProbes, &queryProbe{name: "query-select-merge-otlp-profile", f: ce.testSelectMergeOTLPProfile}) 164 165 if params.QueryProbeSet == "all" { 166 ce.queryProbes = append(ce.queryProbes, &queryProbe{"query-profile-types", ce.testProfileTypes}) 167 ce.queryProbes = append(ce.queryProbes, &queryProbe{"query-series", ce.testSeries}) 168 ce.queryProbes = append(ce.queryProbes, &queryProbe{"query-label-names", ce.testLabelNames}) 169 ce.queryProbes = append(ce.queryProbes, &queryProbe{"query-label-values", ce.testLabelValues}) 170 ce.queryProbes = append(ce.queryProbes, &queryProbe{"query-select-series", ce.testSelectSeries}) 171 ce.queryProbes = append(ce.queryProbes, &queryProbe{"query-select-merge-stacktraces", ce.testSelectMergeStacktraces}) 172 ce.queryProbes = append(ce.queryProbes, &queryProbe{"query-select-merge-span-profile", ce.testSelectMergeSpanProfile}) 173 ce.queryProbes = append(ce.queryProbes, &queryProbe{"query-get-profile-stats", ce.testGetProfileStats}) 174 ce.queryProbes = append(ce.queryProbes, &queryProbe{"render", ce.testRender}) 175 ce.queryProbes = append(ce.queryProbes, &queryProbe{"render-diff", ce.testRenderDiff}) 176 } 177 178 metricsPath := "/metrics" 179 ce.mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { 180 _, _ = w.Write([]byte(`<html> 181 <head><title>Pyroscope Blackbox Exporter</title></head> 182 <body> 183 <h1>Pyroscope Blackbox Exporter</h1> 184 <p><a href="` + metricsPath + `">Metrics</a></p> 185 </body> 186 </html>`)) 187 }) 188 189 // Expose the registered metrics via HTTP. 190 ce.mux.Handle(metricsPath, promhttp.HandlerFor( 191 ce.reg, 192 promhttp.HandlerOpts{ 193 // Opt into OpenMetrics to support exemplars. 194 EnableOpenMetrics: true, 195 // Pass custom registry 196 Registry: ce.reg, 197 }, 198 )) 199 200 if hostname, err := os.Hostname(); err == nil { 201 ce.hostname = hostname 202 } 203 204 return ce 205 } 206 207 func (ce *canaryExporter) run(ctx context.Context) error { 208 209 run := func(ctx context.Context) { 210 if err := ce.testPyroscopeCell(ctx); err != nil { 211 for _, line := range strings.Split(err.Error(), "\n") { 212 level.Error(logger).Log("msg", "error testing pyroscope cell", "err", line) 213 } 214 } 215 } 216 run(ctx) 217 218 go func() { 219 ticker := time.NewTicker(ce.params.TestFrequency) 220 defer ticker.Stop() 221 222 for { 223 select { 224 case <-ctx.Done(): 225 return 226 case n := <-ticker.C: 227 cCtx, cancel := context.WithDeadline(ctx, n.Add(ce.params.TestFrequency)) 228 run(cCtx) 229 cancel() 230 } 231 } 232 }() 233 234 if err := http.ListenAndServe(ce.params.ListenAddress, ce.mux); err != nil { 235 return err 236 } 237 238 return nil 239 } 240 241 func (ce *canaryExporter) doTrace(ctx context.Context, probeName string) (rCtx context.Context, done func(bool)) { 242 level.Info(logger).Log("msg", "starting probe", "probe_name", probeName) 243 tt := newInstrumentedTransport(ce.defaultTransport, ce.metrics, probeName) 244 ce.params.client.Transport = tt 245 246 trace := &httptrace.ClientTrace{ 247 DNSStart: tt.DNSStart, 248 DNSDone: tt.DNSDone, 249 ConnectStart: tt.ConnectStart, 250 ConnectDone: tt.ConnectDone, 251 GotConn: tt.GotConn, 252 GotFirstResponseByte: tt.GotFirstResponseByte, 253 TLSHandshakeStart: tt.TLSHandshakeStart, 254 TLSHandshakeDone: tt.TLSHandshakeDone, 255 } 256 rCtx = httptrace.WithClientTrace(ctx, trace) 257 258 return rCtx, func(result bool) { 259 // At this point body is fully read and we can write end time. 260 // Note: tt.current may be nil for non-HTTP probes (e.g., gRPC) 261 if tt.current != nil { 262 tt.current.end = time.Now() 263 } 264 265 // record body size 266 ce.metrics.bodyUncompressedLength.WithLabelValues(probeName).Set(float64(tt.bodySize.Load())) 267 268 // aggregate duration for all requests (that is to support redirects) 269 durations := make(map[string]float64) 270 271 for _, trace := range tt.traces { 272 durations["resolve"] += trace.dnsDone.Sub(trace.start).Seconds() 273 274 // Continue here if we never got a connection because a request failed. 275 if trace.gotConn.IsZero() { 276 continue 277 } 278 279 if trace.tls { 280 // dnsDone must be set if gotConn was set. 281 durations["connect"] += trace.connectDone.Sub(trace.dnsDone).Seconds() 282 durations["tls"] += trace.tlsDone.Sub(trace.tlsStart).Seconds() 283 } else { 284 durations["connect"] += trace.gotConn.Sub(trace.dnsDone).Seconds() 285 } 286 287 // Continue here if we never got a response from the server. 288 if trace.responseStart.IsZero() { 289 continue 290 } 291 durations["processing"] += trace.responseStart.Sub(trace.gotConn).Seconds() 292 293 // Continue here if we never read the full response from the server. 294 // Usually this means that request either failed or was redirected. 295 if trace.end.IsZero() { 296 continue 297 } 298 durations["transfer"] += trace.end.Sub(trace.responseStart).Seconds() 299 } 300 301 // now store the values in the histogram 302 for phase, value := range durations { 303 ce.metrics.duration.WithLabelValues(probeName, phase).Observe(value) 304 } 305 306 if m := ce.metrics.success.WithLabelValues(probeName); result { 307 m.Set(1) 308 } else { 309 m.Set(0) 310 } 311 } 312 } 313 314 func (ce *canaryExporter) testPyroscopeCell(ctx context.Context) error { 315 now := time.Now() 316 317 // Run all ingest probes 318 var ingestErrors multierror.MultiError 319 320 // ingest a fake profile using the original method 321 if err := ce.runProbe(ctx, "ingest", func(rCtx context.Context) error { 322 return ce.testIngestProfile(rCtx, now) 323 }); err != nil { 324 ingestErrors.Add(fmt.Errorf("error during standard ingestion: %w", err)) 325 } 326 327 // ingest via OTLP gRPC 328 // if err := ce.runProbe(ctx, "ingest-otlp-grpc", func(rCtx context.Context) error { 329 // return ce.testIngestOTLPGrpc(rCtx, now) 330 // }); err != nil { 331 // ingestErrors.Add(fmt.Errorf("error during OTLP gRPC ingestion: %w", err)) 332 // } 333 334 // ingest via OTLP HTTP/JSON 335 if err := ce.runProbe(ctx, "ingest-otlp-http-json", func(rCtx context.Context) error { 336 return ce.testIngestOTLPHttpJson(rCtx, now) 337 }); err != nil { 338 ingestErrors.Add(fmt.Errorf("error during OTLP HTTP/JSON ingestion: %w", err)) 339 } 340 341 // ingest via OTLP HTTP/Protobuf 342 // Note: HTTP endpoints are not yet implemented in Pyroscope (see pkg/api/api.go:204-205) 343 if err := ce.runProbe(ctx, "ingest-otlp-http-protobuf", func(rCtx context.Context) error { 344 return ce.testIngestOTLPHttpProtobuf(rCtx, now) 345 }); err != nil { 346 ingestErrors.Add(fmt.Errorf("error during OTLP HTTP/Protobuf ingestion: %w", err)) 347 } 348 349 // Report ingestion errors if any 350 if ingestErrors.Err() != nil { 351 for _, line := range strings.Split(ingestErrors.Err().Error(), "\n") { 352 level.Error(logger).Log("msg", "ingestion error", "err", line) 353 } 354 } 355 356 if ce.params.TestDelay > 0 { 357 level.Info(logger).Log("msg", "waiting before running a query", "delay", ce.params.TestDelay) 358 select { 359 case <-time.After(ce.params.TestDelay): 360 case <-ctx.Done(): 361 } 362 } 363 364 // Now try to query the data back 365 var multiError multierror.MultiError 366 for _, probe := range ce.queryProbes { 367 err := ce.runProbe(ctx, probe.name, func(rCtx context.Context) error { 368 return probe.f(rCtx, now) 369 }) 370 multiError.Add(err) 371 } 372 if multiError.Err() != nil { 373 return fmt.Errorf("%d error(s) reported from query probes", len(multiError)) 374 } 375 376 return nil 377 } 378 379 func (ce *canaryExporter) runProbe(ctx context.Context, probeName string, probeFunc func(ctx context.Context) error) error { 380 rCtx, done := ce.doTrace(ctx, probeName) 381 result := false 382 defer func() { 383 done(result) 384 }() 385 err := probeFunc(rCtx) 386 if err != nil { 387 level.Error(logger).Log("msg", "probe failed", "probe_name", probeName, "err", err) 388 } else { 389 level.Info(logger).Log("msg", "probe successful", "probe_name", probeName) 390 result = true 391 } 392 return err 393 } 394 395 // roundTripTrace holds timings for a single HTTP roundtrip. 396 type roundTripTrace struct { 397 tls bool 398 start time.Time 399 dnsDone time.Time 400 connectDone time.Time 401 gotConn time.Time 402 responseStart time.Time 403 end time.Time 404 tlsStart time.Time 405 tlsDone time.Time 406 } 407 408 // transport is a custom transport keeping traces for each HTTP roundtrip. 409 type transport struct { 410 Transport http.RoundTripper 411 name string 412 metrics *canaryExporterMetrics 413 414 mu sync.Mutex 415 traces []*roundTripTrace 416 current *roundTripTrace 417 bodySize *atomic.Int64 418 } 419 420 func newInstrumentedTransport(rt http.RoundTripper, m *canaryExporterMetrics, name string) *transport { 421 return &transport{ 422 Transport: rt, 423 traces: []*roundTripTrace{}, 424 name: name, 425 metrics: m, 426 bodySize: atomic.NewInt64(0), 427 } 428 } 429 430 type readerWrapper struct { 431 io.ReadCloser 432 bodySize *atomic.Int64 433 } 434 435 func (rw *readerWrapper) Read(p []byte) (n int, err error) { 436 n, err = rw.ReadCloser.Read(p) 437 rw.bodySize.Add(int64(n)) 438 return n, err 439 } 440 441 // RoundTrip switches to a new trace, then runs embedded RoundTripper. 442 func (t *transport) RoundTrip(req *http.Request) (*http.Response, error) { 443 level.Debug(logger).Log("msg", "making HTTP request", "url", req.URL.String(), "host", req.Host) 444 445 trace := &roundTripTrace{} 446 if req.URL.Scheme == "https" { 447 trace.tls = true 448 } 449 t.current = trace 450 t.traces = append(t.traces, trace) 451 452 resp, err := t.Transport.RoundTrip(req) 453 if err != nil { 454 return resp, err 455 } 456 457 resp.Body = &readerWrapper{ReadCloser: resp.Body, bodySize: t.bodySize} 458 459 if resp.TLS != nil { 460 t.metrics.isSSL.Set(float64(1)) 461 t.metrics.probeSSLEarliestCertExpiry.Set(float64(getEarliestCertExpiry(resp.TLS).Unix())) 462 t.metrics.probeTLSVersion.WithLabelValues(getTLSVersion(resp.TLS)).Set(1) 463 t.metrics.probeSSLLastChainExpiryTimestampSeconds.Set(float64(getLastChainExpiry(resp.TLS).Unix())) 464 t.metrics.probeSSLLastInformation.WithLabelValues(getFingerprint(resp.TLS), getSubject(resp.TLS), getIssuer(resp.TLS), getDNSNames(resp.TLS)).Set(1) 465 } 466 467 t.metrics.statusCode.WithLabelValues(t.name).Set(float64(resp.StatusCode)) 468 t.metrics.contentLength.WithLabelValues(t.name).Set(float64(resp.ContentLength)) 469 470 var httpVersionNumber float64 471 httpVersionNumber, err = strconv.ParseFloat(strings.TrimPrefix(resp.Proto, "HTTP/"), 64) 472 if err != nil { 473 level.Error(logger).Log("msg", "Error parsing version number from HTTP version", "err", err) 474 } 475 t.metrics.probeHTTPVersion.WithLabelValues(t.name).Set(httpVersionNumber) 476 477 return resp, err 478 } 479 480 func (t *transport) DNSStart(_ httptrace.DNSStartInfo) { 481 t.mu.Lock() 482 defer t.mu.Unlock() 483 t.current.start = time.Now() 484 } 485 func (t *transport) DNSDone(_ httptrace.DNSDoneInfo) { 486 t.mu.Lock() 487 defer t.mu.Unlock() 488 t.current.dnsDone = time.Now() 489 } 490 func (ts *transport) ConnectStart(_, _ string) { 491 ts.mu.Lock() 492 defer ts.mu.Unlock() 493 t := ts.current 494 // No DNS resolution because we connected to IP directly. 495 if t.dnsDone.IsZero() { 496 t.start = time.Now() 497 t.dnsDone = t.start 498 } 499 } 500 func (t *transport) ConnectDone(net, addr string, err error) { 501 t.mu.Lock() 502 defer t.mu.Unlock() 503 t.current.connectDone = time.Now() 504 } 505 func (t *transport) GotConn(_ httptrace.GotConnInfo) { 506 t.mu.Lock() 507 defer t.mu.Unlock() 508 t.current.gotConn = time.Now() 509 } 510 func (t *transport) GotFirstResponseByte() { 511 t.mu.Lock() 512 defer t.mu.Unlock() 513 t.current.responseStart = time.Now() 514 } 515 func (t *transport) TLSHandshakeStart() { 516 t.mu.Lock() 517 defer t.mu.Unlock() 518 t.current.tlsStart = time.Now() 519 } 520 func (t *transport) TLSHandshakeDone(_ tls.ConnectionState, _ error) { 521 t.mu.Lock() 522 defer t.mu.Unlock() 523 t.current.tlsDone = time.Now() 524 } 525 func getEarliestCertExpiry(state *tls.ConnectionState) time.Time { 526 earliest := time.Time{} 527 for _, cert := range state.PeerCertificates { 528 if (earliest.IsZero() || cert.NotAfter.Before(earliest)) && !cert.NotAfter.IsZero() { 529 earliest = cert.NotAfter 530 } 531 } 532 return earliest 533 } 534 535 func getFingerprint(state *tls.ConnectionState) string { 536 cert := state.PeerCertificates[0] 537 fingerprint := sha256.Sum256(cert.Raw) 538 return hex.EncodeToString(fingerprint[:]) 539 } 540 541 func getSubject(state *tls.ConnectionState) string { 542 cert := state.PeerCertificates[0] 543 return cert.Subject.String() 544 } 545 546 func getIssuer(state *tls.ConnectionState) string { 547 cert := state.PeerCertificates[0] 548 return cert.Issuer.String() 549 } 550 551 func getDNSNames(state *tls.ConnectionState) string { 552 cert := state.PeerCertificates[0] 553 return strings.Join(cert.DNSNames, ",") 554 } 555 556 func getLastChainExpiry(state *tls.ConnectionState) time.Time { 557 lastChainExpiry := time.Time{} 558 for _, chain := range state.VerifiedChains { 559 earliestCertExpiry := time.Time{} 560 for _, cert := range chain { 561 if (earliestCertExpiry.IsZero() || cert.NotAfter.Before(earliestCertExpiry)) && !cert.NotAfter.IsZero() { 562 earliestCertExpiry = cert.NotAfter 563 } 564 } 565 if lastChainExpiry.IsZero() || lastChainExpiry.Before(earliestCertExpiry) { 566 lastChainExpiry = earliestCertExpiry 567 } 568 569 } 570 return lastChainExpiry 571 } 572 573 func getTLSVersion(state *tls.ConnectionState) string { 574 switch state.Version { 575 case tls.VersionTLS10: 576 return "TLS 1.0" 577 case tls.VersionTLS11: 578 return "TLS 1.1" 579 case tls.VersionTLS12: 580 return "TLS 1.2" 581 case tls.VersionTLS13: 582 return "TLS 1.3" 583 default: 584 return "unknown" 585 } 586 }