istio.io/istio@v0.0.0-20240520182934-d79c90f27776/istioctl/pkg/metrics/metrics.go

istio.io/istio@v0.0.0-20240520182934-d79c90f27776/istioctl/pkg/metrics/metrics.go (about)

     1  // Copyright Istio Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package metrics
    16  
    17  import (
    18  	"context"
    19  	"errors"
    20  	"fmt"
    21  	"io"
    22  	"strings"
    23  	"text/tabwriter"
    24  	"time"
    25  
    26  	"github.com/hashicorp/go-multierror"
    27  	"github.com/prometheus/client_golang/api"
    28  	promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
    29  	"github.com/prometheus/common/model"
    30  	"github.com/spf13/cobra"
    31  
    32  	"istio.io/istio/istioctl/pkg/cli"
    33  	"istio.io/istio/istioctl/pkg/clioptions"
    34  	"istio.io/istio/istioctl/pkg/completion"
    35  	"istio.io/istio/istioctl/pkg/dashboard"
    36  	"istio.io/istio/pkg/log"
    37  )
    38  
// Package-level state shared by the metrics command.
var (
	// metricsOpts holds control-plane options; run reads its Revision field
	// when creating the CLI client.
	// NOTE(review): no flag binding for metricsOpts is visible in this file,
	// so Revision presumably stays at its zero value — confirm.
	metricsOpts     clioptions.ControlPlaneOptions
	// metricsDuration is the query window bound to the --duration/-d flag
	// (defaults to one minute, see Cmd).
	metricsDuration time.Duration
)
    43  
// Prometheus label and metric names used to build the PromQL queries below.
const (
	// destWorkloadLabel is the label matched against the requested workload name.
	destWorkloadLabel          = "destination_workload"
	// destWorkloadNamespaceLabel is the label matched against the requested workload namespace.
	destWorkloadNamespaceLabel = "destination_workload_namespace"
	// reqTot is the request counter metric used for total and error request rates.
	reqTot                     = "istio_requests_total"
	// reqDur is the request duration histogram (milliseconds); queries append "_bucket" to it.
	reqDur                     = "istio_request_duration_milliseconds"
)
    50  
    51  func Cmd(ctx cli.Context) *cobra.Command {
    52  	cmd := &cobra.Command{
    53  		Use:   "metrics <workload name>...",
    54  		Short: "Prints the metrics for the specified workload(s) when running in Kubernetes.",
    55  		Long: `
    56  Prints the metrics for the specified service(s) when running in Kubernetes.
    57  
    58  This command finds a Prometheus pod running in the specified istio system
    59  namespace. It then executes a series of queries per requested workload to
    60  find the following top-level workload metrics: total requests per second,
    61  error rate, and request latency at p50, p90, and p99 percentiles. The
    62  query results are printed to the console, organized by workload name.
    63  
    64  All metrics returned are from server-side reports. This means that latencies
    65  and error rates are from the perspective of the service itself and not of an
    66  individual client (or aggregate set of clients). Rates and latencies are
    67  calculated over a time interval of 1 minute.
    68  `,
    69  		Example: `  # Retrieve workload metrics for productpage-v1 workload
    70    istioctl experimental metrics productpage-v1
    71  
    72    # Retrieve workload metrics for various services with custom duration
    73    istioctl experimental metrics productpage-v1 -d 2m
    74  
    75    # Retrieve workload metrics for various services in the different namespaces
    76    istioctl experimental metrics productpage-v1.foo reviews-v1.bar ratings-v1.baz`,
    77  		// nolint: goimports
    78  		Aliases: []string{"m"},
    79  		Args: func(cmd *cobra.Command, args []string) error {
    80  			if len(args) < 1 {
    81  				cmd.Println(cmd.UsageString())
    82  				return fmt.Errorf("metrics requires workload name")
    83  			}
    84  			return nil
    85  		},
    86  		RunE: func(cmd *cobra.Command, args []string) error {
    87  			return run(cmd, ctx, args)
    88  		},
    89  		DisableFlagsInUseLine: true,
    90  		ValidArgsFunction:     completion.ValidPodsNameArgs(ctx),
    91  	}
    92  
    93  	cmd.PersistentFlags().DurationVarP(&metricsDuration, "duration", "d", time.Minute, "Duration of query metrics, default value is 1m.")
    94  
    95  	return cmd
    96  }
    97  
    98  type workloadMetrics struct {
    99  	workload                           string
   100  	totalRPS, errorRPS                 float64
   101  	p50Latency, p90Latency, p99Latency time.Duration
   102  }
   103  
   104  func run(c *cobra.Command, ctx cli.Context, args []string) error {
   105  	log.Debugf("metrics command invoked for workload(s): %v", args)
   106  
   107  	client, err := ctx.CLIClientWithRevision(metricsOpts.Revision)
   108  	if err != nil {
   109  		return fmt.Errorf("failed to create k8s client: %v", err)
   110  	}
   111  
   112  	pl, err := client.PodsForSelector(context.TODO(), ctx.IstioNamespace(), "app.kubernetes.io/name=prometheus")
   113  	if err != nil {
   114  		return fmt.Errorf("not able to locate Prometheus pod: %v", err)
   115  	}
   116  
   117  	if len(pl.Items) < 1 {
   118  		return errors.New("no Prometheus pods found")
   119  	}
   120  
   121  	// only use the first pod in the list
   122  	promPod := pl.Items[0]
   123  	fw, err := client.NewPortForwarder(promPod.Name, ctx.IstioNamespace(), "", 0, 9090)
   124  	if err != nil {
   125  		return fmt.Errorf("could not build port forwarder for prometheus: %v", err)
   126  	}
   127  
   128  	if err = fw.Start(); err != nil {
   129  		return fmt.Errorf("failure running port forward process: %v", err)
   130  	}
   131  
   132  	// Close the forwarder either when we exit or when an this processes is interrupted.
   133  	defer fw.Close()
   134  	dashboard.ClosePortForwarderOnInterrupt(fw)
   135  
   136  	log.Debugf("port-forward to prometheus pod ready")
   137  
   138  	promAPI, err := prometheusAPI(fmt.Sprintf("http://%s", fw.Address()))
   139  	if err != nil {
   140  		return fmt.Errorf("failure running port forward process: %v", err)
   141  	}
   142  
   143  	printHeader(c.OutOrStdout())
   144  
   145  	workloads := args
   146  	for _, workload := range workloads {
   147  		sm, err := metrics(promAPI, workload, metricsDuration)
   148  		if err != nil {
   149  			return fmt.Errorf("could not build metrics for workload '%s': %v", workload, err)
   150  		}
   151  
   152  		printMetrics(c.OutOrStdout(), sm)
   153  	}
   154  	return nil
   155  }
   156  
   157  func prometheusAPI(address string) (promv1.API, error) {
   158  	promClient, err := api.NewClient(api.Config{Address: address})
   159  	if err != nil {
   160  		return nil, fmt.Errorf("could not build prometheus client: %v", err)
   161  	}
   162  	return promv1.NewAPI(promClient), nil
   163  }
   164  
   165  func metrics(promAPI promv1.API, workload string, duration time.Duration) (workloadMetrics, error) {
   166  	parts := strings.Split(workload, ".")
   167  	wname := parts[0]
   168  	wns := ""
   169  	if len(parts) > 1 {
   170  		wns = parts[1]
   171  	}
   172  
   173  	rpsQuery := fmt.Sprintf(`sum(rate(%s{%s=~"%s.*", %s=~"%s.*",reporter="destination"}[%s]))`,
   174  		reqTot, destWorkloadLabel, wname, destWorkloadNamespaceLabel, wns, duration)
   175  	errRPSQuery := fmt.Sprintf(`sum(rate(%s{%s=~"%s.*", %s=~"%s.*",reporter="destination",response_code=~"[45][0-9]{2}"}[%s]))`,
   176  		reqTot, destWorkloadLabel, wname, destWorkloadNamespaceLabel, wns, duration)
   177  
   178  	var me *multierror.Error
   179  	var err error
   180  	sm := workloadMetrics{workload: workload}
   181  	sm.totalRPS, err = vectorValue(promAPI, rpsQuery)
   182  	if err != nil {
   183  		me = multierror.Append(me, err)
   184  	}
   185  
   186  	sm.errorRPS, err = vectorValue(promAPI, errRPSQuery)
   187  	if err != nil {
   188  		me = multierror.Append(me, err)
   189  	}
   190  
   191  	p50Latency, err := getLatency(promAPI, wname, wns, duration, 0.5)
   192  	if err != nil {
   193  		me = multierror.Append(me, err)
   194  	}
   195  	sm.p50Latency = p50Latency
   196  
   197  	p90Latency, err := getLatency(promAPI, wname, wns, duration, 0.9)
   198  	if err != nil {
   199  		me = multierror.Append(me, err)
   200  	}
   201  	sm.p90Latency = p90Latency
   202  
   203  	p99Latency, err := getLatency(promAPI, wname, wns, duration, 0.99)
   204  	if err != nil {
   205  		me = multierror.Append(me, err)
   206  	}
   207  	sm.p99Latency = p99Latency
   208  
   209  	if me.ErrorOrNil() != nil {
   210  		return sm, fmt.Errorf("error retrieving some metrics: %v", me.Error())
   211  	}
   212  
   213  	return sm, nil
   214  }
   215  
   216  func getLatency(promAPI promv1.API, workloadName, workloadNamespace string, duration time.Duration, quantile float64) (time.Duration, error) {
   217  	latencyQuery := fmt.Sprintf(`histogram_quantile(%f, sum(rate(%s_bucket{%s=~"%s.*", %s=~"%s.*",reporter="destination"}[%s])) by (le))`,
   218  		quantile, reqDur, destWorkloadLabel, workloadName, destWorkloadNamespaceLabel, workloadNamespace, duration)
   219  
   220  	letency, err := vectorValue(promAPI, latencyQuery)
   221  	if err != nil {
   222  		return time.Duration(0), err
   223  	}
   224  
   225  	return convertLatencyToDuration(letency), nil
   226  }
   227  
   228  func vectorValue(promAPI promv1.API, query string) (float64, error) {
   229  	val, _, err := promAPI.Query(context.Background(), query, time.Now())
   230  	if err != nil {
   231  		return 0, fmt.Errorf("query() failure for '%s': %v", query, err)
   232  	}
   233  
   234  	log.Debugf("executing query: %s  result:%s", query, val)
   235  
   236  	switch v := val.(type) {
   237  	case model.Vector:
   238  		if v.Len() < 1 {
   239  			log.Debugf("no values for query: %s", query)
   240  			return 0, nil
   241  		}
   242  
   243  		return float64(v[0].Value), nil
   244  	default:
   245  		return 0, errors.New("bad metric value type returned for query")
   246  	}
   247  }
   248  
   249  func convertLatencyToDuration(val float64) time.Duration {
   250  	return time.Duration(val) * time.Millisecond
   251  }
   252  
   253  func printHeader(writer io.Writer) {
   254  	w := tabwriter.NewWriter(writer, 13, 1, 2, ' ', tabwriter.AlignRight)
   255  	_, _ = fmt.Fprintf(w, "%40s\tTOTAL RPS\tERROR RPS\tP50 LATENCY\tP90 LATENCY\tP99 LATENCY\t\n", "WORKLOAD")
   256  	_ = w.Flush()
   257  }
   258  
   259  func printMetrics(writer io.Writer, wm workloadMetrics) {
   260  	w := tabwriter.NewWriter(writer, 13, 1, 2, ' ', tabwriter.AlignRight)
   261  	_, _ = fmt.Fprintf(w, "%40s\t", wm.workload)
   262  	_, _ = fmt.Fprintf(w, "%.3f\t", wm.totalRPS)
   263  	_, _ = fmt.Fprintf(w, "%.3f\t", wm.errorRPS)
   264  	_, _ = fmt.Fprintf(w, "%s\t", wm.p50Latency)
   265  	_, _ = fmt.Fprintf(w, "%s\t", wm.p90Latency)
   266  	_, _ = fmt.Fprintf(w, "%s\t\n", wm.p99Latency)
   267  	_ = w.Flush()
   268  }