istio.io/istio@v0.0.0-20240520182934-d79c90f27776/istioctl/pkg/metrics/metrics.go (about) 1 // Copyright Istio Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package metrics 16 17 import ( 18 "context" 19 "errors" 20 "fmt" 21 "io" 22 "strings" 23 "text/tabwriter" 24 "time" 25 26 "github.com/hashicorp/go-multierror" 27 "github.com/prometheus/client_golang/api" 28 promv1 "github.com/prometheus/client_golang/api/prometheus/v1" 29 "github.com/prometheus/common/model" 30 "github.com/spf13/cobra" 31 32 "istio.io/istio/istioctl/pkg/cli" 33 "istio.io/istio/istioctl/pkg/clioptions" 34 "istio.io/istio/istioctl/pkg/completion" 35 "istio.io/istio/istioctl/pkg/dashboard" 36 "istio.io/istio/pkg/log" 37 ) 38 39 var ( 40 metricsOpts clioptions.ControlPlaneOptions 41 metricsDuration time.Duration 42 ) 43 44 const ( 45 destWorkloadLabel = "destination_workload" 46 destWorkloadNamespaceLabel = "destination_workload_namespace" 47 reqTot = "istio_requests_total" 48 reqDur = "istio_request_duration_milliseconds" 49 ) 50 51 func Cmd(ctx cli.Context) *cobra.Command { 52 cmd := &cobra.Command{ 53 Use: "metrics <workload name>...", 54 Short: "Prints the metrics for the specified workload(s) when running in Kubernetes.", 55 Long: ` 56 Prints the metrics for the specified service(s) when running in Kubernetes. 57 58 This command finds a Prometheus pod running in the specified istio system 59 namespace. It then executes a series of queries per requested workload to 60 find the following top-level workload metrics: total requests per second, 61 error rate, and request latency at p50, p90, and p99 percentiles. The 62 query results are printed to the console, organized by workload name. 63 64 All metrics returned are from server-side reports. This means that latencies 65 and error rates are from the perspective of the service itself and not of an 66 individual client (or aggregate set of clients). Rates and latencies are 67 calculated over a time interval of 1 minute. 68 `, 69 Example: ` # Retrieve workload metrics for productpage-v1 workload 70 istioctl experimental metrics productpage-v1 71 72 # Retrieve workload metrics for various services with custom duration 73 istioctl experimental metrics productpage-v1 -d 2m 74 75 # Retrieve workload metrics for various services in the different namespaces 76 istioctl experimental metrics productpage-v1.foo reviews-v1.bar ratings-v1.baz`, 77 // nolint: goimports 78 Aliases: []string{"m"}, 79 Args: func(cmd *cobra.Command, args []string) error { 80 if len(args) < 1 { 81 cmd.Println(cmd.UsageString()) 82 return fmt.Errorf("metrics requires workload name") 83 } 84 return nil 85 }, 86 RunE: func(cmd *cobra.Command, args []string) error { 87 return run(cmd, ctx, args) 88 }, 89 DisableFlagsInUseLine: true, 90 ValidArgsFunction: completion.ValidPodsNameArgs(ctx), 91 } 92 93 cmd.PersistentFlags().DurationVarP(&metricsDuration, "duration", "d", time.Minute, "Duration of query metrics, default value is 1m.") 94 95 return cmd 96 } 97 98 type workloadMetrics struct { 99 workload string 100 totalRPS, errorRPS float64 101 p50Latency, p90Latency, p99Latency time.Duration 102 } 103 104 func run(c *cobra.Command, ctx cli.Context, args []string) error { 105 log.Debugf("metrics command invoked for workload(s): %v", args) 106 107 client, err := ctx.CLIClientWithRevision(metricsOpts.Revision) 108 if err != nil { 109 return fmt.Errorf("failed to create k8s client: %v", err) 110 } 111 112 pl, err := client.PodsForSelector(context.TODO(), ctx.IstioNamespace(), "app.kubernetes.io/name=prometheus") 113 if err != nil { 114 return fmt.Errorf("not able to locate Prometheus pod: %v", err) 115 } 116 117 if len(pl.Items) < 1 { 118 return errors.New("no Prometheus pods found") 119 } 120 121 // only use the first pod in the list 122 promPod := pl.Items[0] 123 fw, err := client.NewPortForwarder(promPod.Name, ctx.IstioNamespace(), "", 0, 9090) 124 if err != nil { 125 return fmt.Errorf("could not build port forwarder for prometheus: %v", err) 126 } 127 128 if err = fw.Start(); err != nil { 129 return fmt.Errorf("failure running port forward process: %v", err) 130 } 131 132 // Close the forwarder either when we exit or when an this processes is interrupted. 133 defer fw.Close() 134 dashboard.ClosePortForwarderOnInterrupt(fw) 135 136 log.Debugf("port-forward to prometheus pod ready") 137 138 promAPI, err := prometheusAPI(fmt.Sprintf("http://%s", fw.Address())) 139 if err != nil { 140 return fmt.Errorf("failure running port forward process: %v", err) 141 } 142 143 printHeader(c.OutOrStdout()) 144 145 workloads := args 146 for _, workload := range workloads { 147 sm, err := metrics(promAPI, workload, metricsDuration) 148 if err != nil { 149 return fmt.Errorf("could not build metrics for workload '%s': %v", workload, err) 150 } 151 152 printMetrics(c.OutOrStdout(), sm) 153 } 154 return nil 155 } 156 157 func prometheusAPI(address string) (promv1.API, error) { 158 promClient, err := api.NewClient(api.Config{Address: address}) 159 if err != nil { 160 return nil, fmt.Errorf("could not build prometheus client: %v", err) 161 } 162 return promv1.NewAPI(promClient), nil 163 } 164 165 func metrics(promAPI promv1.API, workload string, duration time.Duration) (workloadMetrics, error) { 166 parts := strings.Split(workload, ".") 167 wname := parts[0] 168 wns := "" 169 if len(parts) > 1 { 170 wns = parts[1] 171 } 172 173 rpsQuery := fmt.Sprintf(`sum(rate(%s{%s=~"%s.*", %s=~"%s.*",reporter="destination"}[%s]))`, 174 reqTot, destWorkloadLabel, wname, destWorkloadNamespaceLabel, wns, duration) 175 errRPSQuery := fmt.Sprintf(`sum(rate(%s{%s=~"%s.*", %s=~"%s.*",reporter="destination",response_code=~"[45][0-9]{2}"}[%s]))`, 176 reqTot, destWorkloadLabel, wname, destWorkloadNamespaceLabel, wns, duration) 177 178 var me *multierror.Error 179 var err error 180 sm := workloadMetrics{workload: workload} 181 sm.totalRPS, err = vectorValue(promAPI, rpsQuery) 182 if err != nil { 183 me = multierror.Append(me, err) 184 } 185 186 sm.errorRPS, err = vectorValue(promAPI, errRPSQuery) 187 if err != nil { 188 me = multierror.Append(me, err) 189 } 190 191 p50Latency, err := getLatency(promAPI, wname, wns, duration, 0.5) 192 if err != nil { 193 me = multierror.Append(me, err) 194 } 195 sm.p50Latency = p50Latency 196 197 p90Latency, err := getLatency(promAPI, wname, wns, duration, 0.9) 198 if err != nil { 199 me = multierror.Append(me, err) 200 } 201 sm.p90Latency = p90Latency 202 203 p99Latency, err := getLatency(promAPI, wname, wns, duration, 0.99) 204 if err != nil { 205 me = multierror.Append(me, err) 206 } 207 sm.p99Latency = p99Latency 208 209 if me.ErrorOrNil() != nil { 210 return sm, fmt.Errorf("error retrieving some metrics: %v", me.Error()) 211 } 212 213 return sm, nil 214 } 215 216 func getLatency(promAPI promv1.API, workloadName, workloadNamespace string, duration time.Duration, quantile float64) (time.Duration, error) { 217 latencyQuery := fmt.Sprintf(`histogram_quantile(%f, sum(rate(%s_bucket{%s=~"%s.*", %s=~"%s.*",reporter="destination"}[%s])) by (le))`, 218 quantile, reqDur, destWorkloadLabel, workloadName, destWorkloadNamespaceLabel, workloadNamespace, duration) 219 220 letency, err := vectorValue(promAPI, latencyQuery) 221 if err != nil { 222 return time.Duration(0), err 223 } 224 225 return convertLatencyToDuration(letency), nil 226 } 227 228 func vectorValue(promAPI promv1.API, query string) (float64, error) { 229 val, _, err := promAPI.Query(context.Background(), query, time.Now()) 230 if err != nil { 231 return 0, fmt.Errorf("query() failure for '%s': %v", query, err) 232 } 233 234 log.Debugf("executing query: %s result:%s", query, val) 235 236 switch v := val.(type) { 237 case model.Vector: 238 if v.Len() < 1 { 239 log.Debugf("no values for query: %s", query) 240 return 0, nil 241 } 242 243 return float64(v[0].Value), nil 244 default: 245 return 0, errors.New("bad metric value type returned for query") 246 } 247 } 248 249 func convertLatencyToDuration(val float64) time.Duration { 250 return time.Duration(val) * time.Millisecond 251 } 252 253 func printHeader(writer io.Writer) { 254 w := tabwriter.NewWriter(writer, 13, 1, 2, ' ', tabwriter.AlignRight) 255 _, _ = fmt.Fprintf(w, "%40s\tTOTAL RPS\tERROR RPS\tP50 LATENCY\tP90 LATENCY\tP99 LATENCY\t\n", "WORKLOAD") 256 _ = w.Flush() 257 } 258 259 func printMetrics(writer io.Writer, wm workloadMetrics) { 260 w := tabwriter.NewWriter(writer, 13, 1, 2, ' ', tabwriter.AlignRight) 261 _, _ = fmt.Fprintf(w, "%40s\t", wm.workload) 262 _, _ = fmt.Fprintf(w, "%.3f\t", wm.totalRPS) 263 _, _ = fmt.Fprintf(w, "%.3f\t", wm.errorRPS) 264 _, _ = fmt.Fprintf(w, "%s\t", wm.p50Latency) 265 _, _ = fmt.Fprintf(w, "%s\t", wm.p90Latency) 266 _, _ = fmt.Fprintf(w, "%s\t\n", wm.p99Latency) 267 _ = w.Flush() 268 }