go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tq/internal/metrics/metrics.go (about)

     1  // Copyright 2020 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package metrics
    16  
    17  import (
    18  	"math"
    19  
    20  	"go.chromium.org/luci/common/tsmon/distribution"
    21  	"go.chromium.org/luci/common/tsmon/field"
    22  	"go.chromium.org/luci/common/tsmon/metric"
    23  	"go.chromium.org/luci/common/tsmon/types"
    24  )
    25  
    26  // MaxRetryFieldValue is the cap applied to the value of the retry count field
    27  // of metrics that include it. A reported value equal to the cap therefore
    28  // means "this many retries or more". Capping keeps the number of distinct
        // values of the metric field reasonably small.
    29  const MaxRetryFieldValue = 10
    30  
    31  var (
    32  	// bucketer1msTo5min covers range of 1..300k, i.e. roughly 1 ms to 5 min
        	// when the recorded values are milliseconds. It is shared by the
        	// millisecond distributions below that reference it.
        	//
        	// It is geometric with growth factor 10^0.055 and 100 finite buckets;
        	// 10^(0.055*100) = 10^5.5 ~= 316k. NOTE(review): the exact bucket
        	// boundaries are decided by distribution.GeometricBucketer, which is not
        	// visible in this file.
    33  	bucketer1msTo5min = distribution.GeometricBucketer(math.Pow(10, 0.055), 100)
    34  
    35  	// TQ guts metrics, primarily useful to debug TQ.
    36  
        	// InprocSweepDurationMS is the "tq/sweep/inproc/duration" cumulative
        	// distribution: the duration of a full inproc sweep cycle across all DBs,
        	// in milliseconds. It is bucketed by bucketer1msTo5min and has no fields.
    37  	InprocSweepDurationMS = metric.NewCumulativeDistribution(
    38  		"tq/sweep/inproc/duration",
    39  		"Duration of a full inproc sweep cycle across all DBs (ms)",
    40  		&types.MetricMetadata{Units: types.Milliseconds},
    41  		bucketer1msTo5min,
    42  	)
    43  
        	// SweepFetchMetaDurationsMS is the "tq/sweep/fetch/meta/durations"
        	// cumulative distribution: the duration of a FetchRemindersMeta operation,
        	// in milliseconds. It is bucketed by bucketer1msTo5min and broken down by
        	// the operation status, the sweep task level and the DB.
    44  	SweepFetchMetaDurationsMS = metric.NewCumulativeDistribution(
    45  		"tq/sweep/fetch/meta/durations",
    46  		"Duration of FetchRemindersMeta operation (ms)",
    47  		&types.MetricMetadata{Units: types.Milliseconds},
    48  		bucketer1msTo5min,
    49  		field.String("status"), // OK | limit | timeout | failures
    50  		field.Int("level"),     // 0 means the primary shard task, 1+ are its children
    51  		field.String("db"),
    52  	)
    53  
        	// SweepFetchMetaReminders is the "tq/sweep/fetch/meta/reminders" counter:
        	// the count of Reminders fetched by FetchRemindersMeta. It is broken down
        	// by the operation status, the sweep task level and the DB, and is
        	// declared without metric metadata (nil).
    54  	SweepFetchMetaReminders = metric.NewCounter(
    55  		"tq/sweep/fetch/meta/reminders",
    56  		"Count of Reminders fetched by FetchRemindersMeta",
    57  		nil,
    58  		field.String("status"), // OK | limit | timeout | failures
    59  		field.Int("level"),     // 0 means the primary shard task, 1+ are its children
    60  		field.String("db"),
    61  	)
    62  
        	// ReminderStalenessMS is the "tq/reminders/staleness" cumulative
        	// distribution: the staleness, in milliseconds, of Reminders scanned during
        	// the sweep. The data may be incomplete if the keyspace wasn't scanned
        	// completely. It uses distribution.DefaultBucketer rather than
        	// bucketer1msTo5min.
    63  	ReminderStalenessMS = metric.NewCumulativeDistribution(
    64  		"tq/reminders/staleness",
    65  		("Distribution of staleness of scanned Reminders during the sweep. " +
    66  			"May be incomplete if keyspace wasn't scanned completely"),
    67  		&types.MetricMetadata{Units: types.Milliseconds},
    68  		distribution.DefaultBucketer,
    69  		field.Int("level"), // presumably the sweep task level, as in the fetch metrics; confirm
    70  		field.String("db"),
    71  	)
    72  
        	// RemindersCreated is the "tq/reminders/created" counter: the count of
        	// reminders created, with the "staleness" field recording whether the
        	// reminder was still fresh in the post-txn defer. It is also broken down
        	// by task class and DB.
    73  	RemindersCreated = metric.NewCounter(
    74  		"tq/reminders/created",
    75  		"Count of reminders created and if they are still fresh in the post-txn defer",
    76  		nil,
    77  		field.String("task_class"), // matches TaskClass.ID
    78  		field.String("staleness"),  // fresh | stale
    79  		field.String("db"),
    80  	)
    81  
        	// RemindersDeleted is the counter of reminders processed (i.e. deleted),
        	// broken down by task class, transaction path and DB.
        	//
        	// Note that the exported metric name is "tq/reminders/processed", not
        	// ".../deleted" as the Go identifier might suggest.
    82  	RemindersDeleted = metric.NewCounter(
    83  		"tq/reminders/processed",
    84  		"Count of reminders processed (i.e. deleted)",
    85  		nil,
    86  		field.String("task_class"), // matches TaskClass.ID
    87  		field.String("txn_path"),   // happy | sweep
    88  		field.String("db"),
    89  	)
    90  
        	// RemindersLatencyMS is the "tq/reminders/latency" cumulative
        	// distribution: the time, in milliseconds, between the AddTask call and the
        	// deletion of the reminder. It is bucketed by bucketer1msTo5min and broken
        	// down by task class, transaction path and DB.
    91  	RemindersLatencyMS = metric.NewCumulativeDistribution(
    92  		"tq/reminders/latency",
    93  		"Time between AddTask call and the deletion of the reminder",
    94  		&types.MetricMetadata{Units: types.Milliseconds},
    95  		bucketer1msTo5min,
    96  		field.String("task_class"), // matches TaskClass.ID
    97  		field.String("txn_path"),   // happy | sweep
    98  		field.String("db"),
    99  	)
   100  
   101  	// TQ metrics that might be useful for TQ clients as well.
   102  
        	// SubmitCount is the "tq/submit/count" counter: the count of submitted
        	// tasks, broken down by task class, transaction path and the gRPC
        	// canonical code.
   103  	SubmitCount = metric.NewCounter(
   104  		"tq/submit/count",
   105  		"Count of submitted tasks",
   106  		nil,
   107  		field.String("task_class"), // matches TaskClass.ID
   108  		field.String("txn_path"),   // none | happy | sweep
   109  		field.String("grpc_code"),  // gRPC canonical code
   110  	)
   111  
        	// SubmitDurationMS is the "tq/submit/duration" cumulative distribution:
        	// the duration of submit calls, in milliseconds. It uses
        	// distribution.DefaultBucketer and has the same fields as SubmitCount
        	// (task class, transaction path, gRPC canonical code).
   112  	SubmitDurationMS = metric.NewCumulativeDistribution(
   113  		"tq/submit/duration",
   114  		"Duration of submit calls",
   115  		&types.MetricMetadata{Units: types.Milliseconds},
   116  		distribution.DefaultBucketer,
   117  		field.String("task_class"), // matches TaskClass.ID
   118  		field.String("txn_path"),   // none | happy | sweep
   119  		field.String("grpc_code"),  // gRPC canonical code
   120  	)
   121  
        	// ServerRejectedCount is the "tq/server/rejected" counter: the count of
        	// rejected (e.g. malformed) task pushes, broken down by the rejection
        	// reason.
   122  	ServerRejectedCount = metric.NewCounter(
   123  		"tq/server/rejected",
   124  		"Count of rejected (e.g. malformed) task pushes",
   125  		nil,
   126  		field.String("reason"), // auth | bad_request | unknown_class | no_handler | bad_payload
   127  	)
   128  
        	// ServerHandledCount is the "tq/server/handled" counter: the count of
        	// handled non-rejected tasks, broken down by task class, handling result
        	// and retry count.
        	//
        	// NOTE(review): the cap on the "retry" field is presumably
        	// MaxRetryFieldValue; the capping itself happens outside this file.
   129  	ServerHandledCount = metric.NewCounter(
   130  		"tq/server/handled",
   131  		"Count of handled non-rejected tasks",
   132  		nil,
   133  		field.String("task_class"), // matches TaskClass.ID
   134  		field.String("result"),     // OK | retry | transient | fatal
   135  		field.Int("retry"),         // 0 for first try, incrementing until cap.
   136  	)
   137  
        	// ServerDurationMS is the "tq/server/duration" cumulative distribution:
        	// the duration of handling of non-rejected tasks, in milliseconds. It uses
        	// distribution.DefaultBucketer and is broken down by task class and
        	// handling result; unlike ServerHandledCount it has no "retry" field.
   138  	ServerDurationMS = metric.NewCumulativeDistribution(
   139  		"tq/server/duration",
   140  		"Duration of handling of non-rejected tasks",
   141  		&types.MetricMetadata{Units: types.Milliseconds},
   142  		distribution.DefaultBucketer,
   143  		field.String("task_class"), // matches TaskClass.ID
   144  		field.String("result"),     // OK | retry | transient | fatal
   145  	)
   146  
        	// ServerTaskLatency is the "tq/server/latency" cumulative distribution:
        	// the time between a task's expected ETA and its actual completion. Values
        	// are in milliseconds even though the identifier has no MS suffix. It uses
        	// distribution.DefaultBucketer and is broken down by task class, handling
        	// result and retry count.
        	//
        	// NOTE(review): the cap on the "retry" field is presumably
        	// MaxRetryFieldValue; the capping itself happens outside this file.
   147  	ServerTaskLatency = metric.NewCumulativeDistribution(
   148  		"tq/server/latency",
   149  		"Time between task's expected ETA and actual completion",
   150  		&types.MetricMetadata{Units: types.Milliseconds},
   151  		distribution.DefaultBucketer,
   152  		field.String("task_class"), // matches TaskClass.ID
   153  		field.String("result"),     // OK | retry | transient | fatal
   154  		field.Int("retry"),         // 0 for first try, incrementing until cap.
   155  	)
   156  
        	// ServerRunning is the "tq/server/running" integer metric (created with
        	// metric.NewInt, not a counter): the number of task handlers currently
        	// running, broken down by task class.
   157  	ServerRunning = metric.NewInt(
   158  		"tq/server/running",
   159  		"Number of task handlers currently running",
   160  		nil,
   161  		field.String("task_class"), // matches TaskClass.ID
   162  	)
   163  )