go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/tokenserver/cmd/luci_machine_tokend/status.go (about)

     1  // Copyright 2016 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package main
    16  
    17  import (
    18  	"bytes"
    19  	"context"
    20  	"encoding/json"
    21  	"fmt"
    22  	"time"
    23  
    24  	"google.golang.org/grpc/codes"
    25  
    26  	"go.chromium.org/luci/common/logging/memlogger"
    27  	"go.chromium.org/luci/common/tsmon"
    28  	"go.chromium.org/luci/common/tsmon/metric"
    29  	"go.chromium.org/luci/common/tsmon/types"
    30  
    31  	tokenserver "go.chromium.org/luci/tokenserver/api"
    32  	"go.chromium.org/luci/tokenserver/client"
    33  )
    34  
// UpdateOutcome describes the overall status of the tokend token update
// process.
//
// The value is reported verbatim: Report copies it into the on-disk report as
// a plain string and SendMetrics exports it via the update_outcome metric.
type UpdateOutcome string
    37  
// Some known outcomes.
//
// OutcomeTokenIsGood and OutcomeUpdateSuccess are the positive outcomes; the
// remaining values describe failures.
//
// See also OutcomeFromRPCError for outcomes generated from status codes.
const (
	OutcomeTokenIsGood           UpdateOutcome = "TOKEN_IS_GOOD"  // token is still valid
	OutcomeUpdateSuccess         UpdateOutcome = "UPDATE_SUCCESS" // successfully updated
	// Presumably the key needed for the update could not be read; confirm
	// against the call site.
	OutcomeCantReadKey           UpdateOutcome = "CANT_READ_KEY"
	// NOTE: the identifier is misspelled ("Reponse"); the reported string value
	// is spelled correctly. The name is kept as is since other files may use it.
	OutcomeMalformedReponse      UpdateOutcome = "MALFORMED_RESPONSE"
	// Returned by OutcomeFromRPCError for errors that are not client.RPCError.
	OutcomeUnknownRPCError       UpdateOutcome = "UNKNOWN_RPC_ERROR"
	// Presumably a permission error while saving the token (per the string
	// value); confirm against the call site.
	OutcomePermissionError       UpdateOutcome = "SAVE_TOKEN_PERM_ERROR"
	// Presumably any other error while saving the token; confirm against the
	// call site.
	OutcomeUnknownSaveTokenError UpdateOutcome = "UNKNOWN_SAVE_TOKEN_ERROR"
)
    50  
    51  // OutcomeFromRPCError transform MintToken error into an update outcome.
    52  func OutcomeFromRPCError(err error) UpdateOutcome {
    53  	if err == nil {
    54  		return OutcomeUpdateSuccess
    55  	}
    56  	if details, ok := err.(client.RPCError); ok {
    57  		if details.GrpcCode != codes.OK {
    58  			return UpdateOutcome(fmt.Sprintf("GRPC_ERROR_%d", details.GrpcCode))
    59  		}
    60  		return UpdateOutcome(fmt.Sprintf("MINT_TOKEN_ERROR_%s", details.ErrorCode))
    61  	}
    62  	return OutcomeUnknownRPCError
    63  }
    64  
// UpdateReason describes why tokend attempts to update the token.
//
// The value is reported verbatim: Report copies it into the on-disk report as
// a plain string and SendMetrics exports it via the update_reason metric.
type UpdateReason string
    67  
// All known reasons for starting token refresh procedure.
//
// UpdateReasonTokenIsGood is the odd one out: it is reported when no refresh
// was started because the token is still valid. The exact conditions that
// select each of the other reasons are decided outside of this file.
const (
	UpdateReasonTokenIsGood      UpdateReason = "TOKEN_IS_GOOD" // update was skipped
	UpdateReasonNewToken         UpdateReason = "NEW_TOKEN"
	UpdateReasonExpiration       UpdateReason = "TOKEN_EXPIRES"
	UpdateReasonParametersChange UpdateReason = "PARAMS_CHANGE"
	UpdateReasonForceRefresh     UpdateReason = "FORCE_REFRESH"
)
    76  
// StatusReport gathers information about tokend run.
//
// It is picked up by monitoring harness later.
//
// This is the in-memory form. Report converts it into the flat,
// JSON-serializable Report struct, SaveToFile writes that struct (plus a log
// dump) to disk, and SendMetrics exports the same values to tsmon.
type StatusReport struct {
	Version           string                 // major version of the tokend executable
	Started           time.Time              // when the process started
	Finished          time.Time              // when the process finished
	UpdateOutcome     UpdateOutcome          // overall outcome of the token update process
	UpdateReason      UpdateReason           // why tokend attempts to update the token
	FailureError      error                  // immediate error that caused the failure
	MintTokenDuration time.Duration          // how long RPC call lasted (with all retries)
	LastToken         *tokenserver.TokenFile // last known token (possibly refreshed)
	ServiceVersion    string                 // name and version of the server that generated the token
}
    91  
// Report is how status report looks on disk.
//
// It is produced by StatusReport.Report and serialized as indented JSON by
// StatusReport.SaveToFile. Fields tagged with omitempty are dropped from the
// JSON when they hold their zero value.
type Report struct {
	// TokendVersion is copied from StatusReport.Version.
	TokendVersion     string `json:"tokend_version"`
	// ServiceVersion is copied from StatusReport.ServiceVersion.
	ServiceVersion    string `json:"service_version,omitempty"`
	// StartedTS is StatusReport.Started as a Unix timestamp in seconds.
	StartedTS         int64  `json:"started_ts"`
	// TotalDuration is Finished minus Started, in microseconds.
	TotalDuration     int64  `json:"total_duration_us,omitempty"`
	// RPCDuration is StatusReport.MintTokenDuration, in microseconds.
	RPCDuration       int64  `json:"rpc_duration_us,omitempty"`
	// UpdateOutcome is the string form of StatusReport.UpdateOutcome.
	UpdateOutcome     string `json:"update_outcome,omitempty"`
	// UpdateReason is the string form of StatusReport.UpdateReason.
	UpdateReason      string `json:"update_reason,omitempty"`
	// FailureError is the message of StatusReport.FailureError, if it is set.
	FailureError      string `json:"failure_error,omitempty"`
	// LogDump is left empty by StatusReport.Report; SaveToFile fills it with the
	// dump of the in-memory logger. It is always present in the JSON.
	LogDump           string `json:"log_dump"`
	// The three timestamps below are copied as is from the LastUpdate,
	// NextUpdate and Expiry fields of StatusReport.LastToken when it is set.
	// Presumably Unix timestamps in seconds (SendMetrics multiplies them by
	// 1000000 to report microseconds); confirm against TokenFile.
	TokenLastUpdateTS int64  `json:"token_last_update_ts,omitempty"`
	TokenNextUpdateTS int64  `json:"token_next_update_ts,omitempty"`
	TokenExpiryTS     int64  `json:"token_expiry_ts,omitempty"`
}
   107  
   108  // Report gathers the report into single JSON-serializable struct.
   109  func (s *StatusReport) Report() *Report {
   110  	rep := &Report{
   111  		TokendVersion:  s.Version,
   112  		ServiceVersion: s.ServiceVersion,
   113  		StartedTS:      s.Started.Unix(),
   114  		TotalDuration:  s.Finished.Sub(s.Started).Nanoseconds() / 1000,
   115  		RPCDuration:    s.MintTokenDuration.Nanoseconds() / 1000,
   116  		UpdateOutcome:  string(s.UpdateOutcome),
   117  		UpdateReason:   string(s.UpdateReason),
   118  	}
   119  	if s.FailureError != nil {
   120  		rep.FailureError = s.FailureError.Error()
   121  	}
   122  	if s.LastToken != nil {
   123  		rep.TokenLastUpdateTS = s.LastToken.LastUpdate
   124  		rep.TokenNextUpdateTS = s.LastToken.NextUpdate
   125  		rep.TokenExpiryTS = s.LastToken.Expiry
   126  	}
   127  	return rep
   128  }
   129  
   130  // SaveToFile saves the status report and log to a file on disk.
   131  func (s *StatusReport) SaveToFile(ctx context.Context, l *memlogger.MemLogger, path string) error {
   132  	report := s.Report()
   133  
   134  	buf := bytes.Buffer{}
   135  	l.Dump(&buf)
   136  	report.LogDump = buf.String()
   137  
   138  	blob, err := json.MarshalIndent(report, "", "  ")
   139  	if err != nil {
   140  		return err
   141  	}
   142  	return AtomicWriteFile(ctx, path, blob, 0644)
   143  }
   144  
   145  ////////////////////////////////////////////////////////////////////////////////
   146  // All tsmon metrics.
   147  
// All metrics below are set by StatusReport.SendMetrics from the values of
// StatusReport.Report.
var (
	// E.g. "1.0". See Version const in main.go.
	metricVersion = metric.NewString(
		"luci/machine_tokend/version",
		"Major version of luci_machine_tokend executable",
		nil)

	// E.g. "luci-token-server/2123-abcdef" (<appid>/<version>).
	//
	// Set only when the service version is known (non-empty).
	metricServiceVersion = metric.NewString(
		"luci/machine_tokend/service_version",
		"Identifier of the server version that generated the token",
		nil)

	// This should be >=30 min in the future if everything is ok. If update
	// process fails repeatedly, it will be in the past (and the token is unusable
	// at this point).
	//
	// Set only when the token expiry timestamp is non-zero.
	metricTokenExpiry = metric.NewInt(
		"luci/machine_tokend/token_expiry_ts",
		"Unix timestamp of when the token expires, in microsec",
		&types.MetricMetadata{Units: types.Microseconds})

	// This should be no longer than 30 min in the past if everything is ok.
	//
	// Set only when the last update timestamp is non-zero.
	metricTokenLastUpdate = metric.NewInt(
		"luci/machine_tokend/last_update_ts",
		"Unix timestamp of when the token was successfully updated, in microsec",
		&types.MetricMetadata{Units: types.Microseconds})

	// This should be [0-30] min in the future if everything ok. If update process
	// fails (at least once), it will be in the past. It's not a fatal condition
	// yet.
	//
	// Set only when the next update timestamp is non-zero.
	metricTokenNextUpdate = metric.NewInt(
		"luci/machine_tokend/next_update_ts",
		"Unix timestamp of when the token must be updated next time, in microsec",
		&types.MetricMetadata{Units: types.Microseconds})

	// See UpdateOutcome enum and OutcomeFromRPCError for possible values.
	//
	// Positive values are "TOKEN_IS_GOOD" and "UPDATE_SUCCESS".
	metricUpdateOutcome = metric.NewString(
		"luci/machine_tokend/update_outcome",
		"Overall outcome of the luci_machine_tokend invocation",
		nil)

	// See UpdateReason enum for possible values.
	metricUpdateReason = metric.NewString(
		"luci/machine_tokend/update_reason",
		"Why the token was updated or 'TOKEN_IS_GOOD' if token is still valid",
		nil)

	// Time between StatusReport.Started and StatusReport.Finished, in
	// microseconds.
	metricTotalDuration = metric.NewInt(
		"luci/machine_tokend/duration_total_us",
		"For how long luci_machine_tokend ran (including all local IO) in microsec",
		&types.MetricMetadata{Units: types.Microseconds})

	// StatusReport.MintTokenDuration (the RPC call with all retries), in
	// microseconds.
	metricRPCDuration = metric.NewInt(
		"luci/machine_tokend/duration_rpc_us",
		"For how long an RPC to backend ran in microsec",
		&types.MetricMetadata{Units: types.Microseconds})
)
   207  
   208  // SendMetrics is called at the end of the token update process.
   209  //
   210  // It dumps all relevant metrics to tsmon.
   211  func (s *StatusReport) SendMetrics(ctx context.Context) error {
   212  	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
   213  	defer cancel()
   214  	rep := s.Report()
   215  
   216  	metricVersion.Set(ctx, rep.TokendVersion)
   217  	if rep.ServiceVersion != "" {
   218  		metricServiceVersion.Set(ctx, rep.ServiceVersion)
   219  	}
   220  	if rep.TokenExpiryTS != 0 {
   221  		metricTokenExpiry.Set(ctx, rep.TokenExpiryTS*1000000)
   222  	}
   223  	if rep.TokenLastUpdateTS != 0 {
   224  		metricTokenLastUpdate.Set(ctx, rep.TokenLastUpdateTS*1000000)
   225  	}
   226  	if rep.TokenNextUpdateTS != 0 {
   227  		metricTokenNextUpdate.Set(ctx, rep.TokenNextUpdateTS*1000000)
   228  	}
   229  	metricUpdateOutcome.Set(ctx, rep.UpdateOutcome)
   230  	metricUpdateReason.Set(ctx, rep.UpdateReason)
   231  	metricTotalDuration.Set(ctx, rep.TotalDuration)
   232  	metricRPCDuration.Set(ctx, rep.RPCDuration)
   233  
   234  	return tsmon.Flush(ctx)
   235  }