// Source: go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/tokenserver/cmd/luci_machine_tokend/status.go

// Copyright 2016 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"time"

	"google.golang.org/grpc/codes"

	"go.chromium.org/luci/common/logging/memlogger"
	"go.chromium.org/luci/common/tsmon"
	"go.chromium.org/luci/common/tsmon/metric"
	"go.chromium.org/luci/common/tsmon/types"

	tokenserver "go.chromium.org/luci/tokenserver/api"
	"go.chromium.org/luci/tokenserver/client"
)

// UpdateOutcome describes overall status of tokend token update process.
//
// Values are copied verbatim (as plain strings) into the on-disk report and
// into the "luci/machine_tokend/update_outcome" string metric.
type UpdateOutcome string

// Some known outcomes.
//
// This list is not exhaustive: OutcomeFromRPCError also produces dynamically
// formatted outcomes ("GRPC_ERROR_<code>" and "MINT_TOKEN_ERROR_<code>") from
// RPC status codes.
const (
	OutcomeTokenIsGood   UpdateOutcome = "TOKEN_IS_GOOD"  // token is still valid
	OutcomeUpdateSuccess UpdateOutcome = "UPDATE_SUCCESS" // successfully updated
	OutcomeCantReadKey   UpdateOutcome = "CANT_READ_KEY"
	// NOTE: the identifier is spelled "Reponse" (sic). It is exported, so the
	// spelling is left untouched here; only the string value is spelled
	// "RESPONSE".
	OutcomeMalformedReponse UpdateOutcome = "MALFORMED_RESPONSE"
	OutcomeUnknownRPCError  UpdateOutcome = "UNKNOWN_RPC_ERROR"
	// NOTE(review): judging by the string values, the two outcomes below
	// presumably relate to saving the token to disk — confirm at the call sites.
	OutcomePermissionError       UpdateOutcome = "SAVE_TOKEN_PERM_ERROR"
	OutcomeUnknownSaveTokenError UpdateOutcome = "UNKNOWN_SAVE_TOKEN_ERROR"
)

// OutcomeFromRPCError transforms a MintToken error into an update outcome.
52 func OutcomeFromRPCError(err error) UpdateOutcome { 53 if err == nil { 54 return OutcomeUpdateSuccess 55 } 56 if details, ok := err.(client.RPCError); ok { 57 if details.GrpcCode != codes.OK { 58 return UpdateOutcome(fmt.Sprintf("GRPC_ERROR_%d", details.GrpcCode)) 59 } 60 return UpdateOutcome(fmt.Sprintf("MINT_TOKEN_ERROR_%s", details.ErrorCode)) 61 } 62 return OutcomeUnknownRPCError 63 } 64 65 // UpdateReason describes why tokend attempts to update the token. 66 type UpdateReason string 67 68 // All known reasons for starting token refresh procedure. 69 const ( 70 UpdateReasonTokenIsGood UpdateReason = "TOKEN_IS_GOOD" // update was skipped 71 UpdateReasonNewToken UpdateReason = "NEW_TOKEN" 72 UpdateReasonExpiration UpdateReason = "TOKEN_EXPIRES" 73 UpdateReasonParametersChange UpdateReason = "PARAMS_CHANGE" 74 UpdateReasonForceRefresh UpdateReason = "FORCE_REFRESH" 75 ) 76 77 // StatusReport gathers information about tokend run. 78 // 79 // It is picked up by monitoring harness later. 80 type StatusReport struct { 81 Version string // major version of the tokend executable 82 Started time.Time // when the process started 83 Finished time.Time // when the process finished 84 UpdateOutcome UpdateOutcome // overall outcome of the token update process 85 UpdateReason UpdateReason // why tokend attempts to update the token 86 FailureError error // immediate error that caused the failure 87 MintTokenDuration time.Duration // how long RPC call lasted (with all retries) 88 LastToken *tokenserver.TokenFile // last known token (possibly refreshed) 89 ServiceVersion string // name and version of the server that generated the token 90 } 91 92 // Report is how status report looks on disk. 
93 type Report struct { 94 TokendVersion string `json:"tokend_version"` 95 ServiceVersion string `json:"service_version,omitempty"` 96 StartedTS int64 `json:"started_ts"` 97 TotalDuration int64 `json:"total_duration_us,omitempty"` 98 RPCDuration int64 `json:"rpc_duration_us,omitempty"` 99 UpdateOutcome string `json:"update_outcome,omitempty"` 100 UpdateReason string `json:"update_reason,omitempty"` 101 FailureError string `json:"failure_error,omitempty"` 102 LogDump string `json:"log_dump"` 103 TokenLastUpdateTS int64 `json:"token_last_update_ts,omitempty"` 104 TokenNextUpdateTS int64 `json:"token_next_update_ts,omitempty"` 105 TokenExpiryTS int64 `json:"token_expiry_ts,omitempty"` 106 } 107 108 // Report gathers the report into single JSON-serializable struct. 109 func (s *StatusReport) Report() *Report { 110 rep := &Report{ 111 TokendVersion: s.Version, 112 ServiceVersion: s.ServiceVersion, 113 StartedTS: s.Started.Unix(), 114 TotalDuration: s.Finished.Sub(s.Started).Nanoseconds() / 1000, 115 RPCDuration: s.MintTokenDuration.Nanoseconds() / 1000, 116 UpdateOutcome: string(s.UpdateOutcome), 117 UpdateReason: string(s.UpdateReason), 118 } 119 if s.FailureError != nil { 120 rep.FailureError = s.FailureError.Error() 121 } 122 if s.LastToken != nil { 123 rep.TokenLastUpdateTS = s.LastToken.LastUpdate 124 rep.TokenNextUpdateTS = s.LastToken.NextUpdate 125 rep.TokenExpiryTS = s.LastToken.Expiry 126 } 127 return rep 128 } 129 130 // SaveToFile saves the status report and log to a file on disk. 131 func (s *StatusReport) SaveToFile(ctx context.Context, l *memlogger.MemLogger, path string) error { 132 report := s.Report() 133 134 buf := bytes.Buffer{} 135 l.Dump(&buf) 136 report.LogDump = buf.String() 137 138 blob, err := json.MarshalIndent(report, "", " ") 139 if err != nil { 140 return err 141 } 142 return AtomicWriteFile(ctx, path, blob, 0644) 143 } 144 145 //////////////////////////////////////////////////////////////////////////////// 146 // All tsmon metrics. 
147 148 var ( 149 // E.g. "1.0". See Version const in main.go. 150 metricVersion = metric.NewString( 151 "luci/machine_tokend/version", 152 "Major version of luci_machine_tokend executable", 153 nil) 154 155 // E.g. "luci-token-server/2123-abcdef" (<appid>/<version>). 156 metricServiceVersion = metric.NewString( 157 "luci/machine_tokend/service_version", 158 "Identifier of the server version that generated the token", 159 nil) 160 161 // This should be >=30 min in the future if everything is ok. If update 162 // process fails repeatedly, it will be in the past (and the token is unusable 163 // at this point). 164 metricTokenExpiry = metric.NewInt( 165 "luci/machine_tokend/token_expiry_ts", 166 "Unix timestamp of when the token expires, in microsec", 167 &types.MetricMetadata{Units: types.Microseconds}) 168 169 // This should be no longer than 30 min in the past if everything is ok. 170 metricTokenLastUpdate = metric.NewInt( 171 "luci/machine_tokend/last_update_ts", 172 "Unix timestamp of when the token was successfully updated, in microsec", 173 &types.MetricMetadata{Units: types.Microseconds}) 174 175 // This should be [0-30] min in the future if everything ok. If update process 176 // fails (at least once), it will be in the past. It's not a fatal condition 177 // yet. 178 metricTokenNextUpdate = metric.NewInt( 179 "luci/machine_tokend/next_update_ts", 180 "Unix timestamp of when the token must be updated next time, in microsec", 181 &types.MetricMetadata{Units: types.Microseconds}) 182 183 // See UpdateOutcome enum and OutcomeFromRPCError for possible values. 184 // 185 // Positive values are "TOKEN_IS_GOOD" and "UPDATE_SUCCESS". 186 metricUpdateOutcome = metric.NewString( 187 "luci/machine_tokend/update_outcome", 188 "Overall outcome of the luci_machine_tokend invocation", 189 nil) 190 191 // See UpdateReason enum for possible values. 
192 metricUpdateReason = metric.NewString( 193 "luci/machine_tokend/update_reason", 194 "Why the token was updated or 'TOKEN_IS_GOOD' if token is still valid", 195 nil) 196 197 metricTotalDuration = metric.NewInt( 198 "luci/machine_tokend/duration_total_us", 199 "For how long luci_machine_tokend ran (including all local IO) in microsec", 200 &types.MetricMetadata{Units: types.Microseconds}) 201 202 metricRPCDuration = metric.NewInt( 203 "luci/machine_tokend/duration_rpc_us", 204 "For how long an RPC to backend ran in microsec", 205 &types.MetricMetadata{Units: types.Microseconds}) 206 ) 207 208 // SendMetrics is called at the end of the token update process. 209 // 210 // It dumps all relevant metrics to tsmon. 211 func (s *StatusReport) SendMetrics(ctx context.Context) error { 212 ctx, cancel := context.WithTimeout(ctx, 10*time.Second) 213 defer cancel() 214 rep := s.Report() 215 216 metricVersion.Set(ctx, rep.TokendVersion) 217 if rep.ServiceVersion != "" { 218 metricServiceVersion.Set(ctx, rep.ServiceVersion) 219 } 220 if rep.TokenExpiryTS != 0 { 221 metricTokenExpiry.Set(ctx, rep.TokenExpiryTS*1000000) 222 } 223 if rep.TokenLastUpdateTS != 0 { 224 metricTokenLastUpdate.Set(ctx, rep.TokenLastUpdateTS*1000000) 225 } 226 if rep.TokenNextUpdateTS != 0 { 227 metricTokenNextUpdate.Set(ctx, rep.TokenNextUpdateTS*1000000) 228 } 229 metricUpdateOutcome.Set(ctx, rep.UpdateOutcome) 230 metricUpdateReason.Set(ctx, rep.UpdateReason) 231 metricTotalDuration.Set(ctx, rep.TotalDuration) 232 metricRPCDuration.Set(ctx, rep.RPCDuration) 233 234 return tsmon.Flush(ctx) 235 }