go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/tokenserver/cmd/luci_machine_tokend/main.go (about)

     1  // Copyright 2016 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Command luci_machine_tokend runs on all machines via cron.
    16  //
    17  // It wakes up each ~10 min, checks whether it needs to refresh the existing
    18  // machine token, and refreshes it if necessary.
    19  //
    20  // It also dumps information about its run into a status file (as JSON) that
    21  // can be picked up by sysmon and transformed into ts_mon metrics (the most
    22  // important one being "time since last successful token refresh").
    23  package main
    24  
    25  import (
    26  	"context"
    27  	"crypto/sha1"
    28  	"encoding/hex"
    29  	"flag"
    30  	"fmt"
    31  	"os"
    32  	"sort"
    33  	"strings"
    34  	"time"
    35  
    36  	"go.chromium.org/luci/common/clock"
    37  	"go.chromium.org/luci/common/logging"
    38  	"go.chromium.org/luci/common/logging/gologger"
    39  	"go.chromium.org/luci/common/logging/memlogger"
    40  	"go.chromium.org/luci/common/logging/teelogger"
    41  	"go.chromium.org/luci/common/retry"
    42  	"go.chromium.org/luci/common/system/signals"
    43  	"go.chromium.org/luci/common/tsmon"
    44  	"go.chromium.org/luci/common/tsmon/target"
    45  
    46  	tokenserver "go.chromium.org/luci/tokenserver/api"
    47  	"go.chromium.org/luci/tokenserver/api/minter/v1"
    48  
    49  	"go.chromium.org/luci/tokenserver/client"
    50  )
    51  
    52  // Version identifies the major revision of the tokend code.
    53  //
    54  // It is put in the status file (and subsequently reported to monitoring).
    55  const Version = "1.2"
    56  
    57  // commandLine contains all command line flags.
    58  //
    59  // See registerFlags() for description of each individual flag.
    60  type commandLine struct {
    61  	PrivateKeyPath  string
    62  	CertificatePath string
    63  	Backend         string
    64  	TokenFile       string
    65  	StatusFile      string
    66  	Timeout         time.Duration
    67  	ForceRefresh    bool
    68  }
    69  
    70  func defaults() commandLine {
    71  	return commandLine{
    72  		Timeout: 60 * time.Second,
    73  	}
    74  }
    75  
    76  func (c *commandLine) registerFlags(f *flag.FlagSet) {
    77  	f.StringVar(&c.PrivateKeyPath, "pkey-pem", c.PrivateKeyPath, "path to a private key file")
    78  	f.StringVar(&c.CertificatePath, "cert-pem", c.CertificatePath, "path to a certificate file")
    79  	f.StringVar(&c.Backend, "backend", c.Backend, "hostname of the backend to use")
    80  	f.StringVar(&c.TokenFile, "token-file", c.TokenFile, "where to put the token file")
    81  	f.StringVar(&c.StatusFile, "status-file", c.StatusFile, "where to put details about this run (optional)")
    82  	f.DurationVar(&c.Timeout, "timeout", c.Timeout, "how long to retry on errors before giving up")
    83  	f.BoolVar(&c.ForceRefresh, "force-refresh", c.ForceRefresh, "forcefully refresh the token even if it is still valid")
    84  }
    85  
    86  func (c *commandLine) check() error {
    87  	if c.PrivateKeyPath == "" {
    88  		return fmt.Errorf("-pkey-pem is required")
    89  	}
    90  	if c.CertificatePath == "" {
    91  		return fmt.Errorf("-cert-pem is required")
    92  	}
    93  	if c.Backend == "" {
    94  		return fmt.Errorf("-backend is required")
    95  	}
    96  	if c.TokenFile == "" {
    97  		return fmt.Errorf("-token-file is required")
    98  	}
    99  	return nil
   100  }
   101  
   102  func main() {
   103  	os.Exit(realMain())
   104  }
   105  
   106  func realMain() int {
   107  	opts := defaults()
   108  	opts.registerFlags(flag.CommandLine)
   109  
   110  	tsmonFlags := tsmon.NewFlags()
   111  	tsmonFlags.Target.TargetType = target.TaskType
   112  	tsmonFlags.Target.TaskServiceName = "luci_machine_tokend"
   113  	tsmonFlags.Target.TaskJobName = "default"
   114  	tsmonFlags.Flush = "manual"
   115  	tsmonFlags.Register(flag.CommandLine)
   116  
   117  	flag.Parse()
   118  
   119  	if err := opts.check(); err != nil {
   120  		fmt.Fprintln(os.Stderr, err)
   121  		flag.Usage()
   122  		return 2
   123  	}
   124  
   125  	clientParams := client.Parameters{
   126  		PrivateKeyPath:  opts.PrivateKeyPath,
   127  		CertificatePath: opts.CertificatePath,
   128  		Backend:         opts.Backend,
   129  		Retry: func() retry.Iterator {
   130  			return &retry.ExponentialBackoff{
   131  				Limited: retry.Limited{
   132  					Delay:   200 * time.Millisecond,
   133  					Retries: 100000, // limit only by time, not number of retries
   134  				},
   135  				MaxDelay:   opts.Timeout,
   136  				Multiplier: 1.5,
   137  			}
   138  		},
   139  	}
   140  	if strings.HasPrefix(clientParams.Backend, "localhost:") {
   141  		clientParams.Insecure = true
   142  	}
   143  
   144  	log := &memlogger.MemLogger{}
   145  
   146  	// Write Debug log to both memlogger and gologger.
   147  	memLogFactory := func(context.Context) logging.Logger {
   148  		return log
   149  	}
   150  	root := teelogger.Use(context.Background(), memLogFactory, gologger.StdConfig.NewLogger)
   151  	root = logging.SetLevel(root, logging.Debug)
   152  
   153  	// Apply tsmon config. A failure here is non-fatal.
   154  	if err := tsmon.InitializeFromFlags(root, &tsmonFlags); err != nil {
   155  		logging.Errorf(root, "Failed to initialize tsmon - %s", err)
   156  	}
   157  
   158  	ctx, cancel := context.WithTimeout(root, opts.Timeout)
   159  	defer cancel()
   160  	signals.HandleInterrupt(cancel)
   161  
   162  	statusReport := StatusReport{
   163  		Version: Version,
   164  		Started: clock.Now(ctx),
   165  	}
   166  	defer func() {
   167  		// Dump the status of this run. It's picked up by monitoring. Ignore errors
   168  		// here, they are not important compared to 'run' errors. Use root context
   169  		// to be to flush errors to monitoring even if 'ctx' has expired.
   170  		statusReport.Finished = clock.Now(ctx)
   171  		if err := statusReport.SendMetrics(root); err != nil {
   172  			logging.Errorf(root, "Failed to send tsmon metrics - %s", err)
   173  		}
   174  		if opts.StatusFile != "" {
   175  			if err := statusReport.SaveToFile(root, log, opts.StatusFile); err != nil {
   176  				logging.Errorf(root, "Failed to save the status - %s", err)
   177  			}
   178  		}
   179  	}()
   180  	if err := run(ctx, clientParams, opts, &statusReport); err != nil {
   181  		return 1
   182  	}
   183  	return 0
   184  }
   185  
   186  func run(ctx context.Context, clientParams client.Parameters, opts commandLine, status *StatusReport) error {
   187  	// Read existing token file on disk to check whether we really need to update
   188  	// it. We update the token if it is missing, close to expiration, or when
   189  	// parameters change.
   190  	existingToken, existingState := readTokenFile(ctx, opts.TokenFile)
   191  
   192  	// Record the info about existing token in status report, it is useful even if
   193  	// we fail to refresh the token.
   194  	status.LastToken = existingToken
   195  
   196  	// Initialize the client. It will read private key and certificate file into
   197  	// memory and validate them.
   198  	cl, err := client.New(clientParams)
   199  	if err != nil {
   200  		logging.Errorf(ctx, "Failed to initialize the client - %s", err)
   201  		status.FailureError = err
   202  		status.UpdateOutcome = OutcomeCantReadKey
   203  		// Fill in some update reason to avoid "" as metric value.
   204  		if existingToken.NextUpdate == 0 {
   205  			status.UpdateReason = UpdateReasonNewToken
   206  		} else {
   207  			// We successfully updated the token in the past, but now the keys are
   208  			// suddenly unreadable, they probably changed.
   209  			status.UpdateReason = UpdateReasonParametersChange
   210  		}
   211  		return err
   212  	}
   213  
   214  	// Generate a hash of all input parameters. It is used to detect that we
   215  	// need to refresh the token file even if the token is still valid. It
   216  	// happens if we change a key or backend URL.
   217  	signer := cl.Signer.(*client.X509Signer)
   218  	inputsDigest := calcDigest(map[string][]byte{
   219  		"forceBump": {1}, // bump this to forcefully regenerate all tokens
   220  		"pkey":      signer.PrivateKeyPEM,
   221  		"cert":      signer.CertificatePEM,
   222  		"backend":   []byte(clientParams.Backend),
   223  	})
   224  
   225  	// Record a reason for token update (if we need to update the token).
   226  	now := clock.Now(ctx)
   227  	switch {
   228  	case existingToken.NextUpdate == 0:
   229  		status.UpdateReason = UpdateReasonNewToken
   230  	case now.After(time.Unix(existingToken.NextUpdate, 0)):
   231  		status.UpdateReason = UpdateReasonExpiration
   232  	case existingState.InputsDigest != inputsDigest:
   233  		status.UpdateReason = UpdateReasonParametersChange
   234  	case opts.ForceRefresh:
   235  		status.UpdateReason = UpdateReasonForceRefresh
   236  	default:
   237  		logging.Infof(ctx, "The token is valid, skipping the update")
   238  		status.UpdateReason = UpdateReasonTokenIsGood
   239  		status.UpdateOutcome = OutcomeTokenIsGood
   240  		return nil
   241  	}
   242  
   243  	// Grab a new token. MintMachineToken does retries internally, until success
   244  	// or context deadline.
   245  	resp, err := cl.MintMachineToken(ctx, &minter.MachineTokenRequest{
   246  		TokenType: tokenserver.MachineTokenType_LUCI_MACHINE_TOKEN,
   247  	})
   248  	status.MintTokenDuration = clock.Now(ctx).Sub(now)
   249  	if err != nil {
   250  		logging.Errorf(ctx, "Failed to generate a new token - %s", err)
   251  		status.FailureError = err
   252  		status.UpdateOutcome = OutcomeFromRPCError(err)
   253  		if details, ok := err.(client.RPCError); ok {
   254  			status.ServiceVersion = details.ServiceVersion
   255  		}
   256  		return err
   257  	}
   258  	status.ServiceVersion = resp.ServiceVersion
   259  
   260  	// Grab machine_token field.
   261  	var tok *minter.LuciMachineToken
   262  	if tt, _ := resp.TokenType.(*minter.MachineTokenResponse_LuciMachineToken); tt != nil {
   263  		tok = tt.LuciMachineToken
   264  	}
   265  	if tok == nil {
   266  		err = fmt.Errorf("bad response, empty luci_machine_token field")
   267  		logging.Errorf(ctx, "%s", err)
   268  		status.FailureError = err
   269  		status.UpdateOutcome = OutcomeMalformedReponse
   270  		return err
   271  	}
   272  
   273  	now = clock.Now(ctx)
   274  	expiry := tok.Expiry.AsTime()
   275  	lifetime := expiry.Sub(now)
   276  
   277  	// lifetime should usually be 1h, add a safeguard to avoid hammering
   278  	// the backend in case the lifetime is unexpectedly wrong.
   279  	if lifetime < 5*time.Minute {
   280  		logging.Warningf(ctx, "Returned token lifetime is unexpectedly too short (%s)", lifetime)
   281  		lifetime = 5 * time.Minute
   282  	}
   283  
   284  	// We start to attempt to refresh the token after half of its lifetime has
   285  	// passed, to be able survive short (~30 min) backend outages in exchange for
   286  	// 2x RPC rate.
   287  	newTokenFile := tokenserver.TokenFile{
   288  		LuciMachineToken: tok.MachineToken,
   289  		Expiry:           expiry.Unix(),
   290  		LastUpdate:       now.Unix(),
   291  		NextUpdate:       now.Add(lifetime / 2).Unix(),
   292  	}
   293  	newState := stateInToken{
   294  		InputsDigest: inputsDigest,
   295  		Version:      Version,
   296  	}
   297  	if err = writeTokenFile(ctx, &newTokenFile, &newState, opts.TokenFile); err != nil {
   298  		logging.Errorf(ctx, "Failed to save token file - %s", err)
   299  		status.FailureError = err
   300  		if os.IsPermission(err) {
   301  			status.UpdateOutcome = OutcomePermissionError
   302  		} else {
   303  			status.UpdateOutcome = OutcomeUnknownSaveTokenError
   304  		}
   305  		return err
   306  	}
   307  
   308  	status.LastToken = &newTokenFile
   309  	status.UpdateOutcome = OutcomeUpdateSuccess
   310  	return nil
   311  }
   312  
   313  // calcDigest produces a digest of a given map using some stable serialization.
   314  func calcDigest(inputs map[string][]byte) string {
   315  	keys := make([]string, 0, len(inputs))
   316  	for k := range inputs {
   317  		keys = append(keys, k)
   318  	}
   319  	sort.Strings(keys)
   320  	h := sha1.New()
   321  	for _, k := range keys {
   322  		v := inputs[k]
   323  		fmt.Fprintf(h, "%s\n%d\n", k, len(v))
   324  		h.Write(v)
   325  	}
   326  	blob := h.Sum(nil)
   327  	return hex.EncodeToString(blob[:])
   328  }