go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/auth/internal/gce.go (about)

     1  // Copyright 2015 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package internal
    16  
    17  import (
    18  	"context"
    19  	"encoding/json"
    20  	"fmt"
    21  	"net"
    22  	"net/http"
    23  	"net/url"
    24  	"strings"
    25  	"sync"
    26  	"time"
    27  
    28  	"cloud.google.com/go/compute/metadata"
    29  
    30  	"golang.org/x/oauth2"
    31  
    32  	"go.chromium.org/luci/common/clock"
    33  	"go.chromium.org/luci/common/data/stringset"
    34  	"go.chromium.org/luci/common/errors"
    35  	"go.chromium.org/luci/common/logging"
    36  	"go.chromium.org/luci/common/retry"
    37  	"go.chromium.org/luci/common/retry/transient"
    38  )
    39  
    40  // A client with more relaxed timeouts compared to the default one, which was
    41  // observed to timeout often on GKE when using Workload Identities.
    42  var metadataClient = metadata.NewClient(&http.Client{
    43  	Transport: &http.Transport{
    44  		Dial: (&net.Dialer{
    45  			Timeout:   10 * time.Second,
    46  			KeepAlive: 30 * time.Second,
    47  		}).Dial,
    48  		ResponseHeaderTimeout: 15 * time.Second, // default is 2
    49  	},
    50  })
    51  
// globalGCELock serializes calls to the metadata server made from within this
// process. The GKE metadata server is grumpy when it is called concurrently,
// so callers in this file hold the lock for the duration of each call. It
// doesn't protect any shared state in the current process.
var globalGCELock sync.Mutex
    55  
// gceTokenProvider mints tokens by calling the GCE metadata server.
//
// Instances are constructed by attemptInit, which resolves the account email
// before returning the provider.
type gceTokenProvider struct {
	// Service account name used in metadata server URL paths. attemptInit
	// substitutes "default" when the caller passes an empty string.
	account  string
	// Email reported by the metadata server for the account.
	email    string
	audience string // not empty iff using ID tokens
	// Key handed out (by pointer) from the CacheKey method.
	cacheKey CacheKey
}
    62  
    63  // NewGCETokenProvider returns TokenProvider that knows how to use GCE metadata
    64  // server.
    65  func NewGCETokenProvider(ctx context.Context, account string, scopes []string, audience string) (TokenProvider, error) {
    66  	// When running on GKE using Workload Identities, the metadata is served by
    67  	// gke-metadata-server pod, which may be very slow, especially when the node
    68  	// has just started. We'll wait for it to become responsive by retrying
    69  	// transient errors a bunch of times.
    70  	var p TokenProvider
    71  	err := retry.Retry(ctx, transient.Only(retryParams), func() error {
    72  		var err error
    73  		p, err = attemptInit(ctx, account, scopes, audience)
    74  		return err
    75  	}, retry.LogCallback(ctx, "initializing GCE token provider"))
    76  	return p, err
    77  }
    78  
    79  // retryParams defines the retry strategy for attemptInit.
    80  func retryParams() retry.Iterator {
    81  	return &retry.ExponentialBackoff{
    82  		Limited: retry.Limited{
    83  			Delay:    100 * time.Millisecond,
    84  			MaxTotal: 5 * time.Minute,
    85  			Retries:  -1, // until the overall MaxTotal timeout
    86  		},
    87  		Multiplier: 2,
    88  		MaxDelay:   10 * time.Second,
    89  	}
    90  }
    91  
    92  // attemptInit attempts to initialize GCE token provider.
    93  func attemptInit(ctx context.Context, account string, scopes []string, audience string) (TokenProvider, error) {
    94  	// This mutex is used to avoid hitting GKE metadata server concurrently if
    95  	// we have a stampede of goroutines. It doesn't actually protect any shared
    96  	// state in the current process.
    97  	globalGCELock.Lock()
    98  	defer globalGCELock.Unlock()
    99  
   100  	if account == "" {
   101  		account = "default"
   102  	}
   103  
   104  	// Grab an email associated with the account. This must not be failing on
   105  	// a healthy VM if the account is present. If it does, the metadata server is
   106  	// broken.
   107  	email, err := metadataClient.Email(account)
   108  	if err != nil {
   109  		// Note: we purposefully delay this check only after the first call to
   110  		// the metadata fails because metadata.OnGCE was observed to often report
   111  		// "false" when running on GKE due to gke-metadata-server being slow. Our
   112  		// metadataClient has (much) higher timeouts that the client used by
   113  		// metadata.OnGCE, and it handles slow gke-metadata-server better. So if we
   114  		// end up here and metadata.OnGCE also says "false", then we are not on GCE
   115  		// with high probability. The downside is that it may take up to 15 sec to
   116  		// detect this (or whatever ResponseHeaderTimeout in metadataClient is).
   117  		if !metadata.OnGCE() {
   118  			return nil, ErrBadCredentials
   119  		}
   120  		if _, yep := err.(metadata.NotDefinedError); yep {
   121  			return nil, ErrInsufficientAccess
   122  		}
   123  		return nil, transient.Tag.Apply(err)
   124  	}
   125  
   126  	// Ensure the account has requested scopes. Assume 'cloud-platform' scope
   127  	// covers all possible scopes. This is important when using GKE Workload
   128  	// Identities: the metadata server always reports only 'cloud-platform' scope
   129  	// there. Its presence should be enough to cover all scopes used in practice.
   130  	// The exception is non-cloud scopes (like gerritcodereview or G Suite). To
   131  	// use such scopes, one will have to use impersonation through Cloud IAM APIs,
   132  	// which *are* covered by cloud-platform (see ActAsServiceAccount in auth.go).
   133  	if audience == "" {
   134  		availableScopes, err := metadataClient.Scopes(account)
   135  		if err != nil {
   136  			return nil, transient.Tag.Apply(err)
   137  		}
   138  		availableSet := stringset.NewFromSlice(availableScopes...)
   139  		if !availableSet.Has("https://www.googleapis.com/auth/cloud-platform") {
   140  			for _, requested := range scopes {
   141  				if !availableSet.Has(requested) {
   142  					logging.Warningf(ctx, "GCE service account %q doesn't have required scope %q (all scopes: %q)", account, requested, availableScopes)
   143  					return nil, ErrInsufficientAccess
   144  				}
   145  			}
   146  		}
   147  	}
   148  
   149  	return &gceTokenProvider{
   150  		account:  account,
   151  		email:    email,
   152  		audience: audience,
   153  		cacheKey: CacheKey{
   154  			Key:    fmt.Sprintf("gce/%s", account),
   155  			Scopes: scopes,
   156  		},
   157  	}, nil
   158  }
   159  
   160  func (p *gceTokenProvider) RequiresInteraction() bool {
   161  	return false
   162  }
   163  
   164  func (p *gceTokenProvider) Lightweight() bool {
   165  	return true
   166  }
   167  
   168  func (p *gceTokenProvider) Email() string {
   169  	return p.email
   170  }
   171  
   172  func (p *gceTokenProvider) CacheKey(ctx context.Context) (*CacheKey, error) {
   173  	return &p.cacheKey, nil
   174  }
   175  
   176  func (p *gceTokenProvider) MintToken(ctx context.Context, base *Token) (*Token, error) {
   177  	// This mutex is used to avoid hitting GKE metadata server concurrently if
   178  	// we have a stampede of goroutines. It doesn't actually protect any shared
   179  	// state in the current process.
   180  	globalGCELock.Lock()
   181  	defer globalGCELock.Unlock()
   182  	if p.audience != "" {
   183  		return p.mintIDToken(ctx)
   184  	}
   185  	return p.mintAccessToken(ctx)
   186  }
   187  
   188  // mintIDToken calls /identity metadata server endpoint.
   189  func (p gceTokenProvider) mintIDToken(ctx context.Context) (*Token, error) {
   190  	v := url.Values{
   191  		"audience": []string{p.audience},
   192  		"format":   []string{"full"}, // include VM instance info into claims
   193  	}
   194  	urlSuffix := fmt.Sprintf("instance/service-accounts/%s/identity?%s", p.account, v.Encode())
   195  	token, err := metadataClient.Get(urlSuffix)
   196  	if err != nil {
   197  		return nil, errors.Annotate(err, "auth/gce: metadata server call failed").Tag(transient.Tag).Err()
   198  	}
   199  
   200  	claims, err := ParseIDTokenClaims(token)
   201  	if err != nil {
   202  		return nil, errors.Annotate(err, "auth/gce: metadata server returned invalid ID token").Err()
   203  	}
   204  
   205  	return &Token{
   206  		Token: oauth2.Token{
   207  			TokenType:   "Bearer",
   208  			AccessToken: NoAccessToken,
   209  			Expiry:      time.Unix(claims.Exp, 0),
   210  		},
   211  		IDToken: token,
   212  		Email:   p.Email(),
   213  	}, nil
   214  }
   215  
   216  // mintAccessToken calls /token metadata server endpoint.
   217  //
   218  // Note: this code is very similar to ComputeTokenSource(p.account).Token()
   219  // from [1], except it uses our custom metadataClient which is more forgiving
   220  // of the slowness of the gke-metadata-server.
   221  //
   222  // [1]: google/google.go file in https://github.com/golang/oauth2
   223  func (p *gceTokenProvider) mintAccessToken(ctx context.Context) (*Token, error) {
   224  	tokenJSON, err := metadataClient.Get("instance/service-accounts/" + p.account + "/token")
   225  	if err != nil {
   226  		return nil, errors.Annotate(err, "auth/gce: metadata server call failed").Tag(transient.Tag).Err()
   227  	}
   228  
   229  	var res struct {
   230  		AccessToken  string `json:"access_token"`
   231  		ExpiresInSec int    `json:"expires_in"`
   232  		TokenType    string `json:"token_type"`
   233  	}
   234  	switch err = json.NewDecoder(strings.NewReader(tokenJSON)).Decode(&res); {
   235  	case err != nil:
   236  		return nil, errors.Annotate(err, "auth/gce: invalid token JSON from metadata").Tag(transient.Tag).Err()
   237  	case res.ExpiresInSec == 0 || res.AccessToken == "":
   238  		return nil, errors.Reason("auth/gce: incomplete token received from metadata").Tag(transient.Tag).Err()
   239  	}
   240  
   241  	tok := oauth2.Token{
   242  		AccessToken: res.AccessToken,
   243  		TokenType:   res.TokenType,
   244  		Expiry:      clock.Now(ctx).Add(time.Duration(res.ExpiresInSec) * time.Second),
   245  	}
   246  
   247  	return &Token{
   248  		// Replicate the hidden magic state added by computeSource.Token().
   249  		Token: *tok.WithExtra(map[string]any{
   250  			"oauth2.google.tokenSource":    "compute-metadata",
   251  			"oauth2.google.serviceAccount": p.account,
   252  		}),
   253  		IDToken: NoIDToken,
   254  		Email:   p.Email(),
   255  	}, nil
   256  }
   257  
   258  func (p *gceTokenProvider) RefreshToken(ctx context.Context, prev, base *Token) (*Token, error) {
   259  	// Minting and refreshing on GCE is the same thing: a call to metadata server.
   260  	return p.MintToken(ctx, base)
   261  }