go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/auth/internal/gce.go (about) 1 // Copyright 2015 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package internal 16 17 import ( 18 "context" 19 "encoding/json" 20 "fmt" 21 "net" 22 "net/http" 23 "net/url" 24 "strings" 25 "sync" 26 "time" 27 28 "cloud.google.com/go/compute/metadata" 29 30 "golang.org/x/oauth2" 31 32 "go.chromium.org/luci/common/clock" 33 "go.chromium.org/luci/common/data/stringset" 34 "go.chromium.org/luci/common/errors" 35 "go.chromium.org/luci/common/logging" 36 "go.chromium.org/luci/common/retry" 37 "go.chromium.org/luci/common/retry/transient" 38 ) 39 40 // A client with more relaxed timeouts compared to the default one, which was 41 // observed to timeout often on GKE when using Workload Identities. 42 var metadataClient = metadata.NewClient(&http.Client{ 43 Transport: &http.Transport{ 44 Dial: (&net.Dialer{ 45 Timeout: 10 * time.Second, 46 KeepAlive: 30 * time.Second, 47 }).Dial, 48 ResponseHeaderTimeout: 15 * time.Second, // default is 2 49 }, 50 }) 51 52 // GKE metadata servers is grumpy when it is called concurrently. We use the 53 // global mutex to serialize calls to it from within this process. 54 var globalGCELock sync.Mutex 55 56 type gceTokenProvider struct { 57 account string 58 email string 59 audience string // not empty iff using ID tokens 60 cacheKey CacheKey 61 } 62 63 // NewGCETokenProvider returns TokenProvider that knows how to use GCE metadata 64 // server. 65 func NewGCETokenProvider(ctx context.Context, account string, scopes []string, audience string) (TokenProvider, error) { 66 // When running on GKE using Workload Identities, the metadata is served by 67 // gke-metadata-server pod, which may be very slow, especially when the node 68 // has just started. We'll wait for it to become responsive by retrying 69 // transient errors a bunch of times. 70 var p TokenProvider 71 err := retry.Retry(ctx, transient.Only(retryParams), func() error { 72 var err error 73 p, err = attemptInit(ctx, account, scopes, audience) 74 return err 75 }, retry.LogCallback(ctx, "initializing GCE token provider")) 76 return p, err 77 } 78 79 // retryParams defines the retry strategy for attemptInit. 80 func retryParams() retry.Iterator { 81 return &retry.ExponentialBackoff{ 82 Limited: retry.Limited{ 83 Delay: 100 * time.Millisecond, 84 MaxTotal: 5 * time.Minute, 85 Retries: -1, // until the overall MaxTotal timeout 86 }, 87 Multiplier: 2, 88 MaxDelay: 10 * time.Second, 89 } 90 } 91 92 // attemptInit attempts to initialize GCE token provider. 93 func attemptInit(ctx context.Context, account string, scopes []string, audience string) (TokenProvider, error) { 94 // This mutex is used to avoid hitting GKE metadata server concurrently if 95 // we have a stampede of goroutines. It doesn't actually protect any shared 96 // state in the current process. 97 globalGCELock.Lock() 98 defer globalGCELock.Unlock() 99 100 if account == "" { 101 account = "default" 102 } 103 104 // Grab an email associated with the account. This must not be failing on 105 // a healthy VM if the account is present. If it does, the metadata server is 106 // broken. 107 email, err := metadataClient.Email(account) 108 if err != nil { 109 // Note: we purposefully delay this check only after the first call to 110 // the metadata fails because metadata.OnGCE was observed to often report 111 // "false" when running on GKE due to gke-metadata-server being slow. Our 112 // metadataClient has (much) higher timeouts that the client used by 113 // metadata.OnGCE, and it handles slow gke-metadata-server better. So if we 114 // end up here and metadata.OnGCE also says "false", then we are not on GCE 115 // with high probability. The downside is that it may take up to 15 sec to 116 // detect this (or whatever ResponseHeaderTimeout in metadataClient is). 117 if !metadata.OnGCE() { 118 return nil, ErrBadCredentials 119 } 120 if _, yep := err.(metadata.NotDefinedError); yep { 121 return nil, ErrInsufficientAccess 122 } 123 return nil, transient.Tag.Apply(err) 124 } 125 126 // Ensure the account has requested scopes. Assume 'cloud-platform' scope 127 // covers all possible scopes. This is important when using GKE Workload 128 // Identities: the metadata server always reports only 'cloud-platform' scope 129 // there. Its presence should be enough to cover all scopes used in practice. 130 // The exception is non-cloud scopes (like gerritcodereview or G Suite). To 131 // use such scopes, one will have to use impersonation through Cloud IAM APIs, 132 // which *are* covered by cloud-platform (see ActAsServiceAccount in auth.go). 133 if audience == "" { 134 availableScopes, err := metadataClient.Scopes(account) 135 if err != nil { 136 return nil, transient.Tag.Apply(err) 137 } 138 availableSet := stringset.NewFromSlice(availableScopes...) 139 if !availableSet.Has("https://www.googleapis.com/auth/cloud-platform") { 140 for _, requested := range scopes { 141 if !availableSet.Has(requested) { 142 logging.Warningf(ctx, "GCE service account %q doesn't have required scope %q (all scopes: %q)", account, requested, availableScopes) 143 return nil, ErrInsufficientAccess 144 } 145 } 146 } 147 } 148 149 return &gceTokenProvider{ 150 account: account, 151 email: email, 152 audience: audience, 153 cacheKey: CacheKey{ 154 Key: fmt.Sprintf("gce/%s", account), 155 Scopes: scopes, 156 }, 157 }, nil 158 } 159 160 func (p *gceTokenProvider) RequiresInteraction() bool { 161 return false 162 } 163 164 func (p *gceTokenProvider) Lightweight() bool { 165 return true 166 } 167 168 func (p *gceTokenProvider) Email() string { 169 return p.email 170 } 171 172 func (p *gceTokenProvider) CacheKey(ctx context.Context) (*CacheKey, error) { 173 return &p.cacheKey, nil 174 } 175 176 func (p *gceTokenProvider) MintToken(ctx context.Context, base *Token) (*Token, error) { 177 // This mutex is used to avoid hitting GKE metadata server concurrently if 178 // we have a stampede of goroutines. It doesn't actually protect any shared 179 // state in the current process. 180 globalGCELock.Lock() 181 defer globalGCELock.Unlock() 182 if p.audience != "" { 183 return p.mintIDToken(ctx) 184 } 185 return p.mintAccessToken(ctx) 186 } 187 188 // mintIDToken calls /identity metadata server endpoint. 189 func (p gceTokenProvider) mintIDToken(ctx context.Context) (*Token, error) { 190 v := url.Values{ 191 "audience": []string{p.audience}, 192 "format": []string{"full"}, // include VM instance info into claims 193 } 194 urlSuffix := fmt.Sprintf("instance/service-accounts/%s/identity?%s", p.account, v.Encode()) 195 token, err := metadataClient.Get(urlSuffix) 196 if err != nil { 197 return nil, errors.Annotate(err, "auth/gce: metadata server call failed").Tag(transient.Tag).Err() 198 } 199 200 claims, err := ParseIDTokenClaims(token) 201 if err != nil { 202 return nil, errors.Annotate(err, "auth/gce: metadata server returned invalid ID token").Err() 203 } 204 205 return &Token{ 206 Token: oauth2.Token{ 207 TokenType: "Bearer", 208 AccessToken: NoAccessToken, 209 Expiry: time.Unix(claims.Exp, 0), 210 }, 211 IDToken: token, 212 Email: p.Email(), 213 }, nil 214 } 215 216 // mintAccessToken calls /token metadata server endpoint. 217 // 218 // Note: this code is very similar to ComputeTokenSource(p.account).Token() 219 // from [1], except it uses our custom metadataClient which is more forgiving 220 // of the slowness of the gke-metadata-server. 221 // 222 // [1]: google/google.go file in https://github.com/golang/oauth2 223 func (p *gceTokenProvider) mintAccessToken(ctx context.Context) (*Token, error) { 224 tokenJSON, err := metadataClient.Get("instance/service-accounts/" + p.account + "/token") 225 if err != nil { 226 return nil, errors.Annotate(err, "auth/gce: metadata server call failed").Tag(transient.Tag).Err() 227 } 228 229 var res struct { 230 AccessToken string `json:"access_token"` 231 ExpiresInSec int `json:"expires_in"` 232 TokenType string `json:"token_type"` 233 } 234 switch err = json.NewDecoder(strings.NewReader(tokenJSON)).Decode(&res); { 235 case err != nil: 236 return nil, errors.Annotate(err, "auth/gce: invalid token JSON from metadata").Tag(transient.Tag).Err() 237 case res.ExpiresInSec == 0 || res.AccessToken == "": 238 return nil, errors.Reason("auth/gce: incomplete token received from metadata").Tag(transient.Tag).Err() 239 } 240 241 tok := oauth2.Token{ 242 AccessToken: res.AccessToken, 243 TokenType: res.TokenType, 244 Expiry: clock.Now(ctx).Add(time.Duration(res.ExpiresInSec) * time.Second), 245 } 246 247 return &Token{ 248 // Replicate the hidden magic state added by computeSource.Token(). 249 Token: *tok.WithExtra(map[string]any{ 250 "oauth2.google.tokenSource": "compute-metadata", 251 "oauth2.google.serviceAccount": p.account, 252 }), 253 IDToken: NoIDToken, 254 Email: p.Email(), 255 }, nil 256 } 257 258 func (p *gceTokenProvider) RefreshToken(ctx context.Context, prev, base *Token) (*Token, error) { 259 // Minting and refreshing on GCE is the same thing: a call to metadata server. 260 return p.MintToken(ctx, base) 261 }