go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/secrets/gsm.go (about) 1 // Copyright 2020 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package secrets 16 17 import ( 18 "bytes" 19 "container/heap" 20 "context" 21 "encoding/base64" 22 "fmt" 23 "math" 24 "strconv" 25 "strings" 26 "sync" 27 "time" 28 29 "cloud.google.com/go/secretmanager/apiv1/secretmanagerpb" 30 "github.com/google/tink/go/aead" 31 "github.com/google/tink/go/insecurecleartextkeyset" 32 "github.com/google/tink/go/keyset" 33 gax "github.com/googleapis/gax-go/v2" 34 "golang.org/x/sync/errgroup" 35 "google.golang.org/grpc/codes" 36 "google.golang.org/grpc/status" 37 38 "go.chromium.org/luci/common/clock" 39 "go.chromium.org/luci/common/data/rand/mathrand" 40 "go.chromium.org/luci/common/errors" 41 "go.chromium.org/luci/common/logging" 42 "go.chromium.org/luci/common/tsmon/field" 43 "go.chromium.org/luci/common/tsmon/metric" 44 "go.chromium.org/luci/common/tsmon/types" 45 ) 46 47 // The version of the loaded secret per alias. 48 var versionMetric = metric.NewInt( 49 "secrets/gsm/version", 50 "Version number of a currently loaded Google Secret Manager secret", 51 &types.MetricMetadata{}, 52 field.String("project"), // GCP project with the secret 53 field.String("secret"), // the name of the secret 54 field.String("alias"), // one of "current", "previous", "next" 55 ) 56 57 // SecretManagerStore implements Store using Google Secret Manager. 58 // 59 // Stored secrets are fetched directly from Google Secret Manager. Random 60 // secrets are derived from a root secret using HKDF via DerivedStore. 61 type SecretManagerStore struct { 62 // CloudProject is used for loading secrets of the form "sm://<name>". 63 CloudProject string 64 // AccessSecretVersion is an RPC to fetch the secret from the Secret Manager. 65 AccessSecretVersion func(context.Context, *secretmanagerpb.AccessSecretVersionRequest, ...gax.CallOption) (*secretmanagerpb.AccessSecretVersionResponse, error) 66 67 randomSecrets Store // the store used by RandomSecret 68 69 rwm sync.RWMutex 70 secretsByName map[string]*trackedSecret 71 secretsByTime trackedSecretsPQ 72 wakeUp chan struct{} 73 handlers map[string][]RotationHandler 74 75 testingEvents chan string // used in tests 76 } 77 78 // LoadRootSecret loads the root secret used to generate random secrets. 79 // 80 // See StoredSecret for the format of the root secret. 81 func (sm *SecretManagerStore) LoadRootSecret(ctx context.Context, rootSecret string) error { 82 secret, err := sm.StoredSecret(ctx, rootSecret) 83 if err != nil { 84 return errors.Annotate(err, "failed to read the initial value of the root secret").Err() 85 } 86 derivedStore := NewDerivedStore(secret) 87 sm.AddRotationHandler(ctx, rootSecret, func(_ context.Context, secret Secret) { 88 derivedStore.SetRoot(secret) 89 }) 90 sm.SetRandomSecretsStore(derivedStore) 91 return nil 92 } 93 94 // SetRandomSecretsStore changes the store used for RandomSecret(...). 95 // 96 // Can be used instead of LoadRootSecret to hook up a custom implementation. 97 func (sm *SecretManagerStore) SetRandomSecretsStore(s Store) { 98 sm.randomSecrets = s 99 } 100 101 // MaintenanceLoop runs a loop that periodically rereads secrets. 102 // 103 // It exits on context cancellation. Logs errors inside. 104 func (sm *SecretManagerStore) MaintenanceLoop(ctx context.Context) { 105 wg := sync.WaitGroup{} 106 defer wg.Wait() 107 108 for ctx.Err() == nil { 109 var nextReload time.Time 110 var wakeUp chan struct{} 111 112 sm.emitTestingEvent(ctx, "checking") 113 114 sm.rwm.Lock() 115 for sm.reloadNextSecretLocked(ctx, &wg) { 116 } 117 if len(sm.secretsByTime) != 0 { 118 nextReload = sm.secretsByTime[0].nextReload 119 } 120 wakeUp = make(chan struct{}) 121 sm.wakeUp = wakeUp // closed in StoredSecret 122 sm.rwm.Unlock() 123 124 sm.emitTestingEvent(ctx, "sleeping") 125 if !nextReload.IsZero() { 126 sleep := nextReload.Sub(clock.Now(ctx)) 127 logging.Debugf(ctx, "Sleeping %s until the next scheduled refresh", sleep) 128 select { 129 case <-wakeUp: 130 sm.emitTestingEvent(ctx, "woken") 131 case <-clock.After(ctx, sleep): 132 sm.emitTestingEvent(ctx, "slept %s", sleep.Round(time.Second)) 133 } 134 } else { 135 select { 136 case <-wakeUp: 137 sm.emitTestingEvent(ctx, "woken") 138 case <-ctx.Done(): 139 } 140 } 141 } 142 } 143 144 // RandomSecret returns a random secret given its name. 145 func (sm *SecretManagerStore) RandomSecret(ctx context.Context, name string) (Secret, error) { 146 if sm.randomSecrets == nil { 147 return Secret{}, errors.Reason("random secrets store is not initialized").Err() 148 } 149 return sm.randomSecrets.RandomSecret(ctx, name) 150 } 151 152 // StoredSecret returns a stored secret given its name. 153 // 154 // Value of `name` should have form: 155 // - `sm://<project>/<secret>`: a concrete secret in Google Secret Manager. 156 // - `sm://<secret>`: same as `sm://<CloudProject>/<secret>`. 157 // - `devsecret://<base64-encoded secret>`: return this concrete secret. 158 // - `devsecret-gen://tink/aead`: generate a new secret of the Tink AEAD. 159 // - `devsecret-text://<string>`: return this concrete secret. 160 // 161 // Caches secrets loaded from Google Secret Manager in memory and sets up 162 // a periodic background task to update the cached values to facilitate graceful 163 // rotation. 164 // 165 // Calls to StoredSecret return the latest value from this local cache and thus 166 // are fast. To be notified about changes to the secret as soon as they are 167 // detected use AddRotationHandler. 168 func (sm *SecretManagerStore) StoredSecret(ctx context.Context, name string) (Secret, error) { 169 name, err := sm.normalizeName(name) 170 if err != nil { 171 return Secret{}, err 172 } 173 174 sm.rwm.RLock() 175 known := sm.secretsByName[name] 176 sm.rwm.RUnlock() 177 if known != nil { 178 return known.value, nil 179 } 180 181 // Note: this lock effectively means we load one secret at a time. This should 182 // be fine, there shouldn't be many secrets. And it is probably better to 183 // serialize all loading than to hit the GSM from a lot of handlers at the 184 // same time when referring to a "popular" secret. 185 sm.rwm.Lock() 186 defer sm.rwm.Unlock() 187 188 // Double check after grabbing the write lock. 189 if known := sm.secretsByName[name]; known != nil { 190 return known.value, nil 191 } 192 193 // Read the initial values of the secret. 194 secret, err := sm.readSecret(ctx, name) 195 if err != nil { 196 return Secret{}, err 197 } 198 secret.logActiveVersions(ctx) 199 secret.logNextReloadTime(ctx) 200 201 if sm.secretsByName == nil { 202 sm.secretsByName = make(map[string]*trackedSecret, 1) 203 } 204 sm.secretsByName[name] = secret 205 206 // Wake up the MaintenanceLoop (if any) to let it reschedule the next refresh. 207 if !secret.nextReload.IsZero() { 208 heap.Push(&sm.secretsByTime, secret) 209 if sm.wakeUp != nil { 210 close(sm.wakeUp) 211 sm.wakeUp = nil 212 } 213 } 214 215 return secret.value, nil 216 } 217 218 // AddRotationHandler registers a callback which is called when the stored 219 // secret is updated. 220 // 221 // The handler is called from an internal goroutine and receives a context 222 // passed to MaintenanceLoop. If multiple handlers for the same secret are 223 // registered, they are called in order of their registration one by one. 224 func (sm *SecretManagerStore) AddRotationHandler(ctx context.Context, name string, cb RotationHandler) error { 225 switch name, err := sm.normalizeName(name); { 226 case err != nil: 227 return err 228 229 case !strings.HasPrefix(name, "sm://"): 230 return nil // no updates for static secrets 231 232 default: 233 sm.rwm.Lock() 234 defer sm.rwm.Unlock() 235 if sm.handlers == nil { 236 sm.handlers = make(map[string][]RotationHandler, 1) 237 } 238 sm.handlers[name] = append(sm.handlers[name], cb) 239 return nil 240 } 241 } 242 243 // ReportMetrics is called on each metrics flush to populate metrics. 244 func (sm *SecretManagerStore) ReportMetrics(ctx context.Context) { 245 sm.rwm.RLock() 246 defer sm.rwm.RUnlock() 247 for name, secret := range sm.secretsByName { 248 // `name` is output of normalizeName(...) 249 if strings.HasPrefix(name, "sm://") { 250 // `name` is "sm://<project>/<secret>". 251 parts := strings.SplitN(name[len("sm://"):], "/", 2) 252 project, name := parts[0], parts[1] 253 versionMetric.Set(ctx, secret.versionCurrent, project, name, "current") 254 versionMetric.Set(ctx, secret.versionPrevious, project, name, "previous") 255 versionMetric.Set(ctx, secret.versionNext, project, name, "next") 256 } 257 } 258 } 259 260 //////////////////////////////////////////////////////////////////////////////// 261 262 const ( 263 // Randomized secret reloading interval. It is pretty big, since secrets are 264 // assumed to be rotated infrequently. In rare emergencies a service can be 265 // restarted to pick new secrets faster. 266 reloadIntervalMin = 2 * time.Hour 267 reloadIntervalMax = 4 * time.Hour 268 269 // Max delay when retrying failing fetches. 270 maxRetryDelay = 30 * time.Minute 271 ) 272 273 // trackedSecret is a secret that is periodically reread in MaintenanceLoop. 274 // 275 // Instances of this type are static once constructed and thus are safe to 276 // share across goroutines. 277 type trackedSecret struct { 278 name string // the name it was loaded under 279 value Secret // the latest fetched state of the secret 280 281 versionCurrent int64 // the currently active version or 0 for static dev secrets 282 versionPrevious int64 // the previously active version or 0 if not available 283 versionNext int64 // the next active version or 0 if not available 284 285 attempts int // how many consecutive times we failed to reload the secret 286 nextReload time.Time // when we should reload the secret or zero for static dev secrets 287 } 288 289 func (s *trackedSecret) logActiveVersions(ctx context.Context) { 290 if s.versionCurrent == 0 { 291 return 292 } 293 294 formatVer := func(v int64) string { 295 if v == 0 { 296 return "none" 297 } 298 return strconv.FormatInt(v, 10) 299 } 300 301 logging.Infof(ctx, "Loaded secret %q (versions: current=%s, previous=%s, next=%s)", 302 s.name, 303 formatVer(s.versionCurrent), 304 formatVer(s.versionPrevious), 305 formatVer(s.versionNext), 306 ) 307 } 308 309 func (s *trackedSecret) logNextReloadTime(ctx context.Context) { 310 if !s.nextReload.IsZero() { 311 logging.Debugf(ctx, "Will attempt to reload the secret %q in %s", s.name, s.nextReload.Sub(clock.Now(ctx))) 312 } 313 } 314 315 type trackedSecretsPQ []*trackedSecret 316 317 func (pq trackedSecretsPQ) Len() int { return len(pq) } 318 func (pq trackedSecretsPQ) Less(i, j int) bool { return pq[i].nextReload.Before(pq[j].nextReload) } 319 func (pq trackedSecretsPQ) Swap(i, j int) { pq[i], pq[j] = pq[j], pq[i] } 320 321 func (pq *trackedSecretsPQ) Push(x any) { 322 *pq = append(*pq, x.(*trackedSecret)) 323 } 324 325 func (pq *trackedSecretsPQ) Pop() any { 326 panic("Pop is not actually used, but defined to comply with heap.Interface") 327 } 328 329 // normalizeName check the secret name format and normalizes it. 330 func (sm *SecretManagerStore) normalizeName(name string) (string, error) { 331 switch { 332 case strings.HasPrefix(name, "devsecret://"): 333 return name, nil 334 335 case strings.HasPrefix(name, "devsecret-gen://"): 336 return name, nil 337 338 case strings.HasPrefix(name, "devsecret-text://"): 339 return name, nil 340 341 case strings.HasPrefix(name, "sm://"): 342 switch parts := strings.Split(strings.TrimPrefix(name, "sm://"), "/"); { 343 case len(parts) == 1: 344 if sm.CloudProject == "" { 345 return "", errors.Reason("can't use secret reference %q when the Cloud Project name is not configured", name).Err() 346 } 347 return fmt.Sprintf("sm://%s/%s", sm.CloudProject, parts[0]), nil 348 case len(parts) == 2: 349 return name, nil 350 default: 351 return "", errors.Reason("sm:// secret reference should have form sm://<name> or sm://<project>/<name>").Err() 352 } 353 354 default: 355 return "", errors.Reason("not supported secret reference %q", name).Err() 356 } 357 } 358 359 // readSecret fetches a secret given its normalized name. 360 func (sm *SecretManagerStore) readSecret(ctx context.Context, name string) (*trackedSecret, error) { 361 switch { 362 case strings.HasPrefix(name, "devsecret://"): 363 value, err := base64.RawStdEncoding.DecodeString(strings.TrimPrefix(name, "devsecret://")) 364 if err != nil { 365 return nil, errors.Annotate(err, "bad devsecret://, not base64 encoding").Err() 366 } 367 return &trackedSecret{ 368 name: name, 369 value: Secret{Active: value}, 370 }, nil 371 372 case strings.HasPrefix(name, "devsecret-gen://"): 373 switch kind := strings.TrimPrefix(name, "devsecret-gen://"); kind { 374 case "tink/aead": 375 value, err := generateDevTinkAEADKeyset(ctx) 376 if err != nil { 377 return nil, errors.Annotate(err, "failed to generate new tink AEAD keyset").Err() 378 } 379 return &trackedSecret{ 380 name: name, 381 value: Secret{Active: value}, 382 }, nil 383 default: 384 return nil, errors.Reason("devsecret-gen:// kind %q is not supported", kind).Err() 385 } 386 387 case strings.HasPrefix(name, "devsecret-text://"): 388 return &trackedSecret{ 389 name: name, 390 value: Secret{Active: []byte(strings.TrimPrefix(name, "devsecret-text://"))}, 391 }, nil 392 393 case strings.HasPrefix(name, "sm://"): 394 return sm.readSecretFromGSM(ctx, name) 395 396 default: 397 panic("impossible, already checked in normalizeSecretName") 398 } 399 } 400 401 // readSecretFromGSM returns a sm://... secret given its normalized name. 402 func (sm *SecretManagerStore) readSecretFromGSM(ctx context.Context, name string) (*trackedSecret, error) { 403 logging.Debugf(ctx, "Loading secret %q", name) 404 405 // `name` here must have sm://<project>/<secret> format. 406 parts := strings.Split(strings.TrimPrefix(name, "sm://"), "/") 407 if len(parts) != 2 { 408 panic("impossible, should be normalize already") 409 } 410 project, secret := parts[0], parts[1] 411 412 // Try to access "current", "previous", "next" versions if they are available. 413 eg, egctx := errgroup.WithContext(ctx) 414 var ( 415 current secretVersion 416 previous secretVersion 417 next secretVersion 418 ) 419 attemptAccess := func(ver string, dest *secretVersion) error { 420 switch resp, err := sm.accessSecretVersion(egctx, project, secret, ver); { 421 case err == nil: 422 *dest = *resp 423 case !isMissingOrDisabledVersion(err): 424 return errors.Annotate(err, "version %q", ver).Err() 425 } 426 return nil 427 } 428 eg.Go(func() error { return attemptAccess("current", ¤t) }) 429 eg.Go(func() error { return attemptAccess("previous", &previous) }) 430 eg.Go(func() error { return attemptAccess("next", &next) }) 431 if err := eg.Wait(); err != nil { 432 return nil, errors.Annotate(err, "loading the secret %q", name).Err() 433 } 434 435 // If there's no "current", fallback to loading "latest" instead of "current" 436 // and the one version before it as instead of "previous" (if necessary). This 437 // is a scheme used by this code previously. 438 if current.version == 0 { 439 latest, err := sm.accessSecretVersion(ctx, project, secret, "latest") 440 if err != nil { 441 return nil, errors.Annotate(err, "failed to get the latest version of the secret %q", name).Err() 442 } 443 current = *latest 444 if current.version > 1 && previous.version == 0 { 445 prev, err := sm.accessSecretVersion(ctx, project, secret, fmt.Sprintf("%d", current.version-1)) 446 switch { 447 case err == nil: 448 previous = *prev 449 case !isMissingOrDisabledVersion(err): 450 return nil, errors.Annotate(err, "failed to get the previous version of the secret %q", name).Err() 451 } 452 } 453 } 454 455 // Set the active version, collect non-current versions into "Passive" list. 456 value := Secret{Active: current.payload} 457 seen := map[int64]bool{current.version: true} 458 if previous.version != 0 && !seen[previous.version] { 459 value.Passive = append(value.Passive, previous.payload) 460 seen[previous.version] = true 461 } 462 if next.version != 0 && !seen[next.version] { 463 value.Passive = append(value.Passive, next.payload) 464 seen[next.version] = true 465 } 466 467 return &trackedSecret{ 468 name: name, 469 value: value, 470 versionCurrent: current.version, 471 versionPrevious: previous.version, // may be 0 472 versionNext: next.version, // may be 0 473 nextReload: nextReloadTime(ctx), 474 }, nil 475 } 476 477 type secretVersion struct { 478 version int64 // integer version, >=1 for loaded secrets 479 payload []byte // the actual secret value 480 } 481 482 // accessSecretVersion wraps the AccessSecretVersion RPC, decoding the version. 483 // 484 // Returns gRPC errors. 485 func (sm *SecretManagerStore) accessSecretVersion(ctx context.Context, project, secret, version string) (*secretVersion, error) { 486 resp, err := sm.AccessSecretVersion(ctx, &secretmanagerpb.AccessSecretVersionRequest{ 487 Name: fmt.Sprintf("projects/%s/secrets/%s/versions/%s", project, secret, version), 488 }) 489 if err != nil { 490 return nil, err 491 } 492 // The version name has format "projects/.../secrets/.../versions/<number>". 493 // We want to grab the version number. Note that GSM uses numeric project IDs 494 // in `resp.Name` instead of Cloud Project names, so we can't just trim 495 // the prefix. 496 idx := strings.LastIndex(resp.Name, "/") 497 if idx == -1 { 498 return nil, status.Errorf(codes.Unknown, "unexpected version name format %q", resp.Name) 499 } 500 ver, err := strconv.ParseInt(resp.Name[idx+1:], 10, 64) 501 if err != nil { 502 return nil, status.Errorf(codes.Unknown, "unexpected version name format %q", resp.Name) 503 } 504 if ver <= 0 { 505 return nil, status.Errorf(codes.Unknown, "the version is unexpectedly non-positive %q", resp.Name) 506 } 507 return &secretVersion{ 508 version: ver, 509 payload: resp.Payload.Data, 510 }, nil 511 } 512 513 // reloadNextSecretLocked looks at the secret at the top of secretsByTime PQ and 514 // reloads it. 515 // 516 // Returns false if the top of the queue is not due for the reload yet. 517 // 518 // Launches RotationHandler in individual goroutines adding them to `wg`. 519 func (sm *SecretManagerStore) reloadNextSecretLocked(ctx context.Context, wg *sync.WaitGroup) bool { 520 if len(sm.secretsByTime) == 0 || clock.Now(ctx).Before(sm.secretsByTime[0].nextReload) { 521 return false 522 } 523 524 // Reload the secret. This always changes its nextReload, even on failures. 525 secret := sm.secretsByTime[0] 526 updated := sm.tryReloadSecretLocked(ctx, secret) 527 heap.Fix(&sm.secretsByTime, 0) // fix after its nextReload changed 528 529 // Call RotationHandler callbacks from a goroutine to avoid blocking the loop. 530 if updated { 531 handlers := append([]RotationHandler(nil), sm.handlers[secret.name]...) 532 newValue := secret.value 533 wg.Add(1) 534 go func() { 535 defer wg.Done() 536 for _, cb := range handlers { 537 cb(ctx, newValue) 538 } 539 }() 540 sm.emitTestingEvent(ctx, "reloaded %s", secret.name) 541 } else { 542 sm.emitTestingEvent(ctx, "checked %s", secret.name) 543 } 544 545 return true 546 } 547 548 // tryReloadSecretLocked attempts to reload the secret, mutating it in-place. 549 // 550 // Returns true if the secret has a new value now or false if the value didn't 551 // change (either we failed to fetch it or it really didn't change). 552 // 553 // Logs errors inside. Fields `secret.attempts` and `secret.nextReload` are 554 // mutated even on failures. 555 func (sm *SecretManagerStore) tryReloadSecretLocked(ctx context.Context, secret *trackedSecret) bool { 556 fresh, err := sm.readSecret(ctx, secret.name) 557 if err != nil { 558 secret.attempts += 1 559 sleep := reloadBackoffInterval(ctx, secret.attempts) 560 logging.Errorf(ctx, "Failed to reload the secret (attempt %d, next try in %s): %s", secret.attempts, sleep, err) 561 secret.nextReload = clock.Now(ctx).Add(sleep) 562 return false 563 } 564 updated := !fresh.value.Equal(secret.value) 565 *secret = *fresh 566 if updated { 567 secret.logActiveVersions(ctx) 568 } 569 secret.logNextReloadTime(ctx) 570 return updated 571 } 572 573 // emitTestingEvent is used in tests to expose what MaintenanceLoop is doing. 574 func (sm *SecretManagerStore) emitTestingEvent(ctx context.Context, msg string, args ...any) { 575 if sm.testingEvents != nil { 576 select { 577 case sm.testingEvents <- fmt.Sprintf(msg, args...): 578 case <-ctx.Done(): 579 } 580 } 581 } 582 583 // isMissingOrDisabledVersion checks for the gRPC code representing missing or 584 // disabled versions. 585 func isMissingOrDisabledVersion(err error) bool { 586 code := status.Code(err) 587 return code == codes.NotFound || code == codes.FailedPrecondition 588 } 589 590 // nextReloadTime returns a time when we should try to reload the secret. 591 func nextReloadTime(ctx context.Context) time.Time { 592 dt := reloadIntervalMin + time.Duration(mathrand.Int63n(ctx, int64(reloadIntervalMax-reloadIntervalMin))) 593 return clock.Now(ctx).Add(dt) 594 } 595 596 // reloadBackoffInterval tells how long to sleep after a failed reload attempt. 597 func reloadBackoffInterval(ctx context.Context, attempt int) time.Duration { 598 factor := math.Pow(2.0, float64(attempt)) // 2, 4, 8, ... 599 factor += 10 * mathrand.Float64(ctx) 600 dur := time.Duration(float64(time.Second) * factor) 601 if dur > maxRetryDelay { 602 dur = maxRetryDelay 603 } 604 return dur 605 } 606 607 // generateDevTinkAEADKeyset generates "devsecret-gen://tink/aead" key. 608 func generateDevTinkAEADKeyset(ctx context.Context) ([]byte, error) { 609 kh, err := keyset.NewHandle(aead.AES256GCMKeyTemplate()) 610 if err != nil { 611 return nil, errors.Annotate(err, "failed to generate from template").Err() 612 } 613 buf := bytes.NewBuffer(nil) 614 if err := insecurecleartextkeyset.Write(kh, keyset.NewJSONWriter(buf)); err != nil { 615 return nil, errors.Annotate(err, "failed to serialize newly generated keyset").Err() 616 } 617 value := buf.Bytes() 618 logging.Infof( 619 ctx, 620 "Generated a new development AEAD keyset. To re-use locally during development, "+ 621 "replace \"devsecret-gen://tink/aead\" with the value below\n\tdevsecret://%s", 622 base64.RawStdEncoding.EncodeToString(value), 623 ) 624 return value, nil 625 }