github.com/terramate-io/tf@v0.0.0-20230830114523-fce866b4dfcd/backend/remote-state/consul/client.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package consul 5 6 import ( 7 "bytes" 8 "compress/gzip" 9 "context" 10 "crypto/md5" 11 "encoding/json" 12 "errors" 13 "fmt" 14 "log" 15 "strings" 16 "sync" 17 "time" 18 19 consulapi "github.com/hashicorp/consul/api" 20 multierror "github.com/hashicorp/go-multierror" 21 "github.com/terramate-io/tf/states/remote" 22 "github.com/terramate-io/tf/states/statemgr" 23 ) 24 25 const ( 26 lockSuffix = "/.lock" 27 lockInfoSuffix = "/.lockinfo" 28 29 // The Session TTL associated with this lock. 30 lockSessionTTL = "15s" 31 32 // the delay time from when a session is lost to when the 33 // lock is released by the server 34 lockDelay = 5 * time.Second 35 // interval between attempts to reacquire a lost lock 36 lockReacquireInterval = 2 * time.Second 37 ) 38 39 var lostLockErr = errors.New("consul lock was lost") 40 41 // RemoteClient is a remote client that stores data in Consul. 42 type RemoteClient struct { 43 Client *consulapi.Client 44 Path string 45 GZip bool 46 47 mu sync.Mutex 48 // lockState is true if we're using locks 49 lockState bool 50 51 // The index of the last state we wrote. 52 // If this is > 0, Put will perform a CAS to ensure that the state wasn't 53 // changed during the operation. This is important even with locks, because 54 // if the client loses the lock for some reason, then reacquires it, we 55 // need to make sure that the state was not modified. 56 modifyIndex uint64 57 58 consulLock *consulapi.Lock 59 lockCh <-chan struct{} 60 61 info *statemgr.LockInfo 62 63 // cancel our goroutine which is monitoring the lock to automatically 64 // reacquire it when possible. 65 monitorCancel context.CancelFunc 66 monitorWG sync.WaitGroup 67 68 // sessionCancel cancels the Context use for session.RenewPeriodic, and is 69 // called when unlocking, or before creating a new lock if the lock is 70 // lost. 71 sessionCancel context.CancelFunc 72 } 73 74 func (c *RemoteClient) Get() (*remote.Payload, error) { 75 c.mu.Lock() 76 defer c.mu.Unlock() 77 78 kv := c.Client.KV() 79 80 chunked, hash, chunks, pair, err := c.chunkedMode() 81 if err != nil { 82 return nil, err 83 } 84 if pair == nil { 85 return nil, nil 86 } 87 88 c.modifyIndex = pair.ModifyIndex 89 90 var payload []byte 91 if chunked { 92 for _, c := range chunks { 93 pair, _, err := kv.Get(c, nil) 94 if err != nil { 95 return nil, err 96 } 97 if pair == nil { 98 return nil, fmt.Errorf("Key %q could not be found", c) 99 } 100 payload = append(payload, pair.Value[:]...) 101 } 102 } else { 103 payload = pair.Value 104 } 105 106 // If the payload starts with 0x1f, it's gzip, not json 107 if len(payload) >= 1 && payload[0] == '\x1f' { 108 payload, err = uncompressState(payload) 109 if err != nil { 110 return nil, err 111 } 112 } 113 114 md5 := md5.Sum(payload) 115 116 if hash != "" && fmt.Sprintf("%x", md5) != hash { 117 return nil, fmt.Errorf("The remote state does not match the expected hash") 118 } 119 120 return &remote.Payload{ 121 Data: payload, 122 MD5: md5[:], 123 }, nil 124 } 125 126 func (c *RemoteClient) Put(data []byte) error { 127 // The state can be stored in 4 different ways, based on the payload size 128 // and whether the user enabled gzip: 129 // - single entry mode with plain JSON: a single JSON is stored at 130 // "tfstate/my_project" 131 // - single entry mode gzip: the JSON payload is first gziped and stored at 132 // "tfstate/my_project" 133 // - chunked mode with plain JSON: the JSON payload is split in pieces and 134 // stored like so: 135 // - "tfstate/my_project" -> a JSON payload that contains the path of 136 // the chunks and an MD5 sum like so: 137 // { 138 // "current-hash": "abcdef1234", 139 // "chunks": [ 140 // "tfstate/my_project/tfstate.abcdef1234/0", 141 // "tfstate/my_project/tfstate.abcdef1234/1", 142 // "tfstate/my_project/tfstate.abcdef1234/2", 143 // ] 144 // } 145 // - "tfstate/my_project/tfstate.abcdef1234/0" -> The first chunk 146 // - "tfstate/my_project/tfstate.abcdef1234/1" -> The next one 147 // - ... 148 // - chunked mode with gzip: the same system but we gziped the JSON payload 149 // before splitting it in chunks 150 // 151 // When overwritting the current state, we need to clean the old chunks if 152 // we were in chunked mode (no matter whether we need to use chunks for the 153 // new one). To do so based on the 4 possibilities above we look at the 154 // value at "tfstate/my_project" and if it is: 155 // - absent then it's a new state and there will be nothing to cleanup, 156 // - not a JSON payload we were in single entry mode with gzip so there will 157 // be nothing to cleanup 158 // - a JSON payload, then we were either single entry mode with plain JSON 159 // or in chunked mode. To differentiate between the two we look whether a 160 // "current-hash" key is present in the payload. If we find one we were 161 // in chunked mode and we will need to remove the old chunks (whether or 162 // not we were using gzip does not matter in that case). 163 164 c.mu.Lock() 165 defer c.mu.Unlock() 166 167 kv := c.Client.KV() 168 169 // First we determine what mode we were using and to prepare the cleanup 170 chunked, hash, _, _, err := c.chunkedMode() 171 if err != nil { 172 return err 173 } 174 cleanupOldChunks := func() {} 175 if chunked { 176 cleanupOldChunks = func() { 177 // We ignore all errors that can happen here because we already 178 // saved the new state and there is no way to return a warning to 179 // the user. We may end up with dangling chunks but there is no way 180 // to be sure we won't. 181 path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%s/", hash) 182 kv.DeleteTree(path, nil) 183 } 184 } 185 186 payload := data 187 if c.GZip { 188 if compressedState, err := compressState(data); err == nil { 189 payload = compressedState 190 } else { 191 return err 192 } 193 } 194 195 // default to doing a CAS 196 verb := consulapi.KVCAS 197 198 // Assume a 0 index doesn't need a CAS for now, since we are either 199 // creating a new state or purposely overwriting one. 200 if c.modifyIndex == 0 { 201 verb = consulapi.KVSet 202 } 203 204 // The payload may be too large to store in a single KV entry in Consul. We 205 // could try to determine whether it will fit or not before sending the 206 // request but since we are using the Transaction API and not the KV API, 207 // it grows by about a 1/3 when it is base64 encoded plus the overhead of 208 // the fields specific to the Transaction API. 209 // Rather than trying to calculate the overhead (which could change from 210 // one version of Consul to another, and between Consul Community Edition 211 // and Consul Enterprise), we try to send the whole state in one request, if 212 // it fails because it is too big we then split it in chunks and send each 213 // chunk separately. 214 // When splitting in chunks, we make each chunk 524288 bits, which is the 215 // default max size for raft. If the user changed it, we still may send 216 // chunks too big and fail but this is not a setting that should be fiddled 217 // with anyway. 218 219 store := func(payload []byte) error { 220 // KV.Put doesn't return the new index, so we use a single operation 221 // transaction to get the new index with a single request. 222 txOps := consulapi.KVTxnOps{ 223 &consulapi.KVTxnOp{ 224 Verb: verb, 225 Key: c.Path, 226 Value: payload, 227 Index: c.modifyIndex, 228 }, 229 } 230 231 ok, resp, _, err := kv.Txn(txOps, nil) 232 if err != nil { 233 return err 234 } 235 // transaction was rolled back 236 if !ok { 237 var resultErr error 238 for _, respError := range resp.Errors { 239 resultErr = multierror.Append(resultErr, errors.New(respError.What)) 240 } 241 return fmt.Errorf("consul CAS failed with transaction errors: %w", resultErr) 242 } 243 244 if len(resp.Results) != 1 { 245 // this probably shouldn't happen 246 return fmt.Errorf("expected on 1 response value, got: %d", len(resp.Results)) 247 } 248 249 c.modifyIndex = resp.Results[0].ModifyIndex 250 251 // We remove all the old chunks 252 cleanupOldChunks() 253 254 return nil 255 } 256 257 if err = store(payload); err == nil { 258 // The payload was small enough to be stored 259 return nil 260 } else if !strings.Contains(err.Error(), "too large") { 261 // We failed for some other reason, report this to the user 262 return err 263 } 264 265 // The payload was too large so we split it in multiple chunks 266 267 md5 := md5.Sum(data) 268 chunks := split(payload, 524288) 269 chunkPaths := make([]string, 0) 270 271 // First we write the new chunks 272 for i, p := range chunks { 273 path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%x/%d", md5, i) 274 chunkPaths = append(chunkPaths, path) 275 _, err := kv.Put(&consulapi.KVPair{ 276 Key: path, 277 Value: p, 278 }, nil) 279 280 if err != nil { 281 return err 282 } 283 } 284 285 // Then we update the link to point to the new chunks 286 payload, err = json.Marshal(map[string]interface{}{ 287 "current-hash": fmt.Sprintf("%x", md5), 288 "chunks": chunkPaths, 289 }) 290 if err != nil { 291 return err 292 } 293 return store(payload) 294 } 295 296 func (c *RemoteClient) Delete() error { 297 c.mu.Lock() 298 defer c.mu.Unlock() 299 300 kv := c.Client.KV() 301 302 chunked, hash, _, _, err := c.chunkedMode() 303 if err != nil { 304 return err 305 } 306 307 _, err = kv.Delete(c.Path, nil) 308 309 // If there were chunks we need to remove them 310 if chunked { 311 path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%s/", hash) 312 kv.DeleteTree(path, nil) 313 } 314 315 return err 316 } 317 318 func (c *RemoteClient) lockPath() string { 319 // we sanitize the path for the lock as Consul does not like having 320 // two consecutive slashes for the lock path 321 return strings.TrimRight(c.Path, "/") 322 } 323 324 func (c *RemoteClient) putLockInfo(info *statemgr.LockInfo) error { 325 info.Path = c.Path 326 info.Created = time.Now().UTC() 327 328 kv := c.Client.KV() 329 _, err := kv.Put(&consulapi.KVPair{ 330 Key: c.lockPath() + lockInfoSuffix, 331 Value: info.Marshal(), 332 }, nil) 333 334 return err 335 } 336 337 func (c *RemoteClient) getLockInfo() (*statemgr.LockInfo, error) { 338 path := c.lockPath() + lockInfoSuffix 339 pair, _, err := c.Client.KV().Get(path, nil) 340 if err != nil { 341 return nil, err 342 } 343 if pair == nil { 344 return nil, nil 345 } 346 347 li := &statemgr.LockInfo{} 348 err = json.Unmarshal(pair.Value, li) 349 if err != nil { 350 return nil, fmt.Errorf("error unmarshaling lock info: %s", err) 351 } 352 353 return li, nil 354 } 355 356 func (c *RemoteClient) Lock(info *statemgr.LockInfo) (string, error) { 357 c.mu.Lock() 358 defer c.mu.Unlock() 359 360 if !c.lockState { 361 return "", nil 362 } 363 364 c.info = info 365 366 // These checks only are to ensure we strictly follow the specification. 367 // Terraform shouldn't ever re-lock, so provide errors for the 2 possible 368 // states if this is called. 369 select { 370 case <-c.lockCh: 371 // We had a lock, but lost it. 372 return "", errors.New("lost consul lock, cannot re-lock") 373 default: 374 if c.lockCh != nil { 375 // we have an active lock already 376 return "", fmt.Errorf("state %q already locked", c.Path) 377 } 378 } 379 380 return c.lock() 381 } 382 383 // the lock implementation. 384 // Only to be called while holding Client.mu 385 func (c *RemoteClient) lock() (string, error) { 386 // We create a new session here, so it can be canceled when the lock is 387 // lost or unlocked. 388 lockSession, err := c.createSession() 389 if err != nil { 390 return "", err 391 } 392 393 // store the session ID for correlation with consul logs 394 c.info.Info = "consul session: " + lockSession 395 396 // A random lock ID has been generated but we override it with the session 397 // ID as this will make it easier to manually invalidate the session 398 // if needed. 399 c.info.ID = lockSession 400 401 opts := &consulapi.LockOptions{ 402 Key: c.lockPath() + lockSuffix, 403 Session: lockSession, 404 405 // only wait briefly, so terraform has the choice to fail fast or 406 // retry as needed. 407 LockWaitTime: time.Second, 408 LockTryOnce: true, 409 410 // Don't let the lock monitor give up right away, as it's possible the 411 // session is still OK. While the session is refreshed at a rate of 412 // TTL/2, the lock monitor is an idle blocking request and is more 413 // susceptible to being closed by a lower network layer. 414 MonitorRetries: 5, 415 // 416 // The delay between lock monitor retries. 417 // While the session has a 15s TTL plus a 5s wait period on a lost 418 // lock, if we can't get our lock back in 10+ seconds something is 419 // wrong so we're going to drop the session and start over. 420 MonitorRetryTime: 2 * time.Second, 421 } 422 423 c.consulLock, err = c.Client.LockOpts(opts) 424 if err != nil { 425 return "", err 426 } 427 428 lockErr := &statemgr.LockError{} 429 430 lockCh, err := c.consulLock.Lock(make(chan struct{})) 431 if err != nil { 432 lockErr.Err = err 433 return "", lockErr 434 } 435 436 if lockCh == nil { 437 lockInfo, e := c.getLockInfo() 438 if e != nil { 439 lockErr.Err = e 440 return "", lockErr 441 } 442 443 lockErr.Info = lockInfo 444 445 return "", lockErr 446 } 447 448 c.lockCh = lockCh 449 450 err = c.putLockInfo(c.info) 451 if err != nil { 452 if unlockErr := c.unlock(c.info.ID); unlockErr != nil { 453 err = multierror.Append(err, unlockErr) 454 } 455 456 return "", err 457 } 458 459 // Start a goroutine to monitor the lock state. 460 // If we lose the lock to due communication issues with the consul agent, 461 // attempt to immediately reacquire the lock. Put will verify the integrity 462 // of the state by using a CAS operation. 463 ctx, cancel := context.WithCancel(context.Background()) 464 c.monitorCancel = cancel 465 c.monitorWG.Add(1) 466 go func() { 467 defer c.monitorWG.Done() 468 select { 469 case <-c.lockCh: 470 log.Println("[ERROR] lost consul lock") 471 for { 472 c.mu.Lock() 473 // We lost our lock, so we need to cancel the session too. 474 // The CancelFunc is only replaced while holding Client.mu, so 475 // this is safe to call here. This will be replaced by the 476 // lock() call below. 477 c.sessionCancel() 478 479 c.consulLock = nil 480 _, err := c.lock() 481 c.mu.Unlock() 482 483 if err != nil { 484 // We failed to get the lock, keep trying as long as 485 // terraform is running. There may be changes in progress, 486 // so there's no use in aborting. Either we eventually 487 // reacquire the lock, or a Put will fail on a CAS. 488 log.Printf("[ERROR] could not reacquire lock: %s", err) 489 time.Sleep(lockReacquireInterval) 490 491 select { 492 case <-ctx.Done(): 493 return 494 default: 495 } 496 continue 497 } 498 499 // if the error was nil, the new lock started a new copy of 500 // this goroutine. 501 return 502 } 503 504 case <-ctx.Done(): 505 return 506 } 507 }() 508 509 if testLockHook != nil { 510 testLockHook() 511 } 512 513 return c.info.ID, nil 514 } 515 516 // called after a lock is acquired 517 var testLockHook func() 518 519 func (c *RemoteClient) createSession() (string, error) { 520 // create the context first. Even if the session creation fails, we assume 521 // that the CancelFunc is always callable. 522 ctx, cancel := context.WithCancel(context.Background()) 523 c.sessionCancel = cancel 524 525 session := c.Client.Session() 526 se := &consulapi.SessionEntry{ 527 Name: consulapi.DefaultLockSessionName, 528 TTL: lockSessionTTL, 529 LockDelay: lockDelay, 530 } 531 532 id, _, err := session.Create(se, nil) 533 if err != nil { 534 return "", err 535 } 536 537 log.Println("[INFO] created consul lock session", id) 538 539 // keep the session renewed 540 go session.RenewPeriodic(lockSessionTTL, id, nil, ctx.Done()) 541 542 return id, nil 543 } 544 545 func (c *RemoteClient) Unlock(id string) error { 546 c.mu.Lock() 547 defer c.mu.Unlock() 548 549 if !c.lockState { 550 return nil 551 } 552 553 return c.unlock(id) 554 } 555 556 // the unlock implementation. 557 // Only to be called while holding Client.mu 558 func (c *RemoteClient) unlock(id string) error { 559 // This method can be called in two circumstances: 560 // - when the plan apply or destroy operation finishes and the lock needs to be released, 561 // the watchdog stopped and the session closed 562 // - when the user calls `terraform force-unlock <lock_id>` in which case 563 // we only need to release the lock. 564 565 if c.consulLock == nil || c.lockCh == nil { 566 // The user called `terraform force-unlock <lock_id>`, we just destroy 567 // the session which will release the lock, clean the KV store and quit. 568 569 _, err := c.Client.Session().Destroy(id, nil) 570 if err != nil { 571 return err 572 } 573 // We ignore the errors that may happen during cleanup 574 kv := c.Client.KV() 575 kv.Delete(c.lockPath()+lockSuffix, nil) 576 kv.Delete(c.lockPath()+lockInfoSuffix, nil) 577 578 return nil 579 } 580 581 // cancel our monitoring goroutine 582 c.monitorCancel() 583 584 defer func() { 585 c.consulLock = nil 586 587 // The consul session is only used for this single lock, so cancel it 588 // after we unlock. 589 // The session is only created and replaced holding Client.mu, so the 590 // CancelFunc must be non-nil. 591 c.sessionCancel() 592 }() 593 594 select { 595 case <-c.lockCh: 596 return lostLockErr 597 default: 598 } 599 600 kv := c.Client.KV() 601 602 var errs error 603 604 if _, err := kv.Delete(c.lockPath()+lockInfoSuffix, nil); err != nil { 605 errs = multierror.Append(errs, err) 606 } 607 608 if err := c.consulLock.Unlock(); err != nil { 609 errs = multierror.Append(errs, err) 610 } 611 612 // the monitoring goroutine may be in a select on the lockCh, so we need to 613 // wait for it to return before changing the value. 614 c.monitorWG.Wait() 615 c.lockCh = nil 616 617 // This is only cleanup, and will fail if the lock was immediately taken by 618 // another client, so we don't report an error to the user here. 619 c.consulLock.Destroy() 620 621 return errs 622 } 623 624 func compressState(data []byte) ([]byte, error) { 625 b := new(bytes.Buffer) 626 gz := gzip.NewWriter(b) 627 if _, err := gz.Write(data); err != nil { 628 return nil, err 629 } 630 if err := gz.Flush(); err != nil { 631 return nil, err 632 } 633 if err := gz.Close(); err != nil { 634 return nil, err 635 } 636 return b.Bytes(), nil 637 } 638 639 func uncompressState(data []byte) ([]byte, error) { 640 b := new(bytes.Buffer) 641 gz, err := gzip.NewReader(bytes.NewReader(data)) 642 if err != nil { 643 return nil, err 644 } 645 b.ReadFrom(gz) 646 if err := gz.Close(); err != nil { 647 return nil, err 648 } 649 return b.Bytes(), nil 650 } 651 652 func split(payload []byte, limit int) [][]byte { 653 var chunk []byte 654 chunks := make([][]byte, 0, len(payload)/limit+1) 655 for len(payload) >= limit { 656 chunk, payload = payload[:limit], payload[limit:] 657 chunks = append(chunks, chunk) 658 } 659 if len(payload) > 0 { 660 chunks = append(chunks, payload[:]) 661 } 662 return chunks 663 } 664 665 func (c *RemoteClient) chunkedMode() (bool, string, []string, *consulapi.KVPair, error) { 666 kv := c.Client.KV() 667 pair, _, err := kv.Get(c.Path, nil) 668 if err != nil { 669 return false, "", nil, pair, err 670 } 671 if pair != nil { 672 var d map[string]interface{} 673 err = json.Unmarshal(pair.Value, &d) 674 // If there is an error when unmarshaling the payload, the state has 675 // probably been gziped in single entry mode. 676 if err == nil { 677 // If we find the "current-hash" key we were in chunked mode 678 hash, ok := d["current-hash"] 679 if ok { 680 chunks := make([]string, 0) 681 for _, c := range d["chunks"].([]interface{}) { 682 chunks = append(chunks, c.(string)) 683 } 684 return true, hash.(string), chunks, pair, nil 685 } 686 } 687 } 688 return false, "", nil, pair, nil 689 }