github.com/opentofu/opentofu@v1.7.1/internal/backend/remote-state/consul/client.go (about) 1 // Copyright (c) The OpenTofu Authors 2 // SPDX-License-Identifier: MPL-2.0 3 // Copyright (c) 2023 HashiCorp, Inc. 4 // SPDX-License-Identifier: MPL-2.0 5 6 package consul 7 8 import ( 9 "bytes" 10 "compress/gzip" 11 "context" 12 "crypto/md5" 13 "encoding/json" 14 "errors" 15 "fmt" 16 "log" 17 "strings" 18 "sync" 19 "time" 20 21 consulapi "github.com/hashicorp/consul/api" 22 multierror "github.com/hashicorp/go-multierror" 23 "github.com/opentofu/opentofu/internal/states/remote" 24 "github.com/opentofu/opentofu/internal/states/statemgr" 25 ) 26 27 const ( 28 lockSuffix = "/.lock" 29 lockInfoSuffix = "/.lockinfo" 30 31 // The Session TTL associated with this lock. 32 lockSessionTTL = "15s" 33 34 // the delay time from when a session is lost to when the 35 // lock is released by the server 36 lockDelay = 5 * time.Second 37 // interval between attempts to reacquire a lost lock 38 lockReacquireInterval = 2 * time.Second 39 ) 40 41 var lostLockErr = errors.New("consul lock was lost") 42 43 // RemoteClient is a remote client that stores data in Consul. 44 type RemoteClient struct { 45 Client *consulapi.Client 46 Path string 47 GZip bool 48 49 mu sync.Mutex 50 // lockState is true if we're using locks 51 lockState bool 52 53 // The index of the last state we wrote. 54 // If this is > 0, Put will perform a CAS to ensure that the state wasn't 55 // changed during the operation. This is important even with locks, because 56 // if the client loses the lock for some reason, then reacquires it, we 57 // need to make sure that the state was not modified. 58 modifyIndex uint64 59 60 consulLock *consulapi.Lock 61 lockCh <-chan struct{} 62 63 info *statemgr.LockInfo 64 65 // cancel our goroutine which is monitoring the lock to automatically 66 // reacquire it when possible. 67 monitorCancel context.CancelFunc 68 monitorWG sync.WaitGroup 69 70 // sessionCancel cancels the Context use for session.RenewPeriodic, and is 71 // called when unlocking, or before creating a new lock if the lock is 72 // lost. 73 sessionCancel context.CancelFunc 74 } 75 76 func (c *RemoteClient) Get() (*remote.Payload, error) { 77 c.mu.Lock() 78 defer c.mu.Unlock() 79 80 kv := c.Client.KV() 81 82 chunked, hash, chunks, pair, err := c.chunkedMode() 83 if err != nil { 84 return nil, err 85 } 86 if pair == nil { 87 return nil, nil 88 } 89 90 c.modifyIndex = pair.ModifyIndex 91 92 var payload []byte 93 if chunked { 94 for _, c := range chunks { 95 pair, _, err := kv.Get(c, nil) 96 if err != nil { 97 return nil, err 98 } 99 if pair == nil { 100 return nil, fmt.Errorf("Key %q could not be found", c) 101 } 102 payload = append(payload, pair.Value[:]...) 103 } 104 } else { 105 payload = pair.Value 106 } 107 108 // If the payload starts with 0x1f, it's gzip, not json 109 if len(payload) >= 1 && payload[0] == '\x1f' { 110 payload, err = uncompressState(payload) 111 if err != nil { 112 return nil, err 113 } 114 } 115 116 md5 := md5.Sum(payload) 117 118 if hash != "" && fmt.Sprintf("%x", md5) != hash { 119 return nil, fmt.Errorf("The remote state does not match the expected hash") 120 } 121 122 return &remote.Payload{ 123 Data: payload, 124 MD5: md5[:], 125 }, nil 126 } 127 128 func (c *RemoteClient) Put(data []byte) error { 129 // The state can be stored in 4 different ways, based on the payload size 130 // and whether the user enabled gzip: 131 // - single entry mode with plain JSON: a single JSON is stored at 132 // "tfstate/my_project" 133 // - single entry mode gzip: the JSON payload is first gziped and stored at 134 // "tfstate/my_project" 135 // - chunked mode with plain JSON: the JSON payload is split in pieces and 136 // stored like so: 137 // - "tfstate/my_project" -> a JSON payload that contains the path of 138 // the chunks and an MD5 sum like so: 139 // { 140 // "current-hash": "abcdef1234", 141 // "chunks": [ 142 // "tfstate/my_project/tfstate.abcdef1234/0", 143 // "tfstate/my_project/tfstate.abcdef1234/1", 144 // "tfstate/my_project/tfstate.abcdef1234/2", 145 // ] 146 // } 147 // - "tfstate/my_project/tfstate.abcdef1234/0" -> The first chunk 148 // - "tfstate/my_project/tfstate.abcdef1234/1" -> The next one 149 // - ... 150 // - chunked mode with gzip: the same system but we gziped the JSON payload 151 // before splitting it in chunks 152 // 153 // When overwritting the current state, we need to clean the old chunks if 154 // we were in chunked mode (no matter whether we need to use chunks for the 155 // new one). To do so based on the 4 possibilities above we look at the 156 // value at "tfstate/my_project" and if it is: 157 // - absent then it's a new state and there will be nothing to cleanup, 158 // - not a JSON payload we were in single entry mode with gzip so there will 159 // be nothing to cleanup 160 // - a JSON payload, then we were either single entry mode with plain JSON 161 // or in chunked mode. To differentiate between the two we look whether a 162 // "current-hash" key is present in the payload. If we find one we were 163 // in chunked mode and we will need to remove the old chunks (whether or 164 // not we were using gzip does not matter in that case). 165 166 c.mu.Lock() 167 defer c.mu.Unlock() 168 169 kv := c.Client.KV() 170 171 // First we determine what mode we were using and to prepare the cleanup 172 chunked, hash, _, _, err := c.chunkedMode() 173 if err != nil { 174 return err 175 } 176 cleanupOldChunks := func() {} 177 if chunked { 178 cleanupOldChunks = func() { 179 // We ignore all errors that can happen here because we already 180 // saved the new state and there is no way to return a warning to 181 // the user. We may end up with dangling chunks but there is no way 182 // to be sure we won't. 183 path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%s/", hash) 184 kv.DeleteTree(path, nil) 185 } 186 } 187 188 payload := data 189 if c.GZip { 190 if compressedState, err := compressState(data); err == nil { 191 payload = compressedState 192 } else { 193 return err 194 } 195 } 196 197 // default to doing a CAS 198 verb := consulapi.KVCAS 199 200 // Assume a 0 index doesn't need a CAS for now, since we are either 201 // creating a new state or purposely overwriting one. 202 if c.modifyIndex == 0 { 203 verb = consulapi.KVSet 204 } 205 206 // The payload may be too large to store in a single KV entry in Consul. We 207 // could try to determine whether it will fit or not before sending the 208 // request but since we are using the Transaction API and not the KV API, 209 // it grows by about a 1/3 when it is base64 encoded plus the overhead of 210 // the fields specific to the Transaction API. 211 // Rather than trying to calculate the overhead (which could change from 212 // one version of Consul to another, and between Consul Community Edition 213 // and Consul Enterprise), we try to send the whole state in one request, if 214 // it fails because it is too big we then split it in chunks and send each 215 // chunk separately. 216 // When splitting in chunks, we make each chunk 524288 bits, which is the 217 // default max size for raft. If the user changed it, we still may send 218 // chunks too big and fail but this is not a setting that should be fiddled 219 // with anyway. 220 221 store := func(payload []byte) error { 222 // KV.Put doesn't return the new index, so we use a single operation 223 // transaction to get the new index with a single request. 224 txOps := consulapi.KVTxnOps{ 225 &consulapi.KVTxnOp{ 226 Verb: verb, 227 Key: c.Path, 228 Value: payload, 229 Index: c.modifyIndex, 230 }, 231 } 232 233 ok, resp, _, err := kv.Txn(txOps, nil) 234 if err != nil { 235 return err 236 } 237 // transaction was rolled back 238 if !ok { 239 var resultErr error 240 for _, respError := range resp.Errors { 241 resultErr = multierror.Append(resultErr, errors.New(respError.What)) 242 } 243 return fmt.Errorf("consul CAS failed with transaction errors: %w", resultErr) 244 } 245 246 if len(resp.Results) != 1 { 247 // this probably shouldn't happen 248 return fmt.Errorf("expected on 1 response value, got: %d", len(resp.Results)) 249 } 250 251 c.modifyIndex = resp.Results[0].ModifyIndex 252 253 // We remove all the old chunks 254 cleanupOldChunks() 255 256 return nil 257 } 258 259 if err = store(payload); err == nil { 260 // The payload was small enough to be stored 261 return nil 262 } else if !strings.Contains(err.Error(), "too large") { 263 // We failed for some other reason, report this to the user 264 return err 265 } 266 267 // The payload was too large so we split it in multiple chunks 268 269 md5 := md5.Sum(data) 270 chunks := split(payload, 524288) 271 chunkPaths := make([]string, 0) 272 273 // First we write the new chunks 274 for i, p := range chunks { 275 path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%x/%d", md5, i) 276 chunkPaths = append(chunkPaths, path) 277 _, err := kv.Put(&consulapi.KVPair{ 278 Key: path, 279 Value: p, 280 }, nil) 281 282 if err != nil { 283 return err 284 } 285 } 286 287 // Then we update the link to point to the new chunks 288 payload, err = json.Marshal(map[string]interface{}{ 289 "current-hash": fmt.Sprintf("%x", md5), 290 "chunks": chunkPaths, 291 }) 292 if err != nil { 293 return err 294 } 295 return store(payload) 296 } 297 298 func (c *RemoteClient) Delete() error { 299 c.mu.Lock() 300 defer c.mu.Unlock() 301 302 kv := c.Client.KV() 303 304 chunked, hash, _, _, err := c.chunkedMode() 305 if err != nil { 306 return err 307 } 308 309 _, err = kv.Delete(c.Path, nil) 310 311 // If there were chunks we need to remove them 312 if chunked { 313 path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%s/", hash) 314 kv.DeleteTree(path, nil) 315 } 316 317 return err 318 } 319 320 func (c *RemoteClient) lockPath() string { 321 // we sanitize the path for the lock as Consul does not like having 322 // two consecutive slashes for the lock path 323 return strings.TrimRight(c.Path, "/") 324 } 325 326 func (c *RemoteClient) putLockInfo(info *statemgr.LockInfo) error { 327 info.Path = c.Path 328 info.Created = time.Now().UTC() 329 330 kv := c.Client.KV() 331 _, err := kv.Put(&consulapi.KVPair{ 332 Key: c.lockPath() + lockInfoSuffix, 333 Value: info.Marshal(), 334 }, nil) 335 336 return err 337 } 338 339 func (c *RemoteClient) getLockInfo() (*statemgr.LockInfo, error) { 340 path := c.lockPath() + lockInfoSuffix 341 pair, _, err := c.Client.KV().Get(path, nil) 342 if err != nil { 343 return nil, err 344 } 345 if pair == nil { 346 return nil, nil 347 } 348 349 li := &statemgr.LockInfo{} 350 err = json.Unmarshal(pair.Value, li) 351 if err != nil { 352 return nil, fmt.Errorf("error unmarshaling lock info: %w", err) 353 } 354 355 return li, nil 356 } 357 358 func (c *RemoteClient) Lock(info *statemgr.LockInfo) (string, error) { 359 c.mu.Lock() 360 defer c.mu.Unlock() 361 362 if !c.lockState { 363 return "", nil 364 } 365 366 c.info = info 367 368 // These checks only are to ensure we strictly follow the specification. 369 // OpenTofu shouldn't ever re-lock, so provide errors for the 2 possible 370 // states if this is called. 371 select { 372 case <-c.lockCh: 373 // We had a lock, but lost it. 374 return "", errors.New("lost consul lock, cannot re-lock") 375 default: 376 if c.lockCh != nil { 377 // we have an active lock already 378 return "", fmt.Errorf("state %q already locked", c.Path) 379 } 380 } 381 382 return c.lock() 383 } 384 385 // the lock implementation. 386 // Only to be called while holding Client.mu 387 func (c *RemoteClient) lock() (string, error) { 388 // We create a new session here, so it can be canceled when the lock is 389 // lost or unlocked. 390 lockSession, err := c.createSession() 391 if err != nil { 392 return "", err 393 } 394 395 // store the session ID for correlation with consul logs 396 c.info.Info = "consul session: " + lockSession 397 398 // A random lock ID has been generated but we override it with the session 399 // ID as this will make it easier to manually invalidate the session 400 // if needed. 401 c.info.ID = lockSession 402 403 opts := &consulapi.LockOptions{ 404 Key: c.lockPath() + lockSuffix, 405 Session: lockSession, 406 407 // only wait briefly, so tofu has the choice to fail fast or 408 // retry as needed. 409 LockWaitTime: time.Second, 410 LockTryOnce: true, 411 412 // Don't let the lock monitor give up right away, as it's possible the 413 // session is still OK. While the session is refreshed at a rate of 414 // TTL/2, the lock monitor is an idle blocking request and is more 415 // susceptible to being closed by a lower network layer. 416 MonitorRetries: 5, 417 // 418 // The delay between lock monitor retries. 419 // While the session has a 15s TTL plus a 5s wait period on a lost 420 // lock, if we can't get our lock back in 10+ seconds something is 421 // wrong so we're going to drop the session and start over. 422 MonitorRetryTime: 2 * time.Second, 423 } 424 425 c.consulLock, err = c.Client.LockOpts(opts) 426 if err != nil { 427 return "", err 428 } 429 430 lockErr := &statemgr.LockError{} 431 432 lockCh, err := c.consulLock.Lock(make(chan struct{})) 433 if err != nil { 434 lockErr.Err = err 435 return "", lockErr 436 } 437 438 if lockCh == nil { 439 lockInfo, e := c.getLockInfo() 440 if e != nil { 441 lockErr.Err = e 442 return "", lockErr 443 } 444 445 lockErr.Info = lockInfo 446 447 return "", lockErr 448 } 449 450 c.lockCh = lockCh 451 452 err = c.putLockInfo(c.info) 453 if err != nil { 454 if unlockErr := c.unlock(c.info.ID); unlockErr != nil { 455 err = multierror.Append(err, unlockErr) 456 } 457 458 return "", err 459 } 460 461 // Start a goroutine to monitor the lock state. 462 // If we lose the lock to due communication issues with the consul agent, 463 // attempt to immediately reacquire the lock. Put will verify the integrity 464 // of the state by using a CAS operation. 465 ctx, cancel := context.WithCancel(context.Background()) 466 c.monitorCancel = cancel 467 c.monitorWG.Add(1) 468 go func() { 469 defer c.monitorWG.Done() 470 select { 471 case <-c.lockCh: 472 log.Println("[ERROR] lost consul lock") 473 for { 474 c.mu.Lock() 475 // We lost our lock, so we need to cancel the session too. 476 // The CancelFunc is only replaced while holding Client.mu, so 477 // this is safe to call here. This will be replaced by the 478 // lock() call below. 479 c.sessionCancel() 480 481 c.consulLock = nil 482 _, err := c.lock() 483 c.mu.Unlock() 484 485 if err != nil { 486 // We failed to get the lock, keep trying as long as 487 // tofu is running. There may be changes in progress, 488 // so there's no use in aborting. Either we eventually 489 // reacquire the lock, or a Put will fail on a CAS. 490 log.Printf("[ERROR] could not reacquire lock: %s", err) 491 time.Sleep(lockReacquireInterval) 492 493 select { 494 case <-ctx.Done(): 495 return 496 default: 497 } 498 continue 499 } 500 501 // if the error was nil, the new lock started a new copy of 502 // this goroutine. 503 return 504 } 505 506 case <-ctx.Done(): 507 return 508 } 509 }() 510 511 if testLockHook != nil { 512 testLockHook() 513 } 514 515 return c.info.ID, nil 516 } 517 518 // called after a lock is acquired 519 var testLockHook func() 520 521 func (c *RemoteClient) createSession() (string, error) { 522 // create the context first. Even if the session creation fails, we assume 523 // that the CancelFunc is always callable. 524 ctx, cancel := context.WithCancel(context.Background()) 525 c.sessionCancel = cancel 526 527 session := c.Client.Session() 528 se := &consulapi.SessionEntry{ 529 Name: consulapi.DefaultLockSessionName, 530 TTL: lockSessionTTL, 531 LockDelay: lockDelay, 532 } 533 534 id, _, err := session.Create(se, nil) 535 if err != nil { 536 return "", err 537 } 538 539 log.Println("[INFO] created consul lock session", id) 540 541 // keep the session renewed 542 go session.RenewPeriodic(lockSessionTTL, id, nil, ctx.Done()) 543 544 return id, nil 545 } 546 547 func (c *RemoteClient) Unlock(id string) error { 548 c.mu.Lock() 549 defer c.mu.Unlock() 550 551 if !c.lockState { 552 return nil 553 } 554 555 return c.unlock(id) 556 } 557 558 // the unlock implementation. 559 // Only to be called while holding Client.mu 560 func (c *RemoteClient) unlock(id string) error { 561 // This method can be called in two circumstances: 562 // - when the plan apply or destroy operation finishes and the lock needs to be released, 563 // the watchdog stopped and the session closed 564 // - when the user calls `tofu force-unlock <lock_id>` in which case 565 // we only need to release the lock. 566 567 if c.consulLock == nil || c.lockCh == nil { 568 // The user called `tofu force-unlock <lock_id>`, we just destroy 569 // the session which will release the lock, clean the KV store and quit. 570 571 _, err := c.Client.Session().Destroy(id, nil) 572 if err != nil { 573 return err 574 } 575 // We ignore the errors that may happen during cleanup 576 kv := c.Client.KV() 577 kv.Delete(c.lockPath()+lockSuffix, nil) 578 kv.Delete(c.lockPath()+lockInfoSuffix, nil) 579 580 return nil 581 } 582 583 // cancel our monitoring goroutine 584 c.monitorCancel() 585 586 defer func() { 587 c.consulLock = nil 588 589 // The consul session is only used for this single lock, so cancel it 590 // after we unlock. 591 // The session is only created and replaced holding Client.mu, so the 592 // CancelFunc must be non-nil. 593 c.sessionCancel() 594 }() 595 596 select { 597 case <-c.lockCh: 598 return lostLockErr 599 default: 600 } 601 602 kv := c.Client.KV() 603 604 var errs error 605 606 if _, err := kv.Delete(c.lockPath()+lockInfoSuffix, nil); err != nil { 607 errs = multierror.Append(errs, err) 608 } 609 610 if err := c.consulLock.Unlock(); err != nil { 611 errs = multierror.Append(errs, err) 612 } 613 614 // the monitoring goroutine may be in a select on the lockCh, so we need to 615 // wait for it to return before changing the value. 616 c.monitorWG.Wait() 617 c.lockCh = nil 618 619 // This is only cleanup, and will fail if the lock was immediately taken by 620 // another client, so we don't report an error to the user here. 621 c.consulLock.Destroy() 622 623 return errs 624 } 625 626 func compressState(data []byte) ([]byte, error) { 627 b := new(bytes.Buffer) 628 gz := gzip.NewWriter(b) 629 if _, err := gz.Write(data); err != nil { 630 return nil, err 631 } 632 if err := gz.Flush(); err != nil { 633 return nil, err 634 } 635 if err := gz.Close(); err != nil { 636 return nil, err 637 } 638 return b.Bytes(), nil 639 } 640 641 func uncompressState(data []byte) ([]byte, error) { 642 b := new(bytes.Buffer) 643 gz, err := gzip.NewReader(bytes.NewReader(data)) 644 if err != nil { 645 return nil, err 646 } 647 b.ReadFrom(gz) 648 if err := gz.Close(); err != nil { 649 return nil, err 650 } 651 return b.Bytes(), nil 652 } 653 654 func split(payload []byte, limit int) [][]byte { 655 var chunk []byte 656 chunks := make([][]byte, 0, len(payload)/limit+1) 657 for len(payload) >= limit { 658 chunk, payload = payload[:limit], payload[limit:] 659 chunks = append(chunks, chunk) 660 } 661 if len(payload) > 0 { 662 chunks = append(chunks, payload[:]) 663 } 664 return chunks 665 } 666 667 func (c *RemoteClient) chunkedMode() (bool, string, []string, *consulapi.KVPair, error) { 668 kv := c.Client.KV() 669 pair, _, err := kv.Get(c.Path, nil) 670 if err != nil { 671 return false, "", nil, pair, err 672 } 673 if pair != nil { 674 var d map[string]interface{} 675 err = json.Unmarshal(pair.Value, &d) 676 // If there is an error when unmarshaling the payload, the state has 677 // probably been gziped in single entry mode. 678 if err == nil { 679 // If we find the "current-hash" key we were in chunked mode 680 hash, ok := d["current-hash"] 681 if ok { 682 chunks := make([]string, 0) 683 for _, c := range d["chunks"].([]interface{}) { 684 chunks = append(chunks, c.(string)) 685 } 686 return true, hash.(string), chunks, pair, nil 687 } 688 } 689 } 690 return false, "", nil, pair, nil 691 }