github.com/graywolf-at-work-2/terraform-vendor@v1.4.5/internal/backend/remote-state/consul/client.go (about) 1 package consul 2 3 import ( 4 "bytes" 5 "compress/gzip" 6 "context" 7 "crypto/md5" 8 "encoding/json" 9 "errors" 10 "fmt" 11 "log" 12 "strings" 13 "sync" 14 "time" 15 16 consulapi "github.com/hashicorp/consul/api" 17 multierror "github.com/hashicorp/go-multierror" 18 "github.com/hashicorp/terraform/internal/states/remote" 19 "github.com/hashicorp/terraform/internal/states/statemgr" 20 ) 21 22 const ( 23 lockSuffix = "/.lock" 24 lockInfoSuffix = "/.lockinfo" 25 26 // The Session TTL associated with this lock. 27 lockSessionTTL = "15s" 28 29 // the delay time from when a session is lost to when the 30 // lock is released by the server 31 lockDelay = 5 * time.Second 32 // interval between attempts to reacquire a lost lock 33 lockReacquireInterval = 2 * time.Second 34 ) 35 36 var lostLockErr = errors.New("consul lock was lost") 37 38 // RemoteClient is a remote client that stores data in Consul. 39 type RemoteClient struct { 40 Client *consulapi.Client 41 Path string 42 GZip bool 43 44 mu sync.Mutex 45 // lockState is true if we're using locks 46 lockState bool 47 48 // The index of the last state we wrote. 49 // If this is > 0, Put will perform a CAS to ensure that the state wasn't 50 // changed during the operation. This is important even with locks, because 51 // if the client loses the lock for some reason, then reacquires it, we 52 // need to make sure that the state was not modified. 53 modifyIndex uint64 54 55 consulLock *consulapi.Lock 56 lockCh <-chan struct{} 57 58 info *statemgr.LockInfo 59 60 // cancel our goroutine which is monitoring the lock to automatically 61 // reacquire it when possible. 62 monitorCancel context.CancelFunc 63 monitorWG sync.WaitGroup 64 65 // sessionCancel cancels the Context use for session.RenewPeriodic, and is 66 // called when unlocking, or before creating a new lock if the lock is 67 // lost. 68 sessionCancel context.CancelFunc 69 } 70 71 func (c *RemoteClient) Get() (*remote.Payload, error) { 72 c.mu.Lock() 73 defer c.mu.Unlock() 74 75 kv := c.Client.KV() 76 77 chunked, hash, chunks, pair, err := c.chunkedMode() 78 if err != nil { 79 return nil, err 80 } 81 if pair == nil { 82 return nil, nil 83 } 84 85 c.modifyIndex = pair.ModifyIndex 86 87 var payload []byte 88 if chunked { 89 for _, c := range chunks { 90 pair, _, err := kv.Get(c, nil) 91 if err != nil { 92 return nil, err 93 } 94 if pair == nil { 95 return nil, fmt.Errorf("Key %q could not be found", c) 96 } 97 payload = append(payload, pair.Value[:]...) 98 } 99 } else { 100 payload = pair.Value 101 } 102 103 // If the payload starts with 0x1f, it's gzip, not json 104 if len(payload) >= 1 && payload[0] == '\x1f' { 105 payload, err = uncompressState(payload) 106 if err != nil { 107 return nil, err 108 } 109 } 110 111 md5 := md5.Sum(payload) 112 113 if hash != "" && fmt.Sprintf("%x", md5) != hash { 114 return nil, fmt.Errorf("The remote state does not match the expected hash") 115 } 116 117 return &remote.Payload{ 118 Data: payload, 119 MD5: md5[:], 120 }, nil 121 } 122 123 func (c *RemoteClient) Put(data []byte) error { 124 // The state can be stored in 4 different ways, based on the payload size 125 // and whether the user enabled gzip: 126 // - single entry mode with plain JSON: a single JSON is stored at 127 // "tfstate/my_project" 128 // - single entry mode gzip: the JSON payload is first gziped and stored at 129 // "tfstate/my_project" 130 // - chunked mode with plain JSON: the JSON payload is split in pieces and 131 // stored like so: 132 // - "tfstate/my_project" -> a JSON payload that contains the path of 133 // the chunks and an MD5 sum like so: 134 // { 135 // "current-hash": "abcdef1234", 136 // "chunks": [ 137 // "tfstate/my_project/tfstate.abcdef1234/0", 138 // "tfstate/my_project/tfstate.abcdef1234/1", 139 // "tfstate/my_project/tfstate.abcdef1234/2", 140 // ] 141 // } 142 // - "tfstate/my_project/tfstate.abcdef1234/0" -> The first chunk 143 // - "tfstate/my_project/tfstate.abcdef1234/1" -> The next one 144 // - ... 145 // - chunked mode with gzip: the same system but we gziped the JSON payload 146 // before splitting it in chunks 147 // 148 // When overwritting the current state, we need to clean the old chunks if 149 // we were in chunked mode (no matter whether we need to use chunks for the 150 // new one). To do so based on the 4 possibilities above we look at the 151 // value at "tfstate/my_project" and if it is: 152 // - absent then it's a new state and there will be nothing to cleanup, 153 // - not a JSON payload we were in single entry mode with gzip so there will 154 // be nothing to cleanup 155 // - a JSON payload, then we were either single entry mode with plain JSON 156 // or in chunked mode. To differentiate between the two we look whether a 157 // "current-hash" key is present in the payload. If we find one we were 158 // in chunked mode and we will need to remove the old chunks (whether or 159 // not we were using gzip does not matter in that case). 160 161 c.mu.Lock() 162 defer c.mu.Unlock() 163 164 kv := c.Client.KV() 165 166 // First we determine what mode we were using and to prepare the cleanup 167 chunked, hash, _, _, err := c.chunkedMode() 168 if err != nil { 169 return err 170 } 171 cleanupOldChunks := func() {} 172 if chunked { 173 cleanupOldChunks = func() { 174 // We ignore all errors that can happen here because we already 175 // saved the new state and there is no way to return a warning to 176 // the user. We may end up with dangling chunks but there is no way 177 // to be sure we won't. 178 path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%s/", hash) 179 kv.DeleteTree(path, nil) 180 } 181 } 182 183 payload := data 184 if c.GZip { 185 if compressedState, err := compressState(data); err == nil { 186 payload = compressedState 187 } else { 188 return err 189 } 190 } 191 192 // default to doing a CAS 193 verb := consulapi.KVCAS 194 195 // Assume a 0 index doesn't need a CAS for now, since we are either 196 // creating a new state or purposely overwriting one. 197 if c.modifyIndex == 0 { 198 verb = consulapi.KVSet 199 } 200 201 // The payload may be too large to store in a single KV entry in Consul. We 202 // could try to determine whether it will fit or not before sending the 203 // request but since we are using the Transaction API and not the KV API, 204 // it grows by about a 1/3 when it is base64 encoded plus the overhead of 205 // the fields specific to the Transaction API. 206 // Rather than trying to calculate the overhead (which could change from 207 // one version of Consul to another, and between Consul Community Edition 208 // and Consul Enterprise), we try to send the whole state in one request, if 209 // it fails because it is too big we then split it in chunks and send each 210 // chunk separately. 211 // When splitting in chunks, we make each chunk 524288 bits, which is the 212 // default max size for raft. If the user changed it, we still may send 213 // chunks too big and fail but this is not a setting that should be fiddled 214 // with anyway. 215 216 store := func(payload []byte) error { 217 // KV.Put doesn't return the new index, so we use a single operation 218 // transaction to get the new index with a single request. 219 txOps := consulapi.KVTxnOps{ 220 &consulapi.KVTxnOp{ 221 Verb: verb, 222 Key: c.Path, 223 Value: payload, 224 Index: c.modifyIndex, 225 }, 226 } 227 228 ok, resp, _, err := kv.Txn(txOps, nil) 229 if err != nil { 230 return err 231 } 232 // transaction was rolled back 233 if !ok { 234 return fmt.Errorf("consul CAS failed with transaction errors: %v", resp.Errors) 235 } 236 237 if len(resp.Results) != 1 { 238 // this probably shouldn't happen 239 return fmt.Errorf("expected on 1 response value, got: %d", len(resp.Results)) 240 } 241 242 c.modifyIndex = resp.Results[0].ModifyIndex 243 244 // We remove all the old chunks 245 cleanupOldChunks() 246 247 return nil 248 } 249 250 if err = store(payload); err == nil { 251 // The payload was small enough to be stored 252 return nil 253 } else if !strings.Contains(err.Error(), "too large") { 254 // We failed for some other reason, report this to the user 255 return err 256 } 257 258 // The payload was too large so we split it in multiple chunks 259 260 md5 := md5.Sum(data) 261 chunks := split(payload, 524288) 262 chunkPaths := make([]string, 0) 263 264 // First we write the new chunks 265 for i, p := range chunks { 266 path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%x/%d", md5, i) 267 chunkPaths = append(chunkPaths, path) 268 _, err := kv.Put(&consulapi.KVPair{ 269 Key: path, 270 Value: p, 271 }, nil) 272 273 if err != nil { 274 return err 275 } 276 } 277 278 // Then we update the link to point to the new chunks 279 payload, err = json.Marshal(map[string]interface{}{ 280 "current-hash": fmt.Sprintf("%x", md5), 281 "chunks": chunkPaths, 282 }) 283 if err != nil { 284 return err 285 } 286 return store(payload) 287 } 288 289 func (c *RemoteClient) Delete() error { 290 c.mu.Lock() 291 defer c.mu.Unlock() 292 293 kv := c.Client.KV() 294 295 chunked, hash, _, _, err := c.chunkedMode() 296 if err != nil { 297 return err 298 } 299 300 _, err = kv.Delete(c.Path, nil) 301 302 // If there were chunks we need to remove them 303 if chunked { 304 path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%s/", hash) 305 kv.DeleteTree(path, nil) 306 } 307 308 return err 309 } 310 311 func (c *RemoteClient) lockPath() string { 312 // we sanitize the path for the lock as Consul does not like having 313 // two consecutive slashes for the lock path 314 return strings.TrimRight(c.Path, "/") 315 } 316 317 func (c *RemoteClient) putLockInfo(info *statemgr.LockInfo) error { 318 info.Path = c.Path 319 info.Created = time.Now().UTC() 320 321 kv := c.Client.KV() 322 _, err := kv.Put(&consulapi.KVPair{ 323 Key: c.lockPath() + lockInfoSuffix, 324 Value: info.Marshal(), 325 }, nil) 326 327 return err 328 } 329 330 func (c *RemoteClient) getLockInfo() (*statemgr.LockInfo, error) { 331 path := c.lockPath() + lockInfoSuffix 332 pair, _, err := c.Client.KV().Get(path, nil) 333 if err != nil { 334 return nil, err 335 } 336 if pair == nil { 337 return nil, nil 338 } 339 340 li := &statemgr.LockInfo{} 341 err = json.Unmarshal(pair.Value, li) 342 if err != nil { 343 return nil, fmt.Errorf("error unmarshaling lock info: %s", err) 344 } 345 346 return li, nil 347 } 348 349 func (c *RemoteClient) Lock(info *statemgr.LockInfo) (string, error) { 350 c.mu.Lock() 351 defer c.mu.Unlock() 352 353 if !c.lockState { 354 return "", nil 355 } 356 357 c.info = info 358 359 // These checks only are to ensure we strictly follow the specification. 360 // Terraform shouldn't ever re-lock, so provide errors for the 2 possible 361 // states if this is called. 362 select { 363 case <-c.lockCh: 364 // We had a lock, but lost it. 365 return "", errors.New("lost consul lock, cannot re-lock") 366 default: 367 if c.lockCh != nil { 368 // we have an active lock already 369 return "", fmt.Errorf("state %q already locked", c.Path) 370 } 371 } 372 373 return c.lock() 374 } 375 376 // the lock implementation. 377 // Only to be called while holding Client.mu 378 func (c *RemoteClient) lock() (string, error) { 379 // We create a new session here, so it can be canceled when the lock is 380 // lost or unlocked. 381 lockSession, err := c.createSession() 382 if err != nil { 383 return "", err 384 } 385 386 // store the session ID for correlation with consul logs 387 c.info.Info = "consul session: " + lockSession 388 389 // A random lock ID has been generated but we override it with the session 390 // ID as this will make it easier to manually invalidate the session 391 // if needed. 392 c.info.ID = lockSession 393 394 opts := &consulapi.LockOptions{ 395 Key: c.lockPath() + lockSuffix, 396 Session: lockSession, 397 398 // only wait briefly, so terraform has the choice to fail fast or 399 // retry as needed. 400 LockWaitTime: time.Second, 401 LockTryOnce: true, 402 403 // Don't let the lock monitor give up right away, as it's possible the 404 // session is still OK. While the session is refreshed at a rate of 405 // TTL/2, the lock monitor is an idle blocking request and is more 406 // susceptible to being closed by a lower network layer. 407 MonitorRetries: 5, 408 // 409 // The delay between lock monitor retries. 410 // While the session has a 15s TTL plus a 5s wait period on a lost 411 // lock, if we can't get our lock back in 10+ seconds something is 412 // wrong so we're going to drop the session and start over. 413 MonitorRetryTime: 2 * time.Second, 414 } 415 416 c.consulLock, err = c.Client.LockOpts(opts) 417 if err != nil { 418 return "", err 419 } 420 421 lockErr := &statemgr.LockError{} 422 423 lockCh, err := c.consulLock.Lock(make(chan struct{})) 424 if err != nil { 425 lockErr.Err = err 426 return "", lockErr 427 } 428 429 if lockCh == nil { 430 lockInfo, e := c.getLockInfo() 431 if e != nil { 432 lockErr.Err = e 433 return "", lockErr 434 } 435 436 lockErr.Info = lockInfo 437 438 return "", lockErr 439 } 440 441 c.lockCh = lockCh 442 443 err = c.putLockInfo(c.info) 444 if err != nil { 445 if unlockErr := c.unlock(c.info.ID); unlockErr != nil { 446 err = multierror.Append(err, unlockErr) 447 } 448 449 return "", err 450 } 451 452 // Start a goroutine to monitor the lock state. 453 // If we lose the lock to due communication issues with the consul agent, 454 // attempt to immediately reacquire the lock. Put will verify the integrity 455 // of the state by using a CAS operation. 456 ctx, cancel := context.WithCancel(context.Background()) 457 c.monitorCancel = cancel 458 c.monitorWG.Add(1) 459 go func() { 460 defer c.monitorWG.Done() 461 select { 462 case <-c.lockCh: 463 log.Println("[ERROR] lost consul lock") 464 for { 465 c.mu.Lock() 466 // We lost our lock, so we need to cancel the session too. 467 // The CancelFunc is only replaced while holding Client.mu, so 468 // this is safe to call here. This will be replaced by the 469 // lock() call below. 470 c.sessionCancel() 471 472 c.consulLock = nil 473 _, err := c.lock() 474 c.mu.Unlock() 475 476 if err != nil { 477 // We failed to get the lock, keep trying as long as 478 // terraform is running. There may be changes in progress, 479 // so there's no use in aborting. Either we eventually 480 // reacquire the lock, or a Put will fail on a CAS. 481 log.Printf("[ERROR] could not reacquire lock: %s", err) 482 time.Sleep(lockReacquireInterval) 483 484 select { 485 case <-ctx.Done(): 486 return 487 default: 488 } 489 continue 490 } 491 492 // if the error was nil, the new lock started a new copy of 493 // this goroutine. 494 return 495 } 496 497 case <-ctx.Done(): 498 return 499 } 500 }() 501 502 if testLockHook != nil { 503 testLockHook() 504 } 505 506 return c.info.ID, nil 507 } 508 509 // called after a lock is acquired 510 var testLockHook func() 511 512 func (c *RemoteClient) createSession() (string, error) { 513 // create the context first. Even if the session creation fails, we assume 514 // that the CancelFunc is always callable. 515 ctx, cancel := context.WithCancel(context.Background()) 516 c.sessionCancel = cancel 517 518 session := c.Client.Session() 519 se := &consulapi.SessionEntry{ 520 Name: consulapi.DefaultLockSessionName, 521 TTL: lockSessionTTL, 522 LockDelay: lockDelay, 523 } 524 525 id, _, err := session.Create(se, nil) 526 if err != nil { 527 return "", err 528 } 529 530 log.Println("[INFO] created consul lock session", id) 531 532 // keep the session renewed 533 go session.RenewPeriodic(lockSessionTTL, id, nil, ctx.Done()) 534 535 return id, nil 536 } 537 538 func (c *RemoteClient) Unlock(id string) error { 539 c.mu.Lock() 540 defer c.mu.Unlock() 541 542 if !c.lockState { 543 return nil 544 } 545 546 return c.unlock(id) 547 } 548 549 // the unlock implementation. 550 // Only to be called while holding Client.mu 551 func (c *RemoteClient) unlock(id string) error { 552 // This method can be called in two circumstances: 553 // - when the plan apply or destroy operation finishes and the lock needs to be released, 554 // the watchdog stopped and the session closed 555 // - when the user calls `terraform force-unlock <lock_id>` in which case 556 // we only need to release the lock. 557 558 if c.consulLock == nil || c.lockCh == nil { 559 // The user called `terraform force-unlock <lock_id>`, we just destroy 560 // the session which will release the lock, clean the KV store and quit. 561 562 _, err := c.Client.Session().Destroy(id, nil) 563 if err != nil { 564 return err 565 } 566 // We ignore the errors that may happen during cleanup 567 kv := c.Client.KV() 568 kv.Delete(c.lockPath()+lockSuffix, nil) 569 kv.Delete(c.lockPath()+lockInfoSuffix, nil) 570 571 return nil 572 } 573 574 // cancel our monitoring goroutine 575 c.monitorCancel() 576 577 defer func() { 578 c.consulLock = nil 579 580 // The consul session is only used for this single lock, so cancel it 581 // after we unlock. 582 // The session is only created and replaced holding Client.mu, so the 583 // CancelFunc must be non-nil. 584 c.sessionCancel() 585 }() 586 587 select { 588 case <-c.lockCh: 589 return lostLockErr 590 default: 591 } 592 593 kv := c.Client.KV() 594 595 var errs error 596 597 if _, err := kv.Delete(c.lockPath()+lockInfoSuffix, nil); err != nil { 598 errs = multierror.Append(errs, err) 599 } 600 601 if err := c.consulLock.Unlock(); err != nil { 602 errs = multierror.Append(errs, err) 603 } 604 605 // the monitoring goroutine may be in a select on the lockCh, so we need to 606 // wait for it to return before changing the value. 607 c.monitorWG.Wait() 608 c.lockCh = nil 609 610 // This is only cleanup, and will fail if the lock was immediately taken by 611 // another client, so we don't report an error to the user here. 612 c.consulLock.Destroy() 613 614 return errs 615 } 616 617 func compressState(data []byte) ([]byte, error) { 618 b := new(bytes.Buffer) 619 gz := gzip.NewWriter(b) 620 if _, err := gz.Write(data); err != nil { 621 return nil, err 622 } 623 if err := gz.Flush(); err != nil { 624 return nil, err 625 } 626 if err := gz.Close(); err != nil { 627 return nil, err 628 } 629 return b.Bytes(), nil 630 } 631 632 func uncompressState(data []byte) ([]byte, error) { 633 b := new(bytes.Buffer) 634 gz, err := gzip.NewReader(bytes.NewReader(data)) 635 if err != nil { 636 return nil, err 637 } 638 b.ReadFrom(gz) 639 if err := gz.Close(); err != nil { 640 return nil, err 641 } 642 return b.Bytes(), nil 643 } 644 645 func split(payload []byte, limit int) [][]byte { 646 var chunk []byte 647 chunks := make([][]byte, 0, len(payload)/limit+1) 648 for len(payload) >= limit { 649 chunk, payload = payload[:limit], payload[limit:] 650 chunks = append(chunks, chunk) 651 } 652 if len(payload) > 0 { 653 chunks = append(chunks, payload[:]) 654 } 655 return chunks 656 } 657 658 func (c *RemoteClient) chunkedMode() (bool, string, []string, *consulapi.KVPair, error) { 659 kv := c.Client.KV() 660 pair, _, err := kv.Get(c.Path, nil) 661 if err != nil { 662 return false, "", nil, pair, err 663 } 664 if pair != nil { 665 var d map[string]interface{} 666 err = json.Unmarshal(pair.Value, &d) 667 // If there is an error when unmarshaling the payload, the state has 668 // probably been gziped in single entry mode. 669 if err == nil { 670 // If we find the "current-hash" key we were in chunked mode 671 hash, ok := d["current-hash"] 672 if ok { 673 chunks := make([]string, 0) 674 for _, c := range d["chunks"].([]interface{}) { 675 chunks = append(chunks, c.(string)) 676 } 677 return true, hash.(string), chunks, pair, nil 678 } 679 } 680 } 681 return false, "", nil, pair, nil 682 }