github.com/opentofu/opentofu@v1.7.1/internal/backend/remote-state/consul/client.go (about)

     1  // Copyright (c) The OpenTofu Authors
     2  // SPDX-License-Identifier: MPL-2.0
     3  // Copyright (c) 2023 HashiCorp, Inc.
     4  // SPDX-License-Identifier: MPL-2.0
     5  
     6  package consul
     7  
     8  import (
     9  	"bytes"
    10  	"compress/gzip"
    11  	"context"
    12  	"crypto/md5"
    13  	"encoding/json"
    14  	"errors"
    15  	"fmt"
    16  	"log"
    17  	"strings"
    18  	"sync"
    19  	"time"
    20  
    21  	consulapi "github.com/hashicorp/consul/api"
    22  	multierror "github.com/hashicorp/go-multierror"
    23  	"github.com/opentofu/opentofu/internal/states/remote"
    24  	"github.com/opentofu/opentofu/internal/states/statemgr"
    25  )
    26  
    27  const (
    28  	lockSuffix     = "/.lock"
    29  	lockInfoSuffix = "/.lockinfo"
    30  
    31  	// The Session TTL associated with this lock.
    32  	lockSessionTTL = "15s"
    33  
    34  	// the delay time from when a session is lost to when the
    35  	// lock is released by the server
    36  	lockDelay = 5 * time.Second
    37  	// interval between attempts to reacquire a lost lock
    38  	lockReacquireInterval = 2 * time.Second
    39  )
    40  
    41  var lostLockErr = errors.New("consul lock was lost")
    42  
    43  // RemoteClient is a remote client that stores data in Consul.
    44  type RemoteClient struct {
    45  	Client *consulapi.Client
    46  	Path   string
    47  	GZip   bool
    48  
    49  	mu sync.Mutex
    50  	// lockState is true if we're using locks
    51  	lockState bool
    52  
    53  	// The index of the last state we wrote.
    54  	// If this is > 0, Put will perform a CAS to ensure that the state wasn't
    55  	// changed during the operation. This is important even with locks, because
    56  	// if the client loses the lock for some reason, then reacquires it, we
    57  	// need to make sure that the state was not modified.
    58  	modifyIndex uint64
    59  
    60  	consulLock *consulapi.Lock
    61  	lockCh     <-chan struct{}
    62  
    63  	info *statemgr.LockInfo
    64  
    65  	// cancel our goroutine which is monitoring the lock to automatically
    66  	// reacquire it when possible.
    67  	monitorCancel context.CancelFunc
    68  	monitorWG     sync.WaitGroup
    69  
    70  	// sessionCancel cancels the Context use for session.RenewPeriodic, and is
    71  	// called when unlocking, or before creating a new lock if the lock is
    72  	// lost.
    73  	sessionCancel context.CancelFunc
    74  }
    75  
    76  func (c *RemoteClient) Get() (*remote.Payload, error) {
    77  	c.mu.Lock()
    78  	defer c.mu.Unlock()
    79  
    80  	kv := c.Client.KV()
    81  
    82  	chunked, hash, chunks, pair, err := c.chunkedMode()
    83  	if err != nil {
    84  		return nil, err
    85  	}
    86  	if pair == nil {
    87  		return nil, nil
    88  	}
    89  
    90  	c.modifyIndex = pair.ModifyIndex
    91  
    92  	var payload []byte
    93  	if chunked {
    94  		for _, c := range chunks {
    95  			pair, _, err := kv.Get(c, nil)
    96  			if err != nil {
    97  				return nil, err
    98  			}
    99  			if pair == nil {
   100  				return nil, fmt.Errorf("Key %q could not be found", c)
   101  			}
   102  			payload = append(payload, pair.Value[:]...)
   103  		}
   104  	} else {
   105  		payload = pair.Value
   106  	}
   107  
   108  	// If the payload starts with 0x1f, it's gzip, not json
   109  	if len(payload) >= 1 && payload[0] == '\x1f' {
   110  		payload, err = uncompressState(payload)
   111  		if err != nil {
   112  			return nil, err
   113  		}
   114  	}
   115  
   116  	md5 := md5.Sum(payload)
   117  
   118  	if hash != "" && fmt.Sprintf("%x", md5) != hash {
   119  		return nil, fmt.Errorf("The remote state does not match the expected hash")
   120  	}
   121  
   122  	return &remote.Payload{
   123  		Data: payload,
   124  		MD5:  md5[:],
   125  	}, nil
   126  }
   127  
   128  func (c *RemoteClient) Put(data []byte) error {
   129  	// The state can be stored in 4 different ways, based on the payload size
   130  	// and whether the user enabled gzip:
   131  	//  - single entry mode with plain JSON: a single JSON is stored at
   132  	//	  "tfstate/my_project"
   133  	//  - single entry mode gzip: the JSON payload is first gziped and stored at
   134  	//    "tfstate/my_project"
   135  	//  - chunked mode with plain JSON: the JSON payload is split in pieces and
   136  	//    stored like so:
   137  	//       - "tfstate/my_project" -> a JSON payload that contains the path of
   138  	//         the chunks and an MD5 sum like so:
   139  	//              {
   140  	//              	"current-hash": "abcdef1234",
   141  	//              	"chunks": [
   142  	//              		"tfstate/my_project/tfstate.abcdef1234/0",
   143  	//              		"tfstate/my_project/tfstate.abcdef1234/1",
   144  	//              		"tfstate/my_project/tfstate.abcdef1234/2",
   145  	//              	]
   146  	//              }
   147  	//       - "tfstate/my_project/tfstate.abcdef1234/0" -> The first chunk
   148  	//       - "tfstate/my_project/tfstate.abcdef1234/1" -> The next one
   149  	//       - ...
   150  	//  - chunked mode with gzip: the same system but we gziped the JSON payload
   151  	//    before splitting it in chunks
   152  	//
   153  	// When overwritting the current state, we need to clean the old chunks if
   154  	// we were in chunked mode (no matter whether we need to use chunks for the
   155  	// new one). To do so based on the 4 possibilities above we look at the
   156  	// value at "tfstate/my_project" and if it is:
   157  	//  - absent then it's a new state and there will be nothing to cleanup,
   158  	//  - not a JSON payload we were in single entry mode with gzip so there will
   159  	// 	  be nothing to cleanup
   160  	//  - a JSON payload, then we were either single entry mode with plain JSON
   161  	//    or in chunked mode. To differentiate between the two we look whether a
   162  	//    "current-hash" key is present in the payload. If we find one we were
   163  	//    in chunked mode and we will need to remove the old chunks (whether or
   164  	//    not we were using gzip does not matter in that case).
   165  
   166  	c.mu.Lock()
   167  	defer c.mu.Unlock()
   168  
   169  	kv := c.Client.KV()
   170  
   171  	// First we determine what mode we were using and to prepare the cleanup
   172  	chunked, hash, _, _, err := c.chunkedMode()
   173  	if err != nil {
   174  		return err
   175  	}
   176  	cleanupOldChunks := func() {}
   177  	if chunked {
   178  		cleanupOldChunks = func() {
   179  			// We ignore all errors that can happen here because we already
   180  			// saved the new state and there is no way to return a warning to
   181  			// the user. We may end up with dangling chunks but there is no way
   182  			// to be sure we won't.
   183  			path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%s/", hash)
   184  			kv.DeleteTree(path, nil)
   185  		}
   186  	}
   187  
   188  	payload := data
   189  	if c.GZip {
   190  		if compressedState, err := compressState(data); err == nil {
   191  			payload = compressedState
   192  		} else {
   193  			return err
   194  		}
   195  	}
   196  
   197  	// default to doing a CAS
   198  	verb := consulapi.KVCAS
   199  
   200  	// Assume a 0 index doesn't need a CAS for now, since we are either
   201  	// creating a new state or purposely overwriting one.
   202  	if c.modifyIndex == 0 {
   203  		verb = consulapi.KVSet
   204  	}
   205  
   206  	// The payload may be too large to store in a single KV entry in Consul. We
   207  	// could try to determine whether it will fit or not before sending the
   208  	// request but since we are using the Transaction API and not the KV API,
   209  	// it grows by about a 1/3 when it is base64 encoded plus the overhead of
   210  	// the fields specific to the Transaction API.
   211  	// Rather than trying to calculate the overhead (which could change from
   212  	// one version of Consul to another, and between Consul Community Edition
   213  	// and Consul Enterprise), we try to send the whole state in one request, if
   214  	// it fails because it is too big we then split it in chunks and send each
   215  	// chunk separately.
   216  	// When splitting in chunks, we make each chunk 524288 bits, which is the
   217  	// default max size for raft. If the user changed it, we still may send
   218  	// chunks too big and fail but this is not a setting that should be fiddled
   219  	// with anyway.
   220  
   221  	store := func(payload []byte) error {
   222  		// KV.Put doesn't return the new index, so we use a single operation
   223  		// transaction to get the new index with a single request.
   224  		txOps := consulapi.KVTxnOps{
   225  			&consulapi.KVTxnOp{
   226  				Verb:  verb,
   227  				Key:   c.Path,
   228  				Value: payload,
   229  				Index: c.modifyIndex,
   230  			},
   231  		}
   232  
   233  		ok, resp, _, err := kv.Txn(txOps, nil)
   234  		if err != nil {
   235  			return err
   236  		}
   237  		// transaction was rolled back
   238  		if !ok {
   239  			var resultErr error
   240  			for _, respError := range resp.Errors {
   241  				resultErr = multierror.Append(resultErr, errors.New(respError.What))
   242  			}
   243  			return fmt.Errorf("consul CAS failed with transaction errors: %w", resultErr)
   244  		}
   245  
   246  		if len(resp.Results) != 1 {
   247  			// this probably shouldn't happen
   248  			return fmt.Errorf("expected on 1 response value, got: %d", len(resp.Results))
   249  		}
   250  
   251  		c.modifyIndex = resp.Results[0].ModifyIndex
   252  
   253  		// We remove all the old chunks
   254  		cleanupOldChunks()
   255  
   256  		return nil
   257  	}
   258  
   259  	if err = store(payload); err == nil {
   260  		// The payload was small enough to be stored
   261  		return nil
   262  	} else if !strings.Contains(err.Error(), "too large") {
   263  		// We failed for some other reason, report this to the user
   264  		return err
   265  	}
   266  
   267  	// The payload was too large so we split it in multiple chunks
   268  
   269  	md5 := md5.Sum(data)
   270  	chunks := split(payload, 524288)
   271  	chunkPaths := make([]string, 0)
   272  
   273  	// First we write the new chunks
   274  	for i, p := range chunks {
   275  		path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%x/%d", md5, i)
   276  		chunkPaths = append(chunkPaths, path)
   277  		_, err := kv.Put(&consulapi.KVPair{
   278  			Key:   path,
   279  			Value: p,
   280  		}, nil)
   281  
   282  		if err != nil {
   283  			return err
   284  		}
   285  	}
   286  
   287  	// Then we update the link to point to the new chunks
   288  	payload, err = json.Marshal(map[string]interface{}{
   289  		"current-hash": fmt.Sprintf("%x", md5),
   290  		"chunks":       chunkPaths,
   291  	})
   292  	if err != nil {
   293  		return err
   294  	}
   295  	return store(payload)
   296  }
   297  
   298  func (c *RemoteClient) Delete() error {
   299  	c.mu.Lock()
   300  	defer c.mu.Unlock()
   301  
   302  	kv := c.Client.KV()
   303  
   304  	chunked, hash, _, _, err := c.chunkedMode()
   305  	if err != nil {
   306  		return err
   307  	}
   308  
   309  	_, err = kv.Delete(c.Path, nil)
   310  
   311  	// If there were chunks we need to remove them
   312  	if chunked {
   313  		path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%s/", hash)
   314  		kv.DeleteTree(path, nil)
   315  	}
   316  
   317  	return err
   318  }
   319  
   320  func (c *RemoteClient) lockPath() string {
   321  	// we sanitize the path for the lock as Consul does not like having
   322  	// two consecutive slashes for the lock path
   323  	return strings.TrimRight(c.Path, "/")
   324  }
   325  
   326  func (c *RemoteClient) putLockInfo(info *statemgr.LockInfo) error {
   327  	info.Path = c.Path
   328  	info.Created = time.Now().UTC()
   329  
   330  	kv := c.Client.KV()
   331  	_, err := kv.Put(&consulapi.KVPair{
   332  		Key:   c.lockPath() + lockInfoSuffix,
   333  		Value: info.Marshal(),
   334  	}, nil)
   335  
   336  	return err
   337  }
   338  
   339  func (c *RemoteClient) getLockInfo() (*statemgr.LockInfo, error) {
   340  	path := c.lockPath() + lockInfoSuffix
   341  	pair, _, err := c.Client.KV().Get(path, nil)
   342  	if err != nil {
   343  		return nil, err
   344  	}
   345  	if pair == nil {
   346  		return nil, nil
   347  	}
   348  
   349  	li := &statemgr.LockInfo{}
   350  	err = json.Unmarshal(pair.Value, li)
   351  	if err != nil {
   352  		return nil, fmt.Errorf("error unmarshaling lock info: %w", err)
   353  	}
   354  
   355  	return li, nil
   356  }
   357  
   358  func (c *RemoteClient) Lock(info *statemgr.LockInfo) (string, error) {
   359  	c.mu.Lock()
   360  	defer c.mu.Unlock()
   361  
   362  	if !c.lockState {
   363  		return "", nil
   364  	}
   365  
   366  	c.info = info
   367  
   368  	// These checks only are to ensure we strictly follow the specification.
   369  	// OpenTofu shouldn't ever re-lock, so provide errors for the 2 possible
   370  	// states if this is called.
   371  	select {
   372  	case <-c.lockCh:
   373  		// We had a lock, but lost it.
   374  		return "", errors.New("lost consul lock, cannot re-lock")
   375  	default:
   376  		if c.lockCh != nil {
   377  			// we have an active lock already
   378  			return "", fmt.Errorf("state %q already locked", c.Path)
   379  		}
   380  	}
   381  
   382  	return c.lock()
   383  }
   384  
   385  // the lock implementation.
   386  // Only to be called while holding Client.mu
   387  func (c *RemoteClient) lock() (string, error) {
   388  	// We create a new session here, so it can be canceled when the lock is
   389  	// lost or unlocked.
   390  	lockSession, err := c.createSession()
   391  	if err != nil {
   392  		return "", err
   393  	}
   394  
   395  	// store the session ID for correlation with consul logs
   396  	c.info.Info = "consul session: " + lockSession
   397  
   398  	// A random lock ID has been generated but we override it with the session
   399  	// ID as this will make it easier to manually invalidate the session
   400  	// if needed.
   401  	c.info.ID = lockSession
   402  
   403  	opts := &consulapi.LockOptions{
   404  		Key:     c.lockPath() + lockSuffix,
   405  		Session: lockSession,
   406  
   407  		// only wait briefly, so tofu has the choice to fail fast or
   408  		// retry as needed.
   409  		LockWaitTime: time.Second,
   410  		LockTryOnce:  true,
   411  
   412  		// Don't let the lock monitor give up right away, as it's possible the
   413  		// session is still OK. While the session is refreshed at a rate of
   414  		// TTL/2, the lock monitor is an idle blocking request and is more
   415  		// susceptible to being closed by a lower network layer.
   416  		MonitorRetries: 5,
   417  		//
   418  		// The delay between lock monitor retries.
   419  		// While the session has a 15s TTL plus a 5s wait period on a lost
   420  		// lock, if we can't get our lock back in 10+ seconds something is
   421  		// wrong so we're going to drop the session and start over.
   422  		MonitorRetryTime: 2 * time.Second,
   423  	}
   424  
   425  	c.consulLock, err = c.Client.LockOpts(opts)
   426  	if err != nil {
   427  		return "", err
   428  	}
   429  
   430  	lockErr := &statemgr.LockError{}
   431  
   432  	lockCh, err := c.consulLock.Lock(make(chan struct{}))
   433  	if err != nil {
   434  		lockErr.Err = err
   435  		return "", lockErr
   436  	}
   437  
   438  	if lockCh == nil {
   439  		lockInfo, e := c.getLockInfo()
   440  		if e != nil {
   441  			lockErr.Err = e
   442  			return "", lockErr
   443  		}
   444  
   445  		lockErr.Info = lockInfo
   446  
   447  		return "", lockErr
   448  	}
   449  
   450  	c.lockCh = lockCh
   451  
   452  	err = c.putLockInfo(c.info)
   453  	if err != nil {
   454  		if unlockErr := c.unlock(c.info.ID); unlockErr != nil {
   455  			err = multierror.Append(err, unlockErr)
   456  		}
   457  
   458  		return "", err
   459  	}
   460  
   461  	// Start a goroutine to monitor the lock state.
   462  	// If we lose the lock to due communication issues with the consul agent,
   463  	// attempt to immediately reacquire the lock. Put will verify the integrity
   464  	// of the state by using a CAS operation.
   465  	ctx, cancel := context.WithCancel(context.Background())
   466  	c.monitorCancel = cancel
   467  	c.monitorWG.Add(1)
   468  	go func() {
   469  		defer c.monitorWG.Done()
   470  		select {
   471  		case <-c.lockCh:
   472  			log.Println("[ERROR] lost consul lock")
   473  			for {
   474  				c.mu.Lock()
   475  				// We lost our lock, so we need to cancel the session too.
   476  				// The CancelFunc is only replaced while holding Client.mu, so
   477  				// this is safe to call here. This will be replaced by the
   478  				// lock() call below.
   479  				c.sessionCancel()
   480  
   481  				c.consulLock = nil
   482  				_, err := c.lock()
   483  				c.mu.Unlock()
   484  
   485  				if err != nil {
   486  					// We failed to get the lock, keep trying as long as
   487  					// tofu is running. There may be changes in progress,
   488  					// so there's no use in aborting. Either we eventually
   489  					// reacquire the lock, or a Put will fail on a CAS.
   490  					log.Printf("[ERROR] could not reacquire lock: %s", err)
   491  					time.Sleep(lockReacquireInterval)
   492  
   493  					select {
   494  					case <-ctx.Done():
   495  						return
   496  					default:
   497  					}
   498  					continue
   499  				}
   500  
   501  				// if the error was nil, the new lock started a new copy of
   502  				// this goroutine.
   503  				return
   504  			}
   505  
   506  		case <-ctx.Done():
   507  			return
   508  		}
   509  	}()
   510  
   511  	if testLockHook != nil {
   512  		testLockHook()
   513  	}
   514  
   515  	return c.info.ID, nil
   516  }
   517  
   518  // called after a lock is acquired
   519  var testLockHook func()
   520  
   521  func (c *RemoteClient) createSession() (string, error) {
   522  	// create the context first. Even if the session creation fails, we assume
   523  	// that the CancelFunc is always callable.
   524  	ctx, cancel := context.WithCancel(context.Background())
   525  	c.sessionCancel = cancel
   526  
   527  	session := c.Client.Session()
   528  	se := &consulapi.SessionEntry{
   529  		Name:      consulapi.DefaultLockSessionName,
   530  		TTL:       lockSessionTTL,
   531  		LockDelay: lockDelay,
   532  	}
   533  
   534  	id, _, err := session.Create(se, nil)
   535  	if err != nil {
   536  		return "", err
   537  	}
   538  
   539  	log.Println("[INFO] created consul lock session", id)
   540  
   541  	// keep the session renewed
   542  	go session.RenewPeriodic(lockSessionTTL, id, nil, ctx.Done())
   543  
   544  	return id, nil
   545  }
   546  
   547  func (c *RemoteClient) Unlock(id string) error {
   548  	c.mu.Lock()
   549  	defer c.mu.Unlock()
   550  
   551  	if !c.lockState {
   552  		return nil
   553  	}
   554  
   555  	return c.unlock(id)
   556  }
   557  
   558  // the unlock implementation.
   559  // Only to be called while holding Client.mu
   560  func (c *RemoteClient) unlock(id string) error {
   561  	// This method can be called in two circumstances:
   562  	// - when the plan apply or destroy operation finishes and the lock needs to be released,
   563  	// the watchdog stopped and the session closed
   564  	// - when the user calls `tofu force-unlock <lock_id>` in which case
   565  	// we only need to release the lock.
   566  
   567  	if c.consulLock == nil || c.lockCh == nil {
   568  		// The user called `tofu force-unlock <lock_id>`, we just destroy
   569  		// the session which will release the lock, clean the KV store and quit.
   570  
   571  		_, err := c.Client.Session().Destroy(id, nil)
   572  		if err != nil {
   573  			return err
   574  		}
   575  		// We ignore the errors that may happen during cleanup
   576  		kv := c.Client.KV()
   577  		kv.Delete(c.lockPath()+lockSuffix, nil)
   578  		kv.Delete(c.lockPath()+lockInfoSuffix, nil)
   579  
   580  		return nil
   581  	}
   582  
   583  	// cancel our monitoring goroutine
   584  	c.monitorCancel()
   585  
   586  	defer func() {
   587  		c.consulLock = nil
   588  
   589  		// The consul session is only used for this single lock, so cancel it
   590  		// after we unlock.
   591  		// The session is only created and replaced holding Client.mu, so the
   592  		// CancelFunc must be non-nil.
   593  		c.sessionCancel()
   594  	}()
   595  
   596  	select {
   597  	case <-c.lockCh:
   598  		return lostLockErr
   599  	default:
   600  	}
   601  
   602  	kv := c.Client.KV()
   603  
   604  	var errs error
   605  
   606  	if _, err := kv.Delete(c.lockPath()+lockInfoSuffix, nil); err != nil {
   607  		errs = multierror.Append(errs, err)
   608  	}
   609  
   610  	if err := c.consulLock.Unlock(); err != nil {
   611  		errs = multierror.Append(errs, err)
   612  	}
   613  
   614  	// the monitoring goroutine may be in a select on the lockCh, so we need to
   615  	// wait for it to return before changing the value.
   616  	c.monitorWG.Wait()
   617  	c.lockCh = nil
   618  
   619  	// This is only cleanup, and will fail if the lock was immediately taken by
   620  	// another client, so we don't report an error to the user here.
   621  	c.consulLock.Destroy()
   622  
   623  	return errs
   624  }
   625  
   626  func compressState(data []byte) ([]byte, error) {
   627  	b := new(bytes.Buffer)
   628  	gz := gzip.NewWriter(b)
   629  	if _, err := gz.Write(data); err != nil {
   630  		return nil, err
   631  	}
   632  	if err := gz.Flush(); err != nil {
   633  		return nil, err
   634  	}
   635  	if err := gz.Close(); err != nil {
   636  		return nil, err
   637  	}
   638  	return b.Bytes(), nil
   639  }
   640  
   641  func uncompressState(data []byte) ([]byte, error) {
   642  	b := new(bytes.Buffer)
   643  	gz, err := gzip.NewReader(bytes.NewReader(data))
   644  	if err != nil {
   645  		return nil, err
   646  	}
   647  	b.ReadFrom(gz)
   648  	if err := gz.Close(); err != nil {
   649  		return nil, err
   650  	}
   651  	return b.Bytes(), nil
   652  }
   653  
   654  func split(payload []byte, limit int) [][]byte {
   655  	var chunk []byte
   656  	chunks := make([][]byte, 0, len(payload)/limit+1)
   657  	for len(payload) >= limit {
   658  		chunk, payload = payload[:limit], payload[limit:]
   659  		chunks = append(chunks, chunk)
   660  	}
   661  	if len(payload) > 0 {
   662  		chunks = append(chunks, payload[:])
   663  	}
   664  	return chunks
   665  }
   666  
   667  func (c *RemoteClient) chunkedMode() (bool, string, []string, *consulapi.KVPair, error) {
   668  	kv := c.Client.KV()
   669  	pair, _, err := kv.Get(c.Path, nil)
   670  	if err != nil {
   671  		return false, "", nil, pair, err
   672  	}
   673  	if pair != nil {
   674  		var d map[string]interface{}
   675  		err = json.Unmarshal(pair.Value, &d)
   676  		// If there is an error when unmarshaling the payload, the state has
   677  		// probably been gziped in single entry mode.
   678  		if err == nil {
   679  			// If we find the "current-hash" key we were in chunked mode
   680  			hash, ok := d["current-hash"]
   681  			if ok {
   682  				chunks := make([]string, 0)
   683  				for _, c := range d["chunks"].([]interface{}) {
   684  					chunks = append(chunks, c.(string))
   685  				}
   686  				return true, hash.(string), chunks, pair, nil
   687  			}
   688  		}
   689  	}
   690  	return false, "", nil, pair, nil
   691  }