github.com/terramate-io/tf@v0.0.0-20230830114523-fce866b4dfcd/backend/remote-state/consul/client.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package consul
     5  
     6  import (
     7  	"bytes"
     8  	"compress/gzip"
     9  	"context"
    10  	"crypto/md5"
    11  	"encoding/json"
    12  	"errors"
    13  	"fmt"
    14  	"log"
    15  	"strings"
    16  	"sync"
    17  	"time"
    18  
    19  	consulapi "github.com/hashicorp/consul/api"
    20  	multierror "github.com/hashicorp/go-multierror"
    21  	"github.com/terramate-io/tf/states/remote"
    22  	"github.com/terramate-io/tf/states/statemgr"
    23  )
    24  
const (
	// lockSuffix and lockInfoSuffix are appended to the sanitized state path
	// (see lockPath) to build, respectively, the key of the Consul lock and
	// the key under which the marshaled lock info is stored.
	lockSuffix     = "/.lock"
	lockInfoSuffix = "/.lockinfo"

	// The Session TTL associated with this lock. It is also the TTL handed
	// to RenewPeriodic when the session is kept alive.
	lockSessionTTL = "15s"

	// the delay time from when a session is lost to when the
	// lock is released by the server
	lockDelay = 5 * time.Second
	// interval between attempts to reacquire a lost lock
	lockReacquireInterval = 2 * time.Second
)

// lostLockErr is returned by unlock when the lock channel signals that the
// lock was lost before it could be released.
var lostLockErr = errors.New("consul lock was lost")
    40  
// RemoteClient is a remote client that stores data in Consul.
type RemoteClient struct {
	// Client is the Consul API client used for every KV, session and lock
	// request. Path is the KV key the state is stored at; lock keys are
	// derived from it. GZip makes Put compress the state before storing it.
	Client *consulapi.Client
	Path   string
	GZip   bool

	// mu is held by Get, Put, Delete, Lock and Unlock, and by the lock
	// monitor goroutine while it tries to reacquire a lost lock.
	mu sync.Mutex
	// lockState is true if we're using locks
	lockState bool

	// The index of the last state we wrote.
	// If this is > 0, Put will perform a CAS to ensure that the state wasn't
	// changed during the operation. This is important even with locks, because
	// if the client loses the lock for some reason, then reacquires it, we
	// need to make sure that the state was not modified.
	modifyIndex uint64

	// consulLock is the Consul lock created by lock(), and lockCh is the
	// channel returned when that lock was acquired. lockCh is nil while no
	// lock is held and is readable once the lock has been lost.
	consulLock *consulapi.Lock
	lockCh     <-chan struct{}

	// info is the lock metadata passed to Lock. lock() overwrites its ID and
	// Info fields with the Consul session ID before it is stored.
	info *statemgr.LockInfo

	// cancel our goroutine which is monitoring the lock to automatically
	// reacquire it when possible.
	monitorCancel context.CancelFunc
	monitorWG     sync.WaitGroup

	// sessionCancel cancels the Context used for session.RenewPeriodic, and
	// is called when unlocking, or before creating a new lock if the lock is
	// lost.
	sessionCancel context.CancelFunc
}
    73  
    74  func (c *RemoteClient) Get() (*remote.Payload, error) {
    75  	c.mu.Lock()
    76  	defer c.mu.Unlock()
    77  
    78  	kv := c.Client.KV()
    79  
    80  	chunked, hash, chunks, pair, err := c.chunkedMode()
    81  	if err != nil {
    82  		return nil, err
    83  	}
    84  	if pair == nil {
    85  		return nil, nil
    86  	}
    87  
    88  	c.modifyIndex = pair.ModifyIndex
    89  
    90  	var payload []byte
    91  	if chunked {
    92  		for _, c := range chunks {
    93  			pair, _, err := kv.Get(c, nil)
    94  			if err != nil {
    95  				return nil, err
    96  			}
    97  			if pair == nil {
    98  				return nil, fmt.Errorf("Key %q could not be found", c)
    99  			}
   100  			payload = append(payload, pair.Value[:]...)
   101  		}
   102  	} else {
   103  		payload = pair.Value
   104  	}
   105  
   106  	// If the payload starts with 0x1f, it's gzip, not json
   107  	if len(payload) >= 1 && payload[0] == '\x1f' {
   108  		payload, err = uncompressState(payload)
   109  		if err != nil {
   110  			return nil, err
   111  		}
   112  	}
   113  
   114  	md5 := md5.Sum(payload)
   115  
   116  	if hash != "" && fmt.Sprintf("%x", md5) != hash {
   117  		return nil, fmt.Errorf("The remote state does not match the expected hash")
   118  	}
   119  
   120  	return &remote.Payload{
   121  		Data: payload,
   122  		MD5:  md5[:],
   123  	}, nil
   124  }
   125  
   126  func (c *RemoteClient) Put(data []byte) error {
   127  	// The state can be stored in 4 different ways, based on the payload size
   128  	// and whether the user enabled gzip:
   129  	//  - single entry mode with plain JSON: a single JSON is stored at
   130  	//	  "tfstate/my_project"
   131  	//  - single entry mode gzip: the JSON payload is first gziped and stored at
   132  	//    "tfstate/my_project"
   133  	//  - chunked mode with plain JSON: the JSON payload is split in pieces and
   134  	//    stored like so:
   135  	//       - "tfstate/my_project" -> a JSON payload that contains the path of
   136  	//         the chunks and an MD5 sum like so:
   137  	//              {
   138  	//              	"current-hash": "abcdef1234",
   139  	//              	"chunks": [
   140  	//              		"tfstate/my_project/tfstate.abcdef1234/0",
   141  	//              		"tfstate/my_project/tfstate.abcdef1234/1",
   142  	//              		"tfstate/my_project/tfstate.abcdef1234/2",
   143  	//              	]
   144  	//              }
   145  	//       - "tfstate/my_project/tfstate.abcdef1234/0" -> The first chunk
   146  	//       - "tfstate/my_project/tfstate.abcdef1234/1" -> The next one
   147  	//       - ...
   148  	//  - chunked mode with gzip: the same system but we gziped the JSON payload
   149  	//    before splitting it in chunks
   150  	//
   151  	// When overwritting the current state, we need to clean the old chunks if
   152  	// we were in chunked mode (no matter whether we need to use chunks for the
   153  	// new one). To do so based on the 4 possibilities above we look at the
   154  	// value at "tfstate/my_project" and if it is:
   155  	//  - absent then it's a new state and there will be nothing to cleanup,
   156  	//  - not a JSON payload we were in single entry mode with gzip so there will
   157  	// 	  be nothing to cleanup
   158  	//  - a JSON payload, then we were either single entry mode with plain JSON
   159  	//    or in chunked mode. To differentiate between the two we look whether a
   160  	//    "current-hash" key is present in the payload. If we find one we were
   161  	//    in chunked mode and we will need to remove the old chunks (whether or
   162  	//    not we were using gzip does not matter in that case).
   163  
   164  	c.mu.Lock()
   165  	defer c.mu.Unlock()
   166  
   167  	kv := c.Client.KV()
   168  
   169  	// First we determine what mode we were using and to prepare the cleanup
   170  	chunked, hash, _, _, err := c.chunkedMode()
   171  	if err != nil {
   172  		return err
   173  	}
   174  	cleanupOldChunks := func() {}
   175  	if chunked {
   176  		cleanupOldChunks = func() {
   177  			// We ignore all errors that can happen here because we already
   178  			// saved the new state and there is no way to return a warning to
   179  			// the user. We may end up with dangling chunks but there is no way
   180  			// to be sure we won't.
   181  			path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%s/", hash)
   182  			kv.DeleteTree(path, nil)
   183  		}
   184  	}
   185  
   186  	payload := data
   187  	if c.GZip {
   188  		if compressedState, err := compressState(data); err == nil {
   189  			payload = compressedState
   190  		} else {
   191  			return err
   192  		}
   193  	}
   194  
   195  	// default to doing a CAS
   196  	verb := consulapi.KVCAS
   197  
   198  	// Assume a 0 index doesn't need a CAS for now, since we are either
   199  	// creating a new state or purposely overwriting one.
   200  	if c.modifyIndex == 0 {
   201  		verb = consulapi.KVSet
   202  	}
   203  
   204  	// The payload may be too large to store in a single KV entry in Consul. We
   205  	// could try to determine whether it will fit or not before sending the
   206  	// request but since we are using the Transaction API and not the KV API,
   207  	// it grows by about a 1/3 when it is base64 encoded plus the overhead of
   208  	// the fields specific to the Transaction API.
   209  	// Rather than trying to calculate the overhead (which could change from
   210  	// one version of Consul to another, and between Consul Community Edition
   211  	// and Consul Enterprise), we try to send the whole state in one request, if
   212  	// it fails because it is too big we then split it in chunks and send each
   213  	// chunk separately.
   214  	// When splitting in chunks, we make each chunk 524288 bits, which is the
   215  	// default max size for raft. If the user changed it, we still may send
   216  	// chunks too big and fail but this is not a setting that should be fiddled
   217  	// with anyway.
   218  
   219  	store := func(payload []byte) error {
   220  		// KV.Put doesn't return the new index, so we use a single operation
   221  		// transaction to get the new index with a single request.
   222  		txOps := consulapi.KVTxnOps{
   223  			&consulapi.KVTxnOp{
   224  				Verb:  verb,
   225  				Key:   c.Path,
   226  				Value: payload,
   227  				Index: c.modifyIndex,
   228  			},
   229  		}
   230  
   231  		ok, resp, _, err := kv.Txn(txOps, nil)
   232  		if err != nil {
   233  			return err
   234  		}
   235  		// transaction was rolled back
   236  		if !ok {
   237  			var resultErr error
   238  			for _, respError := range resp.Errors {
   239  				resultErr = multierror.Append(resultErr, errors.New(respError.What))
   240  			}
   241  			return fmt.Errorf("consul CAS failed with transaction errors: %w", resultErr)
   242  		}
   243  
   244  		if len(resp.Results) != 1 {
   245  			// this probably shouldn't happen
   246  			return fmt.Errorf("expected on 1 response value, got: %d", len(resp.Results))
   247  		}
   248  
   249  		c.modifyIndex = resp.Results[0].ModifyIndex
   250  
   251  		// We remove all the old chunks
   252  		cleanupOldChunks()
   253  
   254  		return nil
   255  	}
   256  
   257  	if err = store(payload); err == nil {
   258  		// The payload was small enough to be stored
   259  		return nil
   260  	} else if !strings.Contains(err.Error(), "too large") {
   261  		// We failed for some other reason, report this to the user
   262  		return err
   263  	}
   264  
   265  	// The payload was too large so we split it in multiple chunks
   266  
   267  	md5 := md5.Sum(data)
   268  	chunks := split(payload, 524288)
   269  	chunkPaths := make([]string, 0)
   270  
   271  	// First we write the new chunks
   272  	for i, p := range chunks {
   273  		path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%x/%d", md5, i)
   274  		chunkPaths = append(chunkPaths, path)
   275  		_, err := kv.Put(&consulapi.KVPair{
   276  			Key:   path,
   277  			Value: p,
   278  		}, nil)
   279  
   280  		if err != nil {
   281  			return err
   282  		}
   283  	}
   284  
   285  	// Then we update the link to point to the new chunks
   286  	payload, err = json.Marshal(map[string]interface{}{
   287  		"current-hash": fmt.Sprintf("%x", md5),
   288  		"chunks":       chunkPaths,
   289  	})
   290  	if err != nil {
   291  		return err
   292  	}
   293  	return store(payload)
   294  }
   295  
   296  func (c *RemoteClient) Delete() error {
   297  	c.mu.Lock()
   298  	defer c.mu.Unlock()
   299  
   300  	kv := c.Client.KV()
   301  
   302  	chunked, hash, _, _, err := c.chunkedMode()
   303  	if err != nil {
   304  		return err
   305  	}
   306  
   307  	_, err = kv.Delete(c.Path, nil)
   308  
   309  	// If there were chunks we need to remove them
   310  	if chunked {
   311  		path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%s/", hash)
   312  		kv.DeleteTree(path, nil)
   313  	}
   314  
   315  	return err
   316  }
   317  
   318  func (c *RemoteClient) lockPath() string {
   319  	// we sanitize the path for the lock as Consul does not like having
   320  	// two consecutive slashes for the lock path
   321  	return strings.TrimRight(c.Path, "/")
   322  }
   323  
   324  func (c *RemoteClient) putLockInfo(info *statemgr.LockInfo) error {
   325  	info.Path = c.Path
   326  	info.Created = time.Now().UTC()
   327  
   328  	kv := c.Client.KV()
   329  	_, err := kv.Put(&consulapi.KVPair{
   330  		Key:   c.lockPath() + lockInfoSuffix,
   331  		Value: info.Marshal(),
   332  	}, nil)
   333  
   334  	return err
   335  }
   336  
   337  func (c *RemoteClient) getLockInfo() (*statemgr.LockInfo, error) {
   338  	path := c.lockPath() + lockInfoSuffix
   339  	pair, _, err := c.Client.KV().Get(path, nil)
   340  	if err != nil {
   341  		return nil, err
   342  	}
   343  	if pair == nil {
   344  		return nil, nil
   345  	}
   346  
   347  	li := &statemgr.LockInfo{}
   348  	err = json.Unmarshal(pair.Value, li)
   349  	if err != nil {
   350  		return nil, fmt.Errorf("error unmarshaling lock info: %s", err)
   351  	}
   352  
   353  	return li, nil
   354  }
   355  
   356  func (c *RemoteClient) Lock(info *statemgr.LockInfo) (string, error) {
   357  	c.mu.Lock()
   358  	defer c.mu.Unlock()
   359  
   360  	if !c.lockState {
   361  		return "", nil
   362  	}
   363  
   364  	c.info = info
   365  
   366  	// These checks only are to ensure we strictly follow the specification.
   367  	// Terraform shouldn't ever re-lock, so provide errors for the 2 possible
   368  	// states if this is called.
   369  	select {
   370  	case <-c.lockCh:
   371  		// We had a lock, but lost it.
   372  		return "", errors.New("lost consul lock, cannot re-lock")
   373  	default:
   374  		if c.lockCh != nil {
   375  			// we have an active lock already
   376  			return "", fmt.Errorf("state %q already locked", c.Path)
   377  		}
   378  	}
   379  
   380  	return c.lock()
   381  }
   382  
   383  // the lock implementation.
   384  // Only to be called while holding Client.mu
   385  func (c *RemoteClient) lock() (string, error) {
   386  	// We create a new session here, so it can be canceled when the lock is
   387  	// lost or unlocked.
   388  	lockSession, err := c.createSession()
   389  	if err != nil {
   390  		return "", err
   391  	}
   392  
   393  	// store the session ID for correlation with consul logs
   394  	c.info.Info = "consul session: " + lockSession
   395  
   396  	// A random lock ID has been generated but we override it with the session
   397  	// ID as this will make it easier to manually invalidate the session
   398  	// if needed.
   399  	c.info.ID = lockSession
   400  
   401  	opts := &consulapi.LockOptions{
   402  		Key:     c.lockPath() + lockSuffix,
   403  		Session: lockSession,
   404  
   405  		// only wait briefly, so terraform has the choice to fail fast or
   406  		// retry as needed.
   407  		LockWaitTime: time.Second,
   408  		LockTryOnce:  true,
   409  
   410  		// Don't let the lock monitor give up right away, as it's possible the
   411  		// session is still OK. While the session is refreshed at a rate of
   412  		// TTL/2, the lock monitor is an idle blocking request and is more
   413  		// susceptible to being closed by a lower network layer.
   414  		MonitorRetries: 5,
   415  		//
   416  		// The delay between lock monitor retries.
   417  		// While the session has a 15s TTL plus a 5s wait period on a lost
   418  		// lock, if we can't get our lock back in 10+ seconds something is
   419  		// wrong so we're going to drop the session and start over.
   420  		MonitorRetryTime: 2 * time.Second,
   421  	}
   422  
   423  	c.consulLock, err = c.Client.LockOpts(opts)
   424  	if err != nil {
   425  		return "", err
   426  	}
   427  
   428  	lockErr := &statemgr.LockError{}
   429  
   430  	lockCh, err := c.consulLock.Lock(make(chan struct{}))
   431  	if err != nil {
   432  		lockErr.Err = err
   433  		return "", lockErr
   434  	}
   435  
   436  	if lockCh == nil {
   437  		lockInfo, e := c.getLockInfo()
   438  		if e != nil {
   439  			lockErr.Err = e
   440  			return "", lockErr
   441  		}
   442  
   443  		lockErr.Info = lockInfo
   444  
   445  		return "", lockErr
   446  	}
   447  
   448  	c.lockCh = lockCh
   449  
   450  	err = c.putLockInfo(c.info)
   451  	if err != nil {
   452  		if unlockErr := c.unlock(c.info.ID); unlockErr != nil {
   453  			err = multierror.Append(err, unlockErr)
   454  		}
   455  
   456  		return "", err
   457  	}
   458  
   459  	// Start a goroutine to monitor the lock state.
   460  	// If we lose the lock to due communication issues with the consul agent,
   461  	// attempt to immediately reacquire the lock. Put will verify the integrity
   462  	// of the state by using a CAS operation.
   463  	ctx, cancel := context.WithCancel(context.Background())
   464  	c.monitorCancel = cancel
   465  	c.monitorWG.Add(1)
   466  	go func() {
   467  		defer c.monitorWG.Done()
   468  		select {
   469  		case <-c.lockCh:
   470  			log.Println("[ERROR] lost consul lock")
   471  			for {
   472  				c.mu.Lock()
   473  				// We lost our lock, so we need to cancel the session too.
   474  				// The CancelFunc is only replaced while holding Client.mu, so
   475  				// this is safe to call here. This will be replaced by the
   476  				// lock() call below.
   477  				c.sessionCancel()
   478  
   479  				c.consulLock = nil
   480  				_, err := c.lock()
   481  				c.mu.Unlock()
   482  
   483  				if err != nil {
   484  					// We failed to get the lock, keep trying as long as
   485  					// terraform is running. There may be changes in progress,
   486  					// so there's no use in aborting. Either we eventually
   487  					// reacquire the lock, or a Put will fail on a CAS.
   488  					log.Printf("[ERROR] could not reacquire lock: %s", err)
   489  					time.Sleep(lockReacquireInterval)
   490  
   491  					select {
   492  					case <-ctx.Done():
   493  						return
   494  					default:
   495  					}
   496  					continue
   497  				}
   498  
   499  				// if the error was nil, the new lock started a new copy of
   500  				// this goroutine.
   501  				return
   502  			}
   503  
   504  		case <-ctx.Done():
   505  			return
   506  		}
   507  	}()
   508  
   509  	if testLockHook != nil {
   510  		testLockHook()
   511  	}
   512  
   513  	return c.info.ID, nil
   514  }
   515  
// testLockHook, when non-nil, is called by lock() after a lock is acquired
// and its monitor goroutine has been started. Nothing in this file assigns
// it; presumably it is set by tests.
var testLockHook func()
   518  
   519  func (c *RemoteClient) createSession() (string, error) {
   520  	// create the context first. Even if the session creation fails, we assume
   521  	// that the CancelFunc is always callable.
   522  	ctx, cancel := context.WithCancel(context.Background())
   523  	c.sessionCancel = cancel
   524  
   525  	session := c.Client.Session()
   526  	se := &consulapi.SessionEntry{
   527  		Name:      consulapi.DefaultLockSessionName,
   528  		TTL:       lockSessionTTL,
   529  		LockDelay: lockDelay,
   530  	}
   531  
   532  	id, _, err := session.Create(se, nil)
   533  	if err != nil {
   534  		return "", err
   535  	}
   536  
   537  	log.Println("[INFO] created consul lock session", id)
   538  
   539  	// keep the session renewed
   540  	go session.RenewPeriodic(lockSessionTTL, id, nil, ctx.Done())
   541  
   542  	return id, nil
   543  }
   544  
   545  func (c *RemoteClient) Unlock(id string) error {
   546  	c.mu.Lock()
   547  	defer c.mu.Unlock()
   548  
   549  	if !c.lockState {
   550  		return nil
   551  	}
   552  
   553  	return c.unlock(id)
   554  }
   555  
   556  // the unlock implementation.
   557  // Only to be called while holding Client.mu
   558  func (c *RemoteClient) unlock(id string) error {
   559  	// This method can be called in two circumstances:
   560  	// - when the plan apply or destroy operation finishes and the lock needs to be released,
   561  	// the watchdog stopped and the session closed
   562  	// - when the user calls `terraform force-unlock <lock_id>` in which case
   563  	// we only need to release the lock.
   564  
   565  	if c.consulLock == nil || c.lockCh == nil {
   566  		// The user called `terraform force-unlock <lock_id>`, we just destroy
   567  		// the session which will release the lock, clean the KV store and quit.
   568  
   569  		_, err := c.Client.Session().Destroy(id, nil)
   570  		if err != nil {
   571  			return err
   572  		}
   573  		// We ignore the errors that may happen during cleanup
   574  		kv := c.Client.KV()
   575  		kv.Delete(c.lockPath()+lockSuffix, nil)
   576  		kv.Delete(c.lockPath()+lockInfoSuffix, nil)
   577  
   578  		return nil
   579  	}
   580  
   581  	// cancel our monitoring goroutine
   582  	c.monitorCancel()
   583  
   584  	defer func() {
   585  		c.consulLock = nil
   586  
   587  		// The consul session is only used for this single lock, so cancel it
   588  		// after we unlock.
   589  		// The session is only created and replaced holding Client.mu, so the
   590  		// CancelFunc must be non-nil.
   591  		c.sessionCancel()
   592  	}()
   593  
   594  	select {
   595  	case <-c.lockCh:
   596  		return lostLockErr
   597  	default:
   598  	}
   599  
   600  	kv := c.Client.KV()
   601  
   602  	var errs error
   603  
   604  	if _, err := kv.Delete(c.lockPath()+lockInfoSuffix, nil); err != nil {
   605  		errs = multierror.Append(errs, err)
   606  	}
   607  
   608  	if err := c.consulLock.Unlock(); err != nil {
   609  		errs = multierror.Append(errs, err)
   610  	}
   611  
   612  	// the monitoring goroutine may be in a select on the lockCh, so we need to
   613  	// wait for it to return before changing the value.
   614  	c.monitorWG.Wait()
   615  	c.lockCh = nil
   616  
   617  	// This is only cleanup, and will fail if the lock was immediately taken by
   618  	// another client, so we don't report an error to the user here.
   619  	c.consulLock.Destroy()
   620  
   621  	return errs
   622  }
   623  
   624  func compressState(data []byte) ([]byte, error) {
   625  	b := new(bytes.Buffer)
   626  	gz := gzip.NewWriter(b)
   627  	if _, err := gz.Write(data); err != nil {
   628  		return nil, err
   629  	}
   630  	if err := gz.Flush(); err != nil {
   631  		return nil, err
   632  	}
   633  	if err := gz.Close(); err != nil {
   634  		return nil, err
   635  	}
   636  	return b.Bytes(), nil
   637  }
   638  
   639  func uncompressState(data []byte) ([]byte, error) {
   640  	b := new(bytes.Buffer)
   641  	gz, err := gzip.NewReader(bytes.NewReader(data))
   642  	if err != nil {
   643  		return nil, err
   644  	}
   645  	b.ReadFrom(gz)
   646  	if err := gz.Close(); err != nil {
   647  		return nil, err
   648  	}
   649  	return b.Bytes(), nil
   650  }
   651  
   652  func split(payload []byte, limit int) [][]byte {
   653  	var chunk []byte
   654  	chunks := make([][]byte, 0, len(payload)/limit+1)
   655  	for len(payload) >= limit {
   656  		chunk, payload = payload[:limit], payload[limit:]
   657  		chunks = append(chunks, chunk)
   658  	}
   659  	if len(payload) > 0 {
   660  		chunks = append(chunks, payload[:])
   661  	}
   662  	return chunks
   663  }
   664  
   665  func (c *RemoteClient) chunkedMode() (bool, string, []string, *consulapi.KVPair, error) {
   666  	kv := c.Client.KV()
   667  	pair, _, err := kv.Get(c.Path, nil)
   668  	if err != nil {
   669  		return false, "", nil, pair, err
   670  	}
   671  	if pair != nil {
   672  		var d map[string]interface{}
   673  		err = json.Unmarshal(pair.Value, &d)
   674  		// If there is an error when unmarshaling the payload, the state has
   675  		// probably been gziped in single entry mode.
   676  		if err == nil {
   677  			// If we find the "current-hash" key we were in chunked mode
   678  			hash, ok := d["current-hash"]
   679  			if ok {
   680  				chunks := make([]string, 0)
   681  				for _, c := range d["chunks"].([]interface{}) {
   682  					chunks = append(chunks, c.(string))
   683  				}
   684  				return true, hash.(string), chunks, pair, nil
   685  			}
   686  		}
   687  	}
   688  	return false, "", nil, pair, nil
   689  }