github.com/graywolf-at-work-2/terraform-vendor@v1.4.5/internal/backend/remote-state/consul/client.go (about)

     1  package consul
     2  
     3  import (
     4  	"bytes"
     5  	"compress/gzip"
     6  	"context"
     7  	"crypto/md5"
     8  	"encoding/json"
     9  	"errors"
    10  	"fmt"
    11  	"log"
    12  	"strings"
    13  	"sync"
    14  	"time"
    15  
    16  	consulapi "github.com/hashicorp/consul/api"
    17  	multierror "github.com/hashicorp/go-multierror"
    18  	"github.com/hashicorp/terraform/internal/states/remote"
    19  	"github.com/hashicorp/terraform/internal/states/statemgr"
    20  )
    21  
const (
	// Suffixes appended to the sanitized state path (see lockPath) to build
	// the KV keys holding the Consul lock itself and the serialized
	// statemgr.LockInfo describing who holds it.
	lockSuffix     = "/.lock"
	lockInfoSuffix = "/.lockinfo"

	// The Session TTL associated with this lock. The same value is used as
	// the session's TTL when it is created and as the initial TTL handed to
	// Session.RenewPeriodic.
	lockSessionTTL = "15s"

	// the delay time from when a session is lost to when the
	// lock is released by the server
	lockDelay = 5 * time.Second
	// interval between attempts to reacquire a lost lock
	lockReacquireInterval = 2 * time.Second
)
    35  
// lostLockErr is returned by unlock when a receive on the lock channel
// succeeds, i.e. the Consul lock was already lost before we could release it.
var lostLockErr = errors.New("consul lock was lost")
    37  
// RemoteClient is a remote client that stores data in Consul.
//
// The state lives in the Consul KV store at Path (optionally gzipped and/or
// split in chunks, see Put). Locking is implemented with a Consul session
// plus a consulapi.Lock on Path + lockSuffix.
type RemoteClient struct {
	// Client is the Consul API client used for all KV, session and lock
	// operations. Path is the KV key the state is stored at. GZip makes Put
	// compress the state before storing it.
	Client *consulapi.Client
	Path   string
	GZip   bool

	// mu is held by every exported method and by the lock-monitoring
	// goroutine while it tries to reacquire a lost lock.
	mu sync.Mutex
	// lockState is true if we're using locks
	lockState bool

	// The index of the last state we wrote.
	// If this is > 0, Put will perform a CAS to ensure that the state wasn't
	// changed during the operation. This is important even with locks, because
	// if the client loses the lock for some reason, then reacquires it, we
	// need to make sure that the state was not modified.
	modifyIndex uint64

	// consulLock is the lock handle created by lock(); it is reset to nil by
	// unlock and before each reacquire attempt. lockCh is the channel
	// returned by consulLock.Lock: nil while no lock is held, and readable
	// once the lock has been lost.
	consulLock *consulapi.Lock
	lockCh     <-chan struct{}

	// info is the LockInfo handed to Lock; lock() fills in its ID and Info
	// fields and stores it at Path + lockInfoSuffix.
	info *statemgr.LockInfo

	// cancel our goroutine which is monitoring the lock to automatically
	// reacquire it when possible. monitorWG lets unlock wait for that
	// goroutine to return.
	monitorCancel context.CancelFunc
	monitorWG     sync.WaitGroup

	// sessionCancel cancels the Context used for session.RenewPeriodic, and is
	// called when unlocking, or before creating a new lock if the lock is
	// lost.
	sessionCancel context.CancelFunc
}
    70  
    71  func (c *RemoteClient) Get() (*remote.Payload, error) {
    72  	c.mu.Lock()
    73  	defer c.mu.Unlock()
    74  
    75  	kv := c.Client.KV()
    76  
    77  	chunked, hash, chunks, pair, err := c.chunkedMode()
    78  	if err != nil {
    79  		return nil, err
    80  	}
    81  	if pair == nil {
    82  		return nil, nil
    83  	}
    84  
    85  	c.modifyIndex = pair.ModifyIndex
    86  
    87  	var payload []byte
    88  	if chunked {
    89  		for _, c := range chunks {
    90  			pair, _, err := kv.Get(c, nil)
    91  			if err != nil {
    92  				return nil, err
    93  			}
    94  			if pair == nil {
    95  				return nil, fmt.Errorf("Key %q could not be found", c)
    96  			}
    97  			payload = append(payload, pair.Value[:]...)
    98  		}
    99  	} else {
   100  		payload = pair.Value
   101  	}
   102  
   103  	// If the payload starts with 0x1f, it's gzip, not json
   104  	if len(payload) >= 1 && payload[0] == '\x1f' {
   105  		payload, err = uncompressState(payload)
   106  		if err != nil {
   107  			return nil, err
   108  		}
   109  	}
   110  
   111  	md5 := md5.Sum(payload)
   112  
   113  	if hash != "" && fmt.Sprintf("%x", md5) != hash {
   114  		return nil, fmt.Errorf("The remote state does not match the expected hash")
   115  	}
   116  
   117  	return &remote.Payload{
   118  		Data: payload,
   119  		MD5:  md5[:],
   120  	}, nil
   121  }
   122  
   123  func (c *RemoteClient) Put(data []byte) error {
   124  	// The state can be stored in 4 different ways, based on the payload size
   125  	// and whether the user enabled gzip:
   126  	//  - single entry mode with plain JSON: a single JSON is stored at
   127  	//	  "tfstate/my_project"
   128  	//  - single entry mode gzip: the JSON payload is first gziped and stored at
   129  	//    "tfstate/my_project"
   130  	//  - chunked mode with plain JSON: the JSON payload is split in pieces and
   131  	//    stored like so:
   132  	//       - "tfstate/my_project" -> a JSON payload that contains the path of
   133  	//         the chunks and an MD5 sum like so:
   134  	//              {
   135  	//              	"current-hash": "abcdef1234",
   136  	//              	"chunks": [
   137  	//              		"tfstate/my_project/tfstate.abcdef1234/0",
   138  	//              		"tfstate/my_project/tfstate.abcdef1234/1",
   139  	//              		"tfstate/my_project/tfstate.abcdef1234/2",
   140  	//              	]
   141  	//              }
   142  	//       - "tfstate/my_project/tfstate.abcdef1234/0" -> The first chunk
   143  	//       - "tfstate/my_project/tfstate.abcdef1234/1" -> The next one
   144  	//       - ...
   145  	//  - chunked mode with gzip: the same system but we gziped the JSON payload
   146  	//    before splitting it in chunks
   147  	//
   148  	// When overwritting the current state, we need to clean the old chunks if
   149  	// we were in chunked mode (no matter whether we need to use chunks for the
   150  	// new one). To do so based on the 4 possibilities above we look at the
   151  	// value at "tfstate/my_project" and if it is:
   152  	//  - absent then it's a new state and there will be nothing to cleanup,
   153  	//  - not a JSON payload we were in single entry mode with gzip so there will
   154  	// 	  be nothing to cleanup
   155  	//  - a JSON payload, then we were either single entry mode with plain JSON
   156  	//    or in chunked mode. To differentiate between the two we look whether a
   157  	//    "current-hash" key is present in the payload. If we find one we were
   158  	//    in chunked mode and we will need to remove the old chunks (whether or
   159  	//    not we were using gzip does not matter in that case).
   160  
   161  	c.mu.Lock()
   162  	defer c.mu.Unlock()
   163  
   164  	kv := c.Client.KV()
   165  
   166  	// First we determine what mode we were using and to prepare the cleanup
   167  	chunked, hash, _, _, err := c.chunkedMode()
   168  	if err != nil {
   169  		return err
   170  	}
   171  	cleanupOldChunks := func() {}
   172  	if chunked {
   173  		cleanupOldChunks = func() {
   174  			// We ignore all errors that can happen here because we already
   175  			// saved the new state and there is no way to return a warning to
   176  			// the user. We may end up with dangling chunks but there is no way
   177  			// to be sure we won't.
   178  			path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%s/", hash)
   179  			kv.DeleteTree(path, nil)
   180  		}
   181  	}
   182  
   183  	payload := data
   184  	if c.GZip {
   185  		if compressedState, err := compressState(data); err == nil {
   186  			payload = compressedState
   187  		} else {
   188  			return err
   189  		}
   190  	}
   191  
   192  	// default to doing a CAS
   193  	verb := consulapi.KVCAS
   194  
   195  	// Assume a 0 index doesn't need a CAS for now, since we are either
   196  	// creating a new state or purposely overwriting one.
   197  	if c.modifyIndex == 0 {
   198  		verb = consulapi.KVSet
   199  	}
   200  
   201  	// The payload may be too large to store in a single KV entry in Consul. We
   202  	// could try to determine whether it will fit or not before sending the
   203  	// request but since we are using the Transaction API and not the KV API,
   204  	// it grows by about a 1/3 when it is base64 encoded plus the overhead of
   205  	// the fields specific to the Transaction API.
   206  	// Rather than trying to calculate the overhead (which could change from
   207  	// one version of Consul to another, and between Consul Community Edition
   208  	// and Consul Enterprise), we try to send the whole state in one request, if
   209  	// it fails because it is too big we then split it in chunks and send each
   210  	// chunk separately.
   211  	// When splitting in chunks, we make each chunk 524288 bits, which is the
   212  	// default max size for raft. If the user changed it, we still may send
   213  	// chunks too big and fail but this is not a setting that should be fiddled
   214  	// with anyway.
   215  
   216  	store := func(payload []byte) error {
   217  		// KV.Put doesn't return the new index, so we use a single operation
   218  		// transaction to get the new index with a single request.
   219  		txOps := consulapi.KVTxnOps{
   220  			&consulapi.KVTxnOp{
   221  				Verb:  verb,
   222  				Key:   c.Path,
   223  				Value: payload,
   224  				Index: c.modifyIndex,
   225  			},
   226  		}
   227  
   228  		ok, resp, _, err := kv.Txn(txOps, nil)
   229  		if err != nil {
   230  			return err
   231  		}
   232  		// transaction was rolled back
   233  		if !ok {
   234  			return fmt.Errorf("consul CAS failed with transaction errors: %v", resp.Errors)
   235  		}
   236  
   237  		if len(resp.Results) != 1 {
   238  			// this probably shouldn't happen
   239  			return fmt.Errorf("expected on 1 response value, got: %d", len(resp.Results))
   240  		}
   241  
   242  		c.modifyIndex = resp.Results[0].ModifyIndex
   243  
   244  		// We remove all the old chunks
   245  		cleanupOldChunks()
   246  
   247  		return nil
   248  	}
   249  
   250  	if err = store(payload); err == nil {
   251  		// The payload was small enough to be stored
   252  		return nil
   253  	} else if !strings.Contains(err.Error(), "too large") {
   254  		// We failed for some other reason, report this to the user
   255  		return err
   256  	}
   257  
   258  	// The payload was too large so we split it in multiple chunks
   259  
   260  	md5 := md5.Sum(data)
   261  	chunks := split(payload, 524288)
   262  	chunkPaths := make([]string, 0)
   263  
   264  	// First we write the new chunks
   265  	for i, p := range chunks {
   266  		path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%x/%d", md5, i)
   267  		chunkPaths = append(chunkPaths, path)
   268  		_, err := kv.Put(&consulapi.KVPair{
   269  			Key:   path,
   270  			Value: p,
   271  		}, nil)
   272  
   273  		if err != nil {
   274  			return err
   275  		}
   276  	}
   277  
   278  	// Then we update the link to point to the new chunks
   279  	payload, err = json.Marshal(map[string]interface{}{
   280  		"current-hash": fmt.Sprintf("%x", md5),
   281  		"chunks":       chunkPaths,
   282  	})
   283  	if err != nil {
   284  		return err
   285  	}
   286  	return store(payload)
   287  }
   288  
   289  func (c *RemoteClient) Delete() error {
   290  	c.mu.Lock()
   291  	defer c.mu.Unlock()
   292  
   293  	kv := c.Client.KV()
   294  
   295  	chunked, hash, _, _, err := c.chunkedMode()
   296  	if err != nil {
   297  		return err
   298  	}
   299  
   300  	_, err = kv.Delete(c.Path, nil)
   301  
   302  	// If there were chunks we need to remove them
   303  	if chunked {
   304  		path := strings.TrimRight(c.Path, "/") + fmt.Sprintf("/tfstate.%s/", hash)
   305  		kv.DeleteTree(path, nil)
   306  	}
   307  
   308  	return err
   309  }
   310  
   311  func (c *RemoteClient) lockPath() string {
   312  	// we sanitize the path for the lock as Consul does not like having
   313  	// two consecutive slashes for the lock path
   314  	return strings.TrimRight(c.Path, "/")
   315  }
   316  
   317  func (c *RemoteClient) putLockInfo(info *statemgr.LockInfo) error {
   318  	info.Path = c.Path
   319  	info.Created = time.Now().UTC()
   320  
   321  	kv := c.Client.KV()
   322  	_, err := kv.Put(&consulapi.KVPair{
   323  		Key:   c.lockPath() + lockInfoSuffix,
   324  		Value: info.Marshal(),
   325  	}, nil)
   326  
   327  	return err
   328  }
   329  
   330  func (c *RemoteClient) getLockInfo() (*statemgr.LockInfo, error) {
   331  	path := c.lockPath() + lockInfoSuffix
   332  	pair, _, err := c.Client.KV().Get(path, nil)
   333  	if err != nil {
   334  		return nil, err
   335  	}
   336  	if pair == nil {
   337  		return nil, nil
   338  	}
   339  
   340  	li := &statemgr.LockInfo{}
   341  	err = json.Unmarshal(pair.Value, li)
   342  	if err != nil {
   343  		return nil, fmt.Errorf("error unmarshaling lock info: %s", err)
   344  	}
   345  
   346  	return li, nil
   347  }
   348  
   349  func (c *RemoteClient) Lock(info *statemgr.LockInfo) (string, error) {
   350  	c.mu.Lock()
   351  	defer c.mu.Unlock()
   352  
   353  	if !c.lockState {
   354  		return "", nil
   355  	}
   356  
   357  	c.info = info
   358  
   359  	// These checks only are to ensure we strictly follow the specification.
   360  	// Terraform shouldn't ever re-lock, so provide errors for the 2 possible
   361  	// states if this is called.
   362  	select {
   363  	case <-c.lockCh:
   364  		// We had a lock, but lost it.
   365  		return "", errors.New("lost consul lock, cannot re-lock")
   366  	default:
   367  		if c.lockCh != nil {
   368  			// we have an active lock already
   369  			return "", fmt.Errorf("state %q already locked", c.Path)
   370  		}
   371  	}
   372  
   373  	return c.lock()
   374  }
   375  
   376  // the lock implementation.
   377  // Only to be called while holding Client.mu
   378  func (c *RemoteClient) lock() (string, error) {
   379  	// We create a new session here, so it can be canceled when the lock is
   380  	// lost or unlocked.
   381  	lockSession, err := c.createSession()
   382  	if err != nil {
   383  		return "", err
   384  	}
   385  
   386  	// store the session ID for correlation with consul logs
   387  	c.info.Info = "consul session: " + lockSession
   388  
   389  	// A random lock ID has been generated but we override it with the session
   390  	// ID as this will make it easier to manually invalidate the session
   391  	// if needed.
   392  	c.info.ID = lockSession
   393  
   394  	opts := &consulapi.LockOptions{
   395  		Key:     c.lockPath() + lockSuffix,
   396  		Session: lockSession,
   397  
   398  		// only wait briefly, so terraform has the choice to fail fast or
   399  		// retry as needed.
   400  		LockWaitTime: time.Second,
   401  		LockTryOnce:  true,
   402  
   403  		// Don't let the lock monitor give up right away, as it's possible the
   404  		// session is still OK. While the session is refreshed at a rate of
   405  		// TTL/2, the lock monitor is an idle blocking request and is more
   406  		// susceptible to being closed by a lower network layer.
   407  		MonitorRetries: 5,
   408  		//
   409  		// The delay between lock monitor retries.
   410  		// While the session has a 15s TTL plus a 5s wait period on a lost
   411  		// lock, if we can't get our lock back in 10+ seconds something is
   412  		// wrong so we're going to drop the session and start over.
   413  		MonitorRetryTime: 2 * time.Second,
   414  	}
   415  
   416  	c.consulLock, err = c.Client.LockOpts(opts)
   417  	if err != nil {
   418  		return "", err
   419  	}
   420  
   421  	lockErr := &statemgr.LockError{}
   422  
   423  	lockCh, err := c.consulLock.Lock(make(chan struct{}))
   424  	if err != nil {
   425  		lockErr.Err = err
   426  		return "", lockErr
   427  	}
   428  
   429  	if lockCh == nil {
   430  		lockInfo, e := c.getLockInfo()
   431  		if e != nil {
   432  			lockErr.Err = e
   433  			return "", lockErr
   434  		}
   435  
   436  		lockErr.Info = lockInfo
   437  
   438  		return "", lockErr
   439  	}
   440  
   441  	c.lockCh = lockCh
   442  
   443  	err = c.putLockInfo(c.info)
   444  	if err != nil {
   445  		if unlockErr := c.unlock(c.info.ID); unlockErr != nil {
   446  			err = multierror.Append(err, unlockErr)
   447  		}
   448  
   449  		return "", err
   450  	}
   451  
   452  	// Start a goroutine to monitor the lock state.
   453  	// If we lose the lock to due communication issues with the consul agent,
   454  	// attempt to immediately reacquire the lock. Put will verify the integrity
   455  	// of the state by using a CAS operation.
   456  	ctx, cancel := context.WithCancel(context.Background())
   457  	c.monitorCancel = cancel
   458  	c.monitorWG.Add(1)
   459  	go func() {
   460  		defer c.monitorWG.Done()
   461  		select {
   462  		case <-c.lockCh:
   463  			log.Println("[ERROR] lost consul lock")
   464  			for {
   465  				c.mu.Lock()
   466  				// We lost our lock, so we need to cancel the session too.
   467  				// The CancelFunc is only replaced while holding Client.mu, so
   468  				// this is safe to call here. This will be replaced by the
   469  				// lock() call below.
   470  				c.sessionCancel()
   471  
   472  				c.consulLock = nil
   473  				_, err := c.lock()
   474  				c.mu.Unlock()
   475  
   476  				if err != nil {
   477  					// We failed to get the lock, keep trying as long as
   478  					// terraform is running. There may be changes in progress,
   479  					// so there's no use in aborting. Either we eventually
   480  					// reacquire the lock, or a Put will fail on a CAS.
   481  					log.Printf("[ERROR] could not reacquire lock: %s", err)
   482  					time.Sleep(lockReacquireInterval)
   483  
   484  					select {
   485  					case <-ctx.Done():
   486  						return
   487  					default:
   488  					}
   489  					continue
   490  				}
   491  
   492  				// if the error was nil, the new lock started a new copy of
   493  				// this goroutine.
   494  				return
   495  			}
   496  
   497  		case <-ctx.Done():
   498  			return
   499  		}
   500  	}()
   501  
   502  	if testLockHook != nil {
   503  		testLockHook()
   504  	}
   505  
   506  	return c.info.ID, nil
   507  }
   508  
// testLockHook, when non-nil, is called by lock() after a lock is acquired
// and its monitoring goroutine has been started. It is nil by default;
// presumably only tests set it.
var testLockHook func()
   511  
   512  func (c *RemoteClient) createSession() (string, error) {
   513  	// create the context first. Even if the session creation fails, we assume
   514  	// that the CancelFunc is always callable.
   515  	ctx, cancel := context.WithCancel(context.Background())
   516  	c.sessionCancel = cancel
   517  
   518  	session := c.Client.Session()
   519  	se := &consulapi.SessionEntry{
   520  		Name:      consulapi.DefaultLockSessionName,
   521  		TTL:       lockSessionTTL,
   522  		LockDelay: lockDelay,
   523  	}
   524  
   525  	id, _, err := session.Create(se, nil)
   526  	if err != nil {
   527  		return "", err
   528  	}
   529  
   530  	log.Println("[INFO] created consul lock session", id)
   531  
   532  	// keep the session renewed
   533  	go session.RenewPeriodic(lockSessionTTL, id, nil, ctx.Done())
   534  
   535  	return id, nil
   536  }
   537  
   538  func (c *RemoteClient) Unlock(id string) error {
   539  	c.mu.Lock()
   540  	defer c.mu.Unlock()
   541  
   542  	if !c.lockState {
   543  		return nil
   544  	}
   545  
   546  	return c.unlock(id)
   547  }
   548  
   549  // the unlock implementation.
   550  // Only to be called while holding Client.mu
   551  func (c *RemoteClient) unlock(id string) error {
   552  	// This method can be called in two circumstances:
   553  	// - when the plan apply or destroy operation finishes and the lock needs to be released,
   554  	// the watchdog stopped and the session closed
   555  	// - when the user calls `terraform force-unlock <lock_id>` in which case
   556  	// we only need to release the lock.
   557  
   558  	if c.consulLock == nil || c.lockCh == nil {
   559  		// The user called `terraform force-unlock <lock_id>`, we just destroy
   560  		// the session which will release the lock, clean the KV store and quit.
   561  
   562  		_, err := c.Client.Session().Destroy(id, nil)
   563  		if err != nil {
   564  			return err
   565  		}
   566  		// We ignore the errors that may happen during cleanup
   567  		kv := c.Client.KV()
   568  		kv.Delete(c.lockPath()+lockSuffix, nil)
   569  		kv.Delete(c.lockPath()+lockInfoSuffix, nil)
   570  
   571  		return nil
   572  	}
   573  
   574  	// cancel our monitoring goroutine
   575  	c.monitorCancel()
   576  
   577  	defer func() {
   578  		c.consulLock = nil
   579  
   580  		// The consul session is only used for this single lock, so cancel it
   581  		// after we unlock.
   582  		// The session is only created and replaced holding Client.mu, so the
   583  		// CancelFunc must be non-nil.
   584  		c.sessionCancel()
   585  	}()
   586  
   587  	select {
   588  	case <-c.lockCh:
   589  		return lostLockErr
   590  	default:
   591  	}
   592  
   593  	kv := c.Client.KV()
   594  
   595  	var errs error
   596  
   597  	if _, err := kv.Delete(c.lockPath()+lockInfoSuffix, nil); err != nil {
   598  		errs = multierror.Append(errs, err)
   599  	}
   600  
   601  	if err := c.consulLock.Unlock(); err != nil {
   602  		errs = multierror.Append(errs, err)
   603  	}
   604  
   605  	// the monitoring goroutine may be in a select on the lockCh, so we need to
   606  	// wait for it to return before changing the value.
   607  	c.monitorWG.Wait()
   608  	c.lockCh = nil
   609  
   610  	// This is only cleanup, and will fail if the lock was immediately taken by
   611  	// another client, so we don't report an error to the user here.
   612  	c.consulLock.Destroy()
   613  
   614  	return errs
   615  }
   616  
   617  func compressState(data []byte) ([]byte, error) {
   618  	b := new(bytes.Buffer)
   619  	gz := gzip.NewWriter(b)
   620  	if _, err := gz.Write(data); err != nil {
   621  		return nil, err
   622  	}
   623  	if err := gz.Flush(); err != nil {
   624  		return nil, err
   625  	}
   626  	if err := gz.Close(); err != nil {
   627  		return nil, err
   628  	}
   629  	return b.Bytes(), nil
   630  }
   631  
   632  func uncompressState(data []byte) ([]byte, error) {
   633  	b := new(bytes.Buffer)
   634  	gz, err := gzip.NewReader(bytes.NewReader(data))
   635  	if err != nil {
   636  		return nil, err
   637  	}
   638  	b.ReadFrom(gz)
   639  	if err := gz.Close(); err != nil {
   640  		return nil, err
   641  	}
   642  	return b.Bytes(), nil
   643  }
   644  
   645  func split(payload []byte, limit int) [][]byte {
   646  	var chunk []byte
   647  	chunks := make([][]byte, 0, len(payload)/limit+1)
   648  	for len(payload) >= limit {
   649  		chunk, payload = payload[:limit], payload[limit:]
   650  		chunks = append(chunks, chunk)
   651  	}
   652  	if len(payload) > 0 {
   653  		chunks = append(chunks, payload[:])
   654  	}
   655  	return chunks
   656  }
   657  
   658  func (c *RemoteClient) chunkedMode() (bool, string, []string, *consulapi.KVPair, error) {
   659  	kv := c.Client.KV()
   660  	pair, _, err := kv.Get(c.Path, nil)
   661  	if err != nil {
   662  		return false, "", nil, pair, err
   663  	}
   664  	if pair != nil {
   665  		var d map[string]interface{}
   666  		err = json.Unmarshal(pair.Value, &d)
   667  		// If there is an error when unmarshaling the payload, the state has
   668  		// probably been gziped in single entry mode.
   669  		if err == nil {
   670  			// If we find the "current-hash" key we were in chunked mode
   671  			hash, ok := d["current-hash"]
   672  			if ok {
   673  				chunks := make([]string, 0)
   674  				for _, c := range d["chunks"].([]interface{}) {
   675  					chunks = append(chunks, c.(string))
   676  				}
   677  				return true, hash.(string), chunks, pair, nil
   678  			}
   679  		}
   680  	}
   681  	return false, "", nil, pair, nil
   682  }