github.com/Kevinklinger/open_terraform@v0.11.12-beta1/backend/remote-state/consul/client.go (about)

     1  package consul
     2  
     3  import (
     4  	"bytes"
     5  	"compress/gzip"
     6  	"context"
     7  	"crypto/md5"
     8  	"encoding/json"
     9  	"errors"
    10  	"fmt"
    11  	"log"
    12  	"sync"
    13  	"time"
    14  
    15  	consulapi "github.com/hashicorp/consul/api"
    16  	multierror "github.com/hashicorp/go-multierror"
    17  	"github.com/hashicorp/terraform/state"
    18  	"github.com/hashicorp/terraform/state/remote"
    19  )
    20  
const (
	// lockSuffix is appended to the state path to form the key of the
	// Consul lock.
	lockSuffix     = "/.lock"
	// lockInfoSuffix is appended to the state path to form the key under
	// which the marshaled lock info is stored.
	lockInfoSuffix = "/.lockinfo"

	// The Session TTL associated with this lock. The same value is passed to
	// RenewPeriodic to keep the session alive.
	lockSessionTTL = "15s"

	// the delay time from when a session is lost to when the
	// lock is released by the server
	lockDelay = 5 * time.Second
	// interval between attempts to reacquire a lost lock
	lockReacquireInterval = 2 * time.Second
)

// lostLockErr is returned by unlock when the lock channel has already been
// closed, i.e. the lock was lost before it could be released.
var lostLockErr = errors.New("consul lock was lost")
    36  
// RemoteClient is a remote client that stores data in Consul.
type RemoteClient struct {
	// Client is the Consul API client used for all KV, session and lock
	// requests.
	Client *consulapi.Client
	// Path is the KV key holding the state. The lock and lock-info keys are
	// derived from it by appending lockSuffix and lockInfoSuffix.
	Path   string
	// GZip makes Put gzip-compress the state before storing it. Get detects
	// compressed payloads on its own, regardless of this setting.
	GZip   bool

	// mu guards the fields below. It is held by every exported method and by
	// the lock-monitoring goroutine while it reacquires a lost lock.
	mu sync.Mutex
	// lockState is true if we're using locks
	lockState bool

	// The index of the last state we wrote.
	// If this is > 0, Put will perform a CAS to ensure that the state wasn't
	// changed during the operation. This is important even with locks, because
	// if the client loses the lock for some reason, then reacquires it, we
	// need to make sure that the state was not modified.
	modifyIndex uint64

	// consulLock is the Consul lock handle created by lock(); unlock sets it
	// back to nil.
	consulLock *consulapi.Lock
	// lockCh is the channel returned by consulLock.Lock. It is nil while no
	// lock is held and is closed when the held lock is lost.
	lockCh     <-chan struct{}

	// info is the lock info passed to Lock; it is written to the lock-info
	// key once the lock has been acquired.
	info *state.LockInfo

	// cancel our goroutine which is monitoring the lock to automatically
	// reacquire it when possible.
	monitorCancel context.CancelFunc
	// monitorWG lets unlock wait for the monitoring goroutine to exit.
	monitorWG     sync.WaitGroup

	// sessionCancel cancels the Context used for session.RenewPeriodic, and is
	// called when unlocking, or before creating a new lock if the lock is
	// lost.
	sessionCancel context.CancelFunc
}
    69  
    70  func (c *RemoteClient) Get() (*remote.Payload, error) {
    71  	c.mu.Lock()
    72  	defer c.mu.Unlock()
    73  
    74  	pair, _, err := c.Client.KV().Get(c.Path, nil)
    75  	if err != nil {
    76  		return nil, err
    77  	}
    78  	if pair == nil {
    79  		return nil, nil
    80  	}
    81  
    82  	c.modifyIndex = pair.ModifyIndex
    83  
    84  	payload := pair.Value
    85  	// If the payload starts with 0x1f, it's gzip, not json
    86  	if len(pair.Value) >= 1 && pair.Value[0] == '\x1f' {
    87  		if data, err := uncompressState(pair.Value); err == nil {
    88  			payload = data
    89  		} else {
    90  			return nil, err
    91  		}
    92  	}
    93  
    94  	md5 := md5.Sum(pair.Value)
    95  	return &remote.Payload{
    96  		Data: payload,
    97  		MD5:  md5[:],
    98  	}, nil
    99  }
   100  
   101  func (c *RemoteClient) Put(data []byte) error {
   102  	c.mu.Lock()
   103  	defer c.mu.Unlock()
   104  
   105  	payload := data
   106  	if c.GZip {
   107  		if compressedState, err := compressState(data); err == nil {
   108  			payload = compressedState
   109  		} else {
   110  			return err
   111  		}
   112  	}
   113  
   114  	kv := c.Client.KV()
   115  
   116  	// default to doing a CAS
   117  	verb := consulapi.KVCAS
   118  
   119  	// Assume a 0 index doesn't need a CAS for now, since we are either
   120  	// creating a new state or purposely overwriting one.
   121  	if c.modifyIndex == 0 {
   122  		verb = consulapi.KVSet
   123  	}
   124  
   125  	// KV.Put doesn't return the new index, so we use a single operation
   126  	// transaction to get the new index with a single request.
   127  	txOps := consulapi.KVTxnOps{
   128  		&consulapi.KVTxnOp{
   129  			Verb:  verb,
   130  			Key:   c.Path,
   131  			Value: payload,
   132  			Index: c.modifyIndex,
   133  		},
   134  	}
   135  
   136  	ok, resp, _, err := kv.Txn(txOps, nil)
   137  	if err != nil {
   138  		return err
   139  	}
   140  
   141  	// transaction was rolled back
   142  	if !ok {
   143  		return fmt.Errorf("consul CAS failed with transaction errors: %v", resp.Errors)
   144  	}
   145  
   146  	if len(resp.Results) != 1 {
   147  		// this probably shouldn't happen
   148  		return fmt.Errorf("expected on 1 response value, got: %d", len(resp.Results))
   149  	}
   150  
   151  	c.modifyIndex = resp.Results[0].ModifyIndex
   152  	return nil
   153  }
   154  
   155  func (c *RemoteClient) Delete() error {
   156  	c.mu.Lock()
   157  	defer c.mu.Unlock()
   158  
   159  	kv := c.Client.KV()
   160  	_, err := kv.Delete(c.Path, nil)
   161  	return err
   162  }
   163  
   164  func (c *RemoteClient) putLockInfo(info *state.LockInfo) error {
   165  	info.Path = c.Path
   166  	info.Created = time.Now().UTC()
   167  
   168  	kv := c.Client.KV()
   169  	_, err := kv.Put(&consulapi.KVPair{
   170  		Key:   c.Path + lockInfoSuffix,
   171  		Value: info.Marshal(),
   172  	}, nil)
   173  
   174  	return err
   175  }
   176  
   177  func (c *RemoteClient) getLockInfo() (*state.LockInfo, error) {
   178  	path := c.Path + lockInfoSuffix
   179  	pair, _, err := c.Client.KV().Get(path, nil)
   180  	if err != nil {
   181  		return nil, err
   182  	}
   183  	if pair == nil {
   184  		return nil, nil
   185  	}
   186  
   187  	li := &state.LockInfo{}
   188  	err = json.Unmarshal(pair.Value, li)
   189  	if err != nil {
   190  		return nil, fmt.Errorf("error unmarshaling lock info: %s", err)
   191  	}
   192  
   193  	return li, nil
   194  }
   195  
   196  func (c *RemoteClient) Lock(info *state.LockInfo) (string, error) {
   197  	c.mu.Lock()
   198  	defer c.mu.Unlock()
   199  
   200  	if !c.lockState {
   201  		return "", nil
   202  	}
   203  
   204  	c.info = info
   205  
   206  	// These checks only are to ensure we strictly follow the specification.
   207  	// Terraform shouldn't ever re-lock, so provide errors for the 2 possible
   208  	// states if this is called.
   209  	select {
   210  	case <-c.lockCh:
   211  		// We had a lock, but lost it.
   212  		return "", errors.New("lost consul lock, cannot re-lock")
   213  	default:
   214  		if c.lockCh != nil {
   215  			// we have an active lock already
   216  			return "", fmt.Errorf("state %q already locked", c.Path)
   217  		}
   218  	}
   219  
   220  	return c.lock()
   221  }
   222  
// lock is the lock implementation.
// Only to be called while holding Client.mu
//
// It creates a fresh session, tries once to acquire the Consul lock with it,
// stores the lock info, and starts a goroutine that watches for the lock being
// lost and tries to reacquire it. It returns the lock ID from c.info on
// success. Failures to acquire the lock are returned as *state.LockError,
// carrying the current holder's lock info when that could be read.
func (c *RemoteClient) lock() (string, error) {
	// We create a new session here, so it can be canceled when the lock is
	// lost or unlocked.
	// NOTE(review): the error returns below do not cancel this session. When
	// called from the monitor loop the next iteration calls c.sessionCancel(),
	// but a failed Lock() appears to leave the renewal running — confirm.
	lockSession, err := c.createSession()
	if err != nil {
		return "", err
	}

	// store the session ID for correlation with consul logs
	c.info.Info = "consul session: " + lockSession

	opts := &consulapi.LockOptions{
		Key:     c.Path + lockSuffix,
		Session: lockSession,

		// only wait briefly, so terraform has the choice to fail fast or
		// retry as needed.
		LockWaitTime: time.Second,
		LockTryOnce:  true,

		// Don't let the lock monitor give up right away, as it's possible the
		// session is still OK. While the session is refreshed at a rate of
		// TTL/2, the lock monitor is an idle blocking request and is more
		// susceptible to being closed by a lower network layer.
		MonitorRetries: 5,
		//
		// The delay between lock monitor retries.
		// While the session has a 15s TTL plus a 5s wait period on a lost
		// lock, if we can't get our lock back in 10+ seconds something is
		// wrong so we're going to drop the session and start over.
		MonitorRetryTime: 2 * time.Second,
	}

	c.consulLock, err = c.Client.LockOpts(opts)
	if err != nil {
		return "", err
	}

	lockErr := &state.LockError{}

	lockCh, err := c.consulLock.Lock(make(chan struct{}))
	if err != nil {
		lockErr.Err = err
		return "", lockErr
	}

	// No error but no channel either: the lock was not acquired (presumably
	// it is held elsewhere). Report whatever lock info is currently stored so
	// the caller can see who holds it.
	if lockCh == nil {
		lockInfo, e := c.getLockInfo()
		if e != nil {
			lockErr.Err = e
			return "", lockErr
		}

		lockErr.Info = lockInfo

		return "", lockErr
	}

	c.lockCh = lockCh

	// Publish our lock info; if that fails, release the lock we just took and
	// report both errors.
	err = c.putLockInfo(c.info)
	if err != nil {
		if unlockErr := c.unlock(c.info.ID); unlockErr != nil {
			err = multierror.Append(err, unlockErr)
		}

		return "", err
	}

	// Start a goroutine to monitor the lock state.
	// If we lose the lock due to communication issues with the consul agent,
	// attempt to immediately reacquire the lock. Put will verify the integrity
	// of the state by using a CAS operation.
	ctx, cancel := context.WithCancel(context.Background())
	c.monitorCancel = cancel
	c.monitorWG.Add(1)
	go func() {
		defer c.monitorWG.Done()
		select {
		case <-c.lockCh:
			log.Println("[ERROR] lost consul lock")
			for {
				c.mu.Lock()
				// We lost our lock, so we need to cancel the session too.
				// The CancelFunc is only replaced while holding Client.mu, so
				// this is safe to call here. This will be replaced by the
				// lock() call below.
				c.sessionCancel()

				c.consulLock = nil
				_, err := c.lock()
				c.mu.Unlock()

				if err != nil {
					// We failed to get the lock, keep trying as long as
					// terraform is running. There may be changes in progress,
					// so there's no use in aborting. Either we eventually
					// reacquire the lock, or a Put will fail on a CAS.
					log.Printf("[ERROR] could not reacquire lock: %s", err)
					time.Sleep(lockReacquireInterval)

					// Stop retrying once the monitor has been cancelled.
					select {
					case <-ctx.Done():
						return
					default:
					}
					continue
				}

				// if the error was nil, the new lock started a new copy of
				// this goroutine.
				return
			}

		case <-ctx.Done():
			// Monitoring was cancelled (unlock calls monitorCancel).
			return
		}
	}()

	if testLockHook != nil {
		testLockHook()
	}

	return c.info.ID, nil
}
   350  
// testLockHook, when non-nil, is called by lock() after a lock has been
// acquired and the monitoring goroutine has been started. It is nil by
// default.
var testLockHook func()
   353  
   354  func (c *RemoteClient) createSession() (string, error) {
   355  	// create the context first. Even if the session creation fails, we assume
   356  	// that the CancelFunc is always callable.
   357  	ctx, cancel := context.WithCancel(context.Background())
   358  	c.sessionCancel = cancel
   359  
   360  	session := c.Client.Session()
   361  	se := &consulapi.SessionEntry{
   362  		Name:      consulapi.DefaultLockSessionName,
   363  		TTL:       lockSessionTTL,
   364  		LockDelay: lockDelay,
   365  	}
   366  
   367  	id, _, err := session.Create(se, nil)
   368  	if err != nil {
   369  		return "", err
   370  	}
   371  
   372  	log.Println("[INFO] created consul lock session", id)
   373  
   374  	// keep the session renewed
   375  	go session.RenewPeriodic(lockSessionTTL, id, nil, ctx.Done())
   376  
   377  	return id, nil
   378  }
   379  
   380  func (c *RemoteClient) Unlock(id string) error {
   381  	c.mu.Lock()
   382  	defer c.mu.Unlock()
   383  
   384  	if !c.lockState {
   385  		return nil
   386  	}
   387  
   388  	return c.unlock(id)
   389  }
   390  
// unlock is the unlock implementation.
// Only to be called while holding Client.mu
//
// It stops the monitoring goroutine, deletes the lock-info key, releases the
// Consul lock and cancels the session. It returns nil if no lock is held,
// lostLockErr if the lock had already been lost, and otherwise any errors
// collected while deleting the lock info and releasing the lock.
func (c *RemoteClient) unlock(id string) error {
	// this doesn't use the lock id, because the lock is tied to the consul client.
	if c.consulLock == nil || c.lockCh == nil {
		return nil
	}

	// cancel our monitoring goroutine
	c.monitorCancel()

	// Runs on every return path below, including the lost-lock one.
	defer func() {
		c.consulLock = nil

		// The consul session is only used for this single lock, so cancel it
		// after we unlock.
		// The session is only created and replaced holding Client.mu, so the
		// CancelFunc must be non-nil.
		c.sessionCancel()
	}()

	// Non-blocking check: a closed lockCh means the lock is already gone, so
	// there is nothing left to release. Note that lockCh is left non-nil in
	// this case.
	select {
	case <-c.lockCh:
		return lostLockErr
	default:
	}

	kv := c.Client.KV()

	var errs error

	// Remove the lock info first, then release the lock itself; keep going on
	// failure so both steps are always attempted.
	if _, err := kv.Delete(c.Path+lockInfoSuffix, nil); err != nil {
		errs = multierror.Append(errs, err)
	}

	if err := c.consulLock.Unlock(); err != nil {
		errs = multierror.Append(errs, err)
	}

	// the monitoring goroutine may be in a select on the lockCh, so we need to
	// wait for it to return before changing the value.
	c.monitorWG.Wait()
	c.lockCh = nil

	// This is only cleanup, and will fail if the lock was immediately taken by
	// another client, so we don't report an error to the user here.
	c.consulLock.Destroy()

	return errs
}
   441  
   442  func compressState(data []byte) ([]byte, error) {
   443  	b := new(bytes.Buffer)
   444  	gz := gzip.NewWriter(b)
   445  	if _, err := gz.Write(data); err != nil {
   446  		return nil, err
   447  	}
   448  	if err := gz.Flush(); err != nil {
   449  		return nil, err
   450  	}
   451  	if err := gz.Close(); err != nil {
   452  		return nil, err
   453  	}
   454  	return b.Bytes(), nil
   455  }
   456  
   457  func uncompressState(data []byte) ([]byte, error) {
   458  	b := new(bytes.Buffer)
   459  	gz, err := gzip.NewReader(bytes.NewReader(data))
   460  	if err != nil {
   461  		return nil, err
   462  	}
   463  	b.ReadFrom(gz)
   464  	if err := gz.Close(); err != nil {
   465  		return nil, err
   466  	}
   467  	return b.Bytes(), nil
   468  }