github.com/pgray/terraform@v0.5.4-0.20170822184730-b6a464c5214d/backend/remote-state/consul/client.go (about)

     1  package consul
     2  
     3  import (
     4  	"bytes"
     5  	"compress/gzip"
     6  	"context"
     7  	"crypto/md5"
     8  	"encoding/json"
     9  	"errors"
    10  	"fmt"
    11  	"log"
    12  	"sync"
    13  	"time"
    14  
    15  	consulapi "github.com/hashicorp/consul/api"
    16  	multierror "github.com/hashicorp/go-multierror"
    17  	"github.com/hashicorp/terraform/state"
    18  	"github.com/hashicorp/terraform/state/remote"
    19  )
    20  
const (
	// Suffixes appended to the state path to form the consul keys used for
	// the lock itself and for the lock metadata (see putLockInfo/getLockInfo).
	lockSuffix     = "/.lock"
	lockInfoSuffix = "/.lockinfo"

	// The Session TTL associated with this lock. It is also passed to
	// RenewPeriodic as the initial TTL when keeping the session alive.
	lockSessionTTL = "15s"

	// the delay time from when a session is lost to when the
	// lock is released by the server
	lockDelay = 5 * time.Second
	// interval between attempts to reacquire a lost lock
	lockReacquireInterval = 2 * time.Second
)
    34  
// RemoteClient is a remote client that stores data in Consul.
//
// The state is stored in the KV entry at Path. All exported methods take mu,
// so a single RemoteClient may be shared between goroutines.
type RemoteClient struct {
	// Client is the consul API client used for all KV, session and lock
	// requests. Path is the KV key holding the state. GZip, when true, makes
	// Put compress the state before writing it.
	Client *consulapi.Client
	Path   string
	GZip   bool

	// mu guards the fields below.
	mu sync.Mutex
	// lockState is true if we're using locks
	lockState bool

	// The index of the last state we wrote.
	// If this is > 0, Put will perform a CAS to ensure that the state wasn't
	// changed during the operation. This is important even with locks, because
	// if the client loses the lock for some reason, then reacquires it, we
	// need to make sure that the state was not modified.
	modifyIndex uint64

	// consulLock is the lock object created by the most recent lock() call;
	// lockCh is the channel returned when that lock was acquired, and is
	// treated as "lock lost" once it becomes readable.
	consulLock *consulapi.Lock
	lockCh     <-chan struct{}

	// info is the LockInfo passed to the last Lock call; it is written next
	// to the state while the lock is held.
	info *state.LockInfo

	// cancel our goroutine which is monitoring the lock to automatically
	// reacquire it when possible.
	monitorCancel context.CancelFunc
	monitorWG     sync.WaitGroup

	// sessionCancel cancels the Context used for session.RenewPeriodic, and
	// is called when unlocking, or before creating a new lock if the lock is
	// lost.
	sessionCancel context.CancelFunc
}
    67  
    68  func (c *RemoteClient) Get() (*remote.Payload, error) {
    69  	c.mu.Lock()
    70  	defer c.mu.Unlock()
    71  
    72  	pair, _, err := c.Client.KV().Get(c.Path, nil)
    73  	if err != nil {
    74  		return nil, err
    75  	}
    76  	if pair == nil {
    77  		return nil, nil
    78  	}
    79  
    80  	c.modifyIndex = pair.ModifyIndex
    81  
    82  	payload := pair.Value
    83  	// If the payload starts with 0x1f, it's gzip, not json
    84  	if len(pair.Value) >= 1 && pair.Value[0] == '\x1f' {
    85  		if data, err := uncompressState(pair.Value); err == nil {
    86  			payload = data
    87  		} else {
    88  			return nil, err
    89  		}
    90  	}
    91  
    92  	md5 := md5.Sum(pair.Value)
    93  	return &remote.Payload{
    94  		Data: payload,
    95  		MD5:  md5[:],
    96  	}, nil
    97  }
    98  
    99  func (c *RemoteClient) Put(data []byte) error {
   100  	c.mu.Lock()
   101  	defer c.mu.Unlock()
   102  
   103  	payload := data
   104  	if c.GZip {
   105  		if compressedState, err := compressState(data); err == nil {
   106  			payload = compressedState
   107  		} else {
   108  			return err
   109  		}
   110  	}
   111  
   112  	kv := c.Client.KV()
   113  
   114  	// default to doing a CAS
   115  	verb := consulapi.KVCAS
   116  
   117  	// Assume a 0 index doesn't need a CAS for now, since we are either
   118  	// creating a new state or purposely overwriting one.
   119  	if c.modifyIndex == 0 {
   120  		verb = consulapi.KVSet
   121  	}
   122  
   123  	// KV.Put doesn't return the new index, so we use a single operation
   124  	// transaction to get the new index with a single request.
   125  	txOps := consulapi.KVTxnOps{
   126  		&consulapi.KVTxnOp{
   127  			Verb:  verb,
   128  			Key:   c.Path,
   129  			Value: payload,
   130  			Index: c.modifyIndex,
   131  		},
   132  	}
   133  
   134  	ok, resp, _, err := kv.Txn(txOps, nil)
   135  	if err != nil {
   136  		return err
   137  	}
   138  
   139  	// transaction was rolled back
   140  	if !ok {
   141  		return fmt.Errorf("consul CAS failed with transaction errors: %v", resp.Errors)
   142  	}
   143  
   144  	if len(resp.Results) != 1 {
   145  		// this probably shouldn't happen
   146  		return fmt.Errorf("expected on 1 response value, got: %d", len(resp.Results))
   147  	}
   148  
   149  	c.modifyIndex = resp.Results[0].ModifyIndex
   150  	return nil
   151  }
   152  
   153  func (c *RemoteClient) Delete() error {
   154  	c.mu.Lock()
   155  	defer c.mu.Unlock()
   156  
   157  	kv := c.Client.KV()
   158  	_, err := kv.Delete(c.Path, nil)
   159  	return err
   160  }
   161  
   162  func (c *RemoteClient) putLockInfo(info *state.LockInfo) error {
   163  	info.Path = c.Path
   164  	info.Created = time.Now().UTC()
   165  
   166  	kv := c.Client.KV()
   167  	_, err := kv.Put(&consulapi.KVPair{
   168  		Key:   c.Path + lockInfoSuffix,
   169  		Value: info.Marshal(),
   170  	}, nil)
   171  
   172  	return err
   173  }
   174  
   175  func (c *RemoteClient) getLockInfo() (*state.LockInfo, error) {
   176  	path := c.Path + lockInfoSuffix
   177  	pair, _, err := c.Client.KV().Get(path, nil)
   178  	if err != nil {
   179  		return nil, err
   180  	}
   181  	if pair == nil {
   182  		return nil, nil
   183  	}
   184  
   185  	li := &state.LockInfo{}
   186  	err = json.Unmarshal(pair.Value, li)
   187  	if err != nil {
   188  		return nil, fmt.Errorf("error unmarshaling lock info: %s", err)
   189  	}
   190  
   191  	return li, nil
   192  }
   193  
   194  func (c *RemoteClient) Lock(info *state.LockInfo) (string, error) {
   195  	c.mu.Lock()
   196  	defer c.mu.Unlock()
   197  
   198  	if !c.lockState {
   199  		return "", nil
   200  	}
   201  
   202  	c.info = info
   203  
   204  	// These checks only are to ensure we strictly follow the specification.
   205  	// Terraform shouldn't ever re-lock, so provide errors for the 2 possible
   206  	// states if this is called.
   207  	select {
   208  	case <-c.lockCh:
   209  		// We had a lock, but lost it.
   210  		return "", errors.New("lost consul lock, cannot re-lock")
   211  	default:
   212  		if c.lockCh != nil {
   213  			// we have an active lock already
   214  			return "", fmt.Errorf("state %q already locked", c.Path)
   215  		}
   216  	}
   217  
   218  	return c.lock()
   219  }
   220  
   221  // the lock implementation.
   222  // Only to be called while holding Client.mu
   223  func (c *RemoteClient) lock() (string, error) {
   224  	// We create a new session here, so it can be canceled when the lock is
   225  	// lost or unlocked.
   226  	lockSession, err := c.createSession()
   227  	if err != nil {
   228  		return "", err
   229  	}
   230  
   231  	opts := &consulapi.LockOptions{
   232  		Key:     c.Path + lockSuffix,
   233  		Session: lockSession,
   234  
   235  		// only wait briefly, so terraform has the choice to fail fast or
   236  		// retry as needed.
   237  		LockWaitTime: time.Second,
   238  		LockTryOnce:  true,
   239  
   240  		// Don't let the lock monitor give up right away, as it's possible the
   241  		// session is still OK. While the session is refreshed at a rate of
   242  		// TTL/2, the lock monitor is an idle blocking request and is more
   243  		// susceptible to being closed by a lower network layer.
   244  		MonitorRetries: 5,
   245  		//
   246  		// The delay between lock monitor retries.
   247  		// While the session has a 15s TTL plus a 5s wait period on a lost
   248  		// lock, if we can't get our lock back in 10+ seconds something is
   249  		// wrong so we're going to drop the session and start over.
   250  		MonitorRetryTime: 2 * time.Second,
   251  	}
   252  
   253  	c.consulLock, err = c.Client.LockOpts(opts)
   254  	if err != nil {
   255  		return "", err
   256  	}
   257  
   258  	lockErr := &state.LockError{}
   259  
   260  	lockCh, err := c.consulLock.Lock(make(chan struct{}))
   261  	if err != nil {
   262  		lockErr.Err = err
   263  		return "", lockErr
   264  	}
   265  
   266  	if lockCh == nil {
   267  		lockInfo, e := c.getLockInfo()
   268  		if e != nil {
   269  			lockErr.Err = e
   270  			return "", lockErr
   271  		}
   272  
   273  		lockErr.Info = lockInfo
   274  
   275  		return "", lockErr
   276  	}
   277  
   278  	c.lockCh = lockCh
   279  
   280  	err = c.putLockInfo(c.info)
   281  	if err != nil {
   282  		if unlockErr := c.unlock(c.info.ID); unlockErr != nil {
   283  			err = multierror.Append(err, unlockErr)
   284  		}
   285  
   286  		return "", err
   287  	}
   288  
   289  	// Start a goroutine to monitor the lock state.
   290  	// If we lose the lock to due communication issues with the consul agent,
   291  	// attempt to immediately reacquire the lock. Put will verify the integrity
   292  	// of the state by using a CAS operation.
   293  	ctx, cancel := context.WithCancel(context.Background())
   294  	c.monitorCancel = cancel
   295  	c.monitorWG.Add(1)
   296  	go func() {
   297  		defer c.monitorWG.Done()
   298  		select {
   299  		case <-c.lockCh:
   300  			log.Println("[ERROR] lost consul lock")
   301  			for {
   302  				c.mu.Lock()
   303  				// We lost our lock, so we need to cancel the session too.
   304  				// The CancelFunc is only replaced while holding Client.mu, so
   305  				// this is safe to call here. This will be replaced by the
   306  				// lock() call below.
   307  				c.sessionCancel()
   308  
   309  				c.consulLock = nil
   310  				_, err := c.lock()
   311  				c.mu.Unlock()
   312  
   313  				if err != nil {
   314  					// We failed to get the lock, keep trying as long as
   315  					// terraform is running. There may be changes in progress,
   316  					// so there's no use in aborting. Either we eventually
   317  					// reacquire the lock, or a Put will fail on a CAS.
   318  					log.Printf("[ERROR] could not reacquire lock: %s", err)
   319  					time.Sleep(lockReacquireInterval)
   320  
   321  					select {
   322  					case <-ctx.Done():
   323  						return
   324  					default:
   325  					}
   326  					continue
   327  				}
   328  
   329  				// if the error was nil, the new lock started a new copy of
   330  				// this goroutine.
   331  				return
   332  			}
   333  
   334  		case <-ctx.Done():
   335  			return
   336  		}
   337  	}()
   338  
   339  	if testLockHook != nil {
   340  		testLockHook()
   341  	}
   342  
   343  	return c.info.ID, nil
   344  }
   345  
// testLockHook, when non-nil, is called by lock() after a lock is acquired
// and the monitoring goroutine has been started. It is nil by default;
// presumably only tests set it.
var testLockHook func()
   348  
   349  func (c *RemoteClient) createSession() (string, error) {
   350  	// create the context first. Even if the session creation fails, we assume
   351  	// that the CancelFunc is always callable.
   352  	ctx, cancel := context.WithCancel(context.Background())
   353  	c.sessionCancel = cancel
   354  
   355  	session := c.Client.Session()
   356  	se := &consulapi.SessionEntry{
   357  		Name:      consulapi.DefaultLockSessionName,
   358  		TTL:       lockSessionTTL,
   359  		LockDelay: lockDelay,
   360  	}
   361  
   362  	id, _, err := session.Create(se, nil)
   363  	if err != nil {
   364  		return "", err
   365  	}
   366  
   367  	log.Println("[INFO] created consul lock session", id)
   368  
   369  	// keep the session renewed
   370  	go session.RenewPeriodic(lockSessionTTL, id, nil, ctx.Done())
   371  
   372  	return id, nil
   373  }
   374  
   375  func (c *RemoteClient) Unlock(id string) error {
   376  	c.mu.Lock()
   377  	defer c.mu.Unlock()
   378  
   379  	if !c.lockState {
   380  		return nil
   381  	}
   382  
   383  	return c.unlock(id)
   384  }
   385  
   386  // the unlock implementation.
   387  // Only to be called while holding Client.mu
   388  func (c *RemoteClient) unlock(id string) error {
   389  	// this doesn't use the lock id, because the lock is tied to the consul client.
   390  	if c.consulLock == nil || c.lockCh == nil {
   391  		return nil
   392  	}
   393  
   394  	// cancel our monitoring goroutine
   395  	c.monitorCancel()
   396  
   397  	defer func() {
   398  		c.consulLock = nil
   399  
   400  		// The consul session is only used for this single lock, so cancel it
   401  		// after we unlock.
   402  		// The session is only created and replaced holding Client.mu, so the
   403  		// CancelFunc must be non-nil.
   404  		c.sessionCancel()
   405  	}()
   406  
   407  	select {
   408  	case <-c.lockCh:
   409  		return errors.New("consul lock was lost")
   410  	default:
   411  	}
   412  
   413  	kv := c.Client.KV()
   414  
   415  	var errs error
   416  
   417  	if _, err := kv.Delete(c.Path+lockInfoSuffix, nil); err != nil {
   418  		errs = multierror.Append(errs, err)
   419  	}
   420  
   421  	if err := c.consulLock.Unlock(); err != nil {
   422  		errs = multierror.Append(errs, err)
   423  	}
   424  
   425  	// the monitoring goroutine may be in a select on the lockCh, so we need to
   426  	// wait for it to return before changing the value.
   427  	c.monitorWG.Wait()
   428  	c.lockCh = nil
   429  
   430  	// This is only cleanup, and will fail if the lock was immediately taken by
   431  	// another client, so we don't report an error to the user here.
   432  	c.consulLock.Destroy()
   433  
   434  	return errs
   435  }
   436  
   437  func compressState(data []byte) ([]byte, error) {
   438  	b := new(bytes.Buffer)
   439  	gz := gzip.NewWriter(b)
   440  	if _, err := gz.Write(data); err != nil {
   441  		return nil, err
   442  	}
   443  	if err := gz.Flush(); err != nil {
   444  		return nil, err
   445  	}
   446  	if err := gz.Close(); err != nil {
   447  		return nil, err
   448  	}
   449  	return b.Bytes(), nil
   450  }
   451  
   452  func uncompressState(data []byte) ([]byte, error) {
   453  	b := new(bytes.Buffer)
   454  	gz, err := gzip.NewReader(bytes.NewReader(data))
   455  	if err != nil {
   456  		return nil, err
   457  	}
   458  	b.ReadFrom(gz)
   459  	if err := gz.Close(); err != nil {
   460  		return nil, err
   461  	}
   462  	return b.Bytes(), nil
   463  }