github.com/hashicorp/vault/sdk@v0.13.0/helper/testcluster/replication.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package testcluster

import (
	"context"
	"encoding/json"
	"fmt"
	"reflect"
	"strings"
	"time"

	"github.com/hashicorp/go-hclog"
	"github.com/hashicorp/go-secure-stdlib/strutil"
	"github.com/hashicorp/go-uuid"
	"github.com/hashicorp/vault/api"
	"github.com/hashicorp/vault/sdk/helper/consts"
	"github.com/mitchellh/mapstructure"
)

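// GetPerformanceToken requests a performance secondary activation token from
// the primary cluster for the secondary identified by id. If
// secondaryPublicKey is provided, the plaintext token is returned from the
// response data; otherwise the response-wrapped token is returned.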
func GetPerformanceToken(pri VaultCluster, id, secondaryPublicKey string) (string, error) {
	client := pri.Nodes()[0].APIClient()
	req := map[string]interface{}{
		"id": id,
	}
	if secondaryPublicKey != "" {
		req["secondary_public_key"] = secondaryPublicKey
	}
	secret, err := client.Logical().Write("sys/replication/performance/primary/secondary-token", req)
	if err != nil {
		return "", err
	}

	if secondaryPublicKey != "" {
		return secret.Data["token"].(string), nil
	}
	return secret.WrapInfo.Token, nil
}

func EnablePerfPrimary(ctx context.Context, pri VaultCluster) error {
	client := pri.Nodes()[0].APIClient()
	_, err := client.Logical().WriteWithContext(ctx, "sys/replication/performance/primary/enable", nil)
	if err != nil {
		return fmt.Errorf("error enabling perf primary: %w", err)
	}

	err = WaitForPerfReplicationState(ctx, pri, consts.ReplicationPerformancePrimary)
	if err != nil {
		return fmt.Errorf("error waiting for perf primary to have the correct state: %w", err)
	}
	return WaitForActiveNodeAndPerfStandbys(ctx, pri)
}

func WaitForPerfReplicationState(ctx context.Context, cluster VaultCluster, state consts.ReplicationState) error {
	client := cluster.Nodes()[0].APIClient()
	var health *api.HealthResponse
	var err error
	for ctx.Err() == nil {
		health, err = client.Sys().HealthWithContext(ctx)
		if err == nil && health.ReplicationPerformanceMode == state.GetPerformanceString() {
			return nil
		}
		time.Sleep(500 * time.Millisecond)
	}
	if err == nil {
		err = ctx.Err()
	}
	return err
}

func EnablePerformanceSecondaryNoWait(ctx context.Context, perfToken string, pri, sec VaultCluster, updatePrimary bool) error {
	postData := map[string]interface{}{
		"token":   perfToken,
		"ca_file": pri.GetCACertPEMFile(),
	}
	path := "sys/replication/performance/secondary/enable"
	if updatePrimary {
		path = "sys/replication/performance/secondary/update-primary"
	}
	err := WaitForActiveNodeAndPerfStandbys(ctx, sec)
	if err != nil {
		return err
	}
	_, err = sec.Nodes()[0].APIClient().Logical().Write(path, postData)
	if err != nil {
		return err
	}

	return WaitForPerfReplicationState(ctx, sec, consts.ReplicationPerformanceSecondary)
}

func EnablePerformanceSecondary(ctx context.Context, perfToken string, pri, sec VaultCluster, updatePrimary, skipPoisonPill bool) (string, error) {
	if err := EnablePerformanceSecondaryNoWait(ctx, perfToken, pri, sec, updatePrimary); err != nil {
		return "", err
	}
	if err := WaitForMatchingMerkleRoots(ctx, "sys/replication/performance/", pri, sec); err != nil {
		return "", err
	}
	root, err := WaitForPerformanceSecondary(ctx, pri, sec, skipPoisonPill)
	if err != nil {
		return "", err
	}
	if err := WaitForPerfReplicationWorking(ctx, pri, sec); err != nil {
		return "", err
	}
	return root, nil
}

func WaitForMatchingMerkleRoots(ctx context.Context, endpoint string, pri, sec VaultCluster) error {
	return WaitForMatchingMerkleRootsClients(ctx, endpoint, pri.Nodes()[0].APIClient(), sec.Nodes()[0].APIClient())
}

func WaitForMatchingMerkleRootsClients(ctx context.Context, endpoint string, pri, sec *api.Client) error {
	getRoot := func(mode string, cli *api.Client) (string, error) {
		status, err := cli.Logical().Read(endpoint + "status")
		if err != nil {
			return "", err
		}
		if status == nil || status.Data == nil || status.Data["mode"] == nil {
			return "", fmt.Errorf("got nil secret or data")
		}
		if status.Data["mode"].(string) != mode {
			return "", fmt.Errorf("expected mode=%s, got %s", mode, status.Data["mode"].(string))
		}
		return status.Data["merkle_root"].(string), nil
	}

	var priRoot, secRoot string
	var err error
	genRet := func() error {
		return fmt.Errorf("unequal merkle roots, pri=%s sec=%s, err=%w", priRoot, secRoot, err)
	}
	for ctx.Err() == nil {
		secRoot, err = getRoot("secondary", sec)
		if err != nil {
			return genRet()
		}
		priRoot, err = getRoot("primary", pri)
		if err != nil {
			return genRet()
		}

		if reflect.DeepEqual(priRoot, secRoot) {
			return nil
		}
		time.Sleep(time.Second)
	}

	return fmt.Errorf("roots did not become equal")
}

func WaitForPerformanceWAL(ctx context.Context, pri, sec VaultCluster) error {
	endpoint := "sys/replication/performance/"
	if err := WaitForMatchingMerkleRoots(ctx, endpoint, pri, sec); err != nil {
		return err
	}
	getWAL := func(mode, walKey string, cli *api.Client) (int64, error) {
		status, err := cli.Logical().Read(endpoint + "status")
		if err != nil {
			return 0, err
		}
		if status == nil || status.Data == nil || status.Data["mode"] == nil {
			return 0, fmt.Errorf("got nil secret or data")
		}
		if status.Data["mode"].(string) != mode {
			return 0, fmt.Errorf("expected mode=%s, got %s", mode, status.Data["mode"].(string))
		}
		return status.Data[walKey].(json.Number).Int64()
	}

	secClient := sec.Nodes()[0].APIClient()
	priClient := pri.Nodes()[0].APIClient()
	for ctx.Err() == nil {
		secLastRemoteWAL, err := getWAL("secondary", "last_remote_wal", secClient)
		if err != nil {
			return err
		}
		priLastPerfWAL, err := getWAL("primary", "last_performance_wal", priClient)
		if err != nil {
			return err
		}

		if secLastRemoteWAL >= priLastPerfWAL {
			return nil
		}
		time.Sleep(time.Second)
	}

	return fmt.Errorf("performance WALs on the secondary did not catch up with the primary, context err: %w", ctx.Err())
}

func WaitForPerformanceSecondary(ctx context.Context, pri, sec VaultCluster, skipPoisonPill bool) (string, error) {
	if len(pri.GetRecoveryKeys()) > 0 {
		sec.SetBarrierKeys(pri.GetRecoveryKeys())
		sec.SetRecoveryKeys(pri.GetRecoveryKeys())
	} else {
		sec.SetBarrierKeys(pri.GetBarrierKeys())
		sec.SetRecoveryKeys(pri.GetBarrierKeys())
	}

	if len(sec.Nodes()) > 1 {
		if skipPoisonPill {
			// As part of prepareSecondary on the active node the keyring is
			// deleted from storage.  Its absence can cause standbys to seal
			// themselves. But it's not reliable, so we'll seal them
			// ourselves to force the issue.
			for i := range sec.Nodes()[1:] {
				if err := SealNode(ctx, sec, i+1); err != nil {
					return "", err
				}
			}
		} else {
			// We want to make sure we unseal all the nodes, so we first wait
			// until all but one of the nodes seal due to the poison pill being written.
			if err := WaitForNCoresSealed(ctx, sec, len(sec.Nodes())-1); err != nil {
				return "", err
			}
		}
	}
	if _, err := WaitForActiveNode(ctx, sec); err != nil {
		return "", err
	}
	if err := UnsealAllNodes(ctx, sec); err != nil {
		return "", err
	}

	perfSecondaryRootToken, err := GenerateRoot(sec, GenerateRootRegular)
	if err != nil {
		return "", err
	}
	sec.SetRootToken(perfSecondaryRootToken)
	if err := WaitForActiveNodeAndPerfStandbys(ctx, sec); err != nil {
		return "", err
	}

	return perfSecondaryRootToken, nil
}

func WaitForPerfReplicationWorking(ctx context.Context, pri, sec VaultCluster) error {
	priActiveIdx, err := WaitForActiveNode(ctx, pri)
	if err != nil {
		return err
	}
	secActiveIdx, err := WaitForActiveNode(ctx, sec)
	if err != nil {
		return err
	}

	priClient, secClient := pri.Nodes()[priActiveIdx].APIClient(), sec.Nodes()[secActiveIdx].APIClient()
	mountPoint, err := uuid.GenerateUUID()
	if err != nil {
		return err
	}
	err = priClient.Sys().Mount(mountPoint, &api.MountInput{
		Type:  "kv",
		Local: false,
	})
	if err != nil {
		return fmt.Errorf("unable to mount KV engine on primary: %w", err)
	}

	path := mountPoint + "/foo"
	_, err = priClient.Logical().Write(path, map[string]interface{}{
		"bar": 1,
	})
	if err != nil {
		return fmt.Errorf("unable to write KV on primary, path=%s: %w", path, err)
	}

	for ctx.Err() == nil {
		var secret *api.Secret
		secret, err = secClient.Logical().Read(path)
		if err == nil && secret != nil {
			err = priClient.Sys().Unmount(mountPoint)
			if err != nil {
				return fmt.Errorf("unable to unmount KV engine on primary: %w", err)
			}
			return nil
		}
		time.Sleep(100 * time.Millisecond)
	}
	if err == nil {
		err = ctx.Err()
	}
	return fmt.Errorf("unable to read replicated KV on secondary, path=%s, err=%v", path, err)
}

func SetupTwoClusterPerfReplication(ctx context.Context, pri, sec VaultCluster) error {
	if err := EnablePerfPrimary(ctx, pri); err != nil {
		return fmt.Errorf("failed to enable perf primary: %w", err)
	}
	perfToken, err := GetPerformanceToken(pri, sec.ClusterID(), "")
	if err != nil {
		return fmt.Errorf("failed to get performance token from perf primary: %w", err)
	}

	_, err = EnablePerformanceSecondary(ctx, perfToken, pri, sec, false, false)
	if err != nil {
		return fmt.Errorf("failed to enable perf secondary: %w", err)
	}
	return nil
}
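
// demoTwoClusterPerfReplication is an illustrative sketch, not used by the
// helpers themselves: it shows how a test might call
// SetupTwoClusterPerfReplication against two already-built clusters with a
// bounded timeout. The five-minute budget is an assumption, not a
// recommendation.
func demoTwoClusterPerfReplication(pri, sec VaultCluster) error {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()
	return SetupTwoClusterPerfReplication(ctx, pri, sec)
}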

// PassiveWaitForActiveNodeAndPerfStandbys should be used instead of
// WaitForActiveNodeAndPerfStandbys when you don't want to do any writes
// as a side-effect. It returns the active node, the perf standby nodes in
// the cluster, and an error.
func PassiveWaitForActiveNodeAndPerfStandbys(ctx context.Context, pri VaultCluster) (VaultClusterNode, []VaultClusterNode, error) {
	leaderNode, standbys, err := GetActiveAndStandbys(ctx, pri)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to derive standby nodes, %w", err)
	}

	for i, node := range standbys {
		client := node.APIClient()
		// Make sure we get perf standby nodes
		if err = EnsureCoreIsPerfStandby(ctx, client); err != nil {
			return nil, nil, fmt.Errorf("standby node %d is not a perfStandby, %w", i, err)
		}
	}

	return leaderNode, standbys, nil
}

func GetActiveAndStandbys(ctx context.Context, cluster VaultCluster) (VaultClusterNode, []VaultClusterNode, error) {
	var leaderIndex int
	var err error
	if leaderIndex, err = WaitForActiveNode(ctx, cluster); err != nil {
		return nil, nil, err
	}

	var leaderNode VaultClusterNode
	var nodes []VaultClusterNode
	for i, node := range cluster.Nodes() {
		if i == leaderIndex {
			leaderNode = node
			continue
		}
		nodes = append(nodes, node)
	}

	return leaderNode, nodes, nil
}

func EnsureCoreIsPerfStandby(ctx context.Context, client *api.Client) error {
	var err error
	var health *api.HealthResponse
	for ctx.Err() == nil {
		health, err = client.Sys().HealthWithContext(ctx)
		if err == nil && health.PerformanceStandby {
			return nil
		}
		time.Sleep(time.Millisecond * 500)
	}
	if err == nil {
		err = ctx.Err()
	}
	return err
}

func WaitForDRReplicationState(ctx context.Context, cluster VaultCluster, state consts.ReplicationState) error {
	client := cluster.Nodes()[0].APIClient()
	var health *api.HealthResponse
	var err error
	for ctx.Err() == nil {
		health, err = client.Sys().HealthWithContext(ctx)
		if err == nil && health.ReplicationDRMode == state.GetDRString() {
			return nil
		}
		time.Sleep(500 * time.Millisecond)
	}
	if err == nil {
		err = ctx.Err()
	}
	return err
}

func EnableDrPrimary(ctx context.Context, pri VaultCluster) error {
	client := pri.Nodes()[0].APIClient()
	_, err := client.Logical().Write("sys/replication/dr/primary/enable", nil)
	if err != nil {
		return err
	}

	err = WaitForDRReplicationState(ctx, pri, consts.ReplicationDRPrimary)
	if err != nil {
		return err
	}
	return WaitForActiveNodeAndPerfStandbys(ctx, pri)
}

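// GenerateDRActivationToken requests a DR secondary activation token from the
// primary cluster for the secondary identified by id. As with
// GetPerformanceToken, the token is response-wrapped unless secondaryPublicKey
// is provided, in which case the plaintext token is returned.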
func GenerateDRActivationToken(pri VaultCluster, id, secondaryPublicKey string) (string, error) {
	client := pri.Nodes()[0].APIClient()
	req := map[string]interface{}{
		"id": id,
	}
	if secondaryPublicKey != "" {
		req["secondary_public_key"] = secondaryPublicKey
	}
	secret, err := client.Logical().Write("sys/replication/dr/primary/secondary-token", req)
	if err != nil {
		return "", err
	}

	if secondaryPublicKey != "" {
		return secret.Data["token"].(string), nil
	}
	return secret.WrapInfo.Token, nil
}

func WaitForDRSecondary(ctx context.Context, pri, sec VaultCluster, skipPoisonPill bool) error {
	if len(pri.GetRecoveryKeys()) > 0 {
		sec.SetBarrierKeys(pri.GetRecoveryKeys())
		sec.SetRecoveryKeys(pri.GetRecoveryKeys())
	} else {
		sec.SetBarrierKeys(pri.GetBarrierKeys())
		sec.SetRecoveryKeys(pri.GetBarrierKeys())
	}

	if len(sec.Nodes()) > 1 {
		if skipPoisonPill {
			// As part of prepareSecondary on the active node the keyring is
			// deleted from storage.  Its absence can cause standbys to seal
			// themselves. But it's not reliable, so we'll seal them
			// ourselves to force the issue.
			for i := range sec.Nodes()[1:] {
				if err := SealNode(ctx, sec, i+1); err != nil {
					return err
				}
			}
		} else {
			// We want to make sure we unseal all the nodes, so we first wait
			// until all but one of the nodes seal due to the poison pill being written.
			if err := WaitForNCoresSealed(ctx, sec, len(sec.Nodes())-1); err != nil {
				return err
			}
		}
	}
	if _, err := WaitForActiveNode(ctx, sec); err != nil {
		return err
	}

	// unseal nodes
	for i := range sec.Nodes() {
		if err := UnsealNode(ctx, sec, i); err != nil {
			// Sometimes the node has already unsealed on its own by the time
			// we get here, in which case unsealing fails on DR secondaries
			// with "path disabled in replication DR secondary mode"; in that
			// case, just make sure the node is healthy.
			if healthErr := NodeHealthy(ctx, sec, i); healthErr != nil {
				// return the original error
				return err
			}
		}
	}

	sec.SetRootToken(pri.GetRootToken())

	if _, err := WaitForActiveNode(ctx, sec); err != nil {
		return err
	}

	return nil
}

func EnableDRSecondaryNoWait(ctx context.Context, sec VaultCluster, drToken string) error {
	postData := map[string]interface{}{
		"token":   drToken,
		"ca_file": sec.GetCACertPEMFile(),
	}

	_, err := sec.Nodes()[0].APIClient().Logical().Write("sys/replication/dr/secondary/enable", postData)
	if err != nil {
		return err
	}

	return WaitForDRReplicationState(ctx, sec, consts.ReplicationDRSecondary)
}

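// WaitForReplicationStatus polls the performance (or, if dr is true, the DR)
// replication status endpoint until the accept callback returns nil or the
// context expires.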
func WaitForReplicationStatus(ctx context.Context, client *api.Client, dr bool, accept func(map[string]interface{}) error) error {
	url := "sys/replication/performance/status"
	if dr {
		url = "sys/replication/dr/status"
	}

	var err error
	var secret *api.Secret
	for ctx.Err() == nil {
		secret, err = client.Logical().Read(url)
		if err == nil && secret != nil && secret.Data != nil {
			if err = accept(secret.Data); err == nil {
				return nil
			}
		}
		time.Sleep(500 * time.Millisecond)
	}
	if err == nil {
		err = ctx.Err()
	}

	return fmt.Errorf("unable to get acceptable replication status: error=%v secret=%#v", err, secret)
}
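
// waitForStreamWALs is an illustrative sketch of an accept callback for
// WaitForReplicationStatus; it is not referenced by the other helpers. It
// waits until the cluster reports the stream-wals replication state.
func waitForStreamWALs(ctx context.Context, client *api.Client, dr bool) error {
	return WaitForReplicationStatus(ctx, client, dr, func(data map[string]interface{}) error {
		if state, ok := data["state"].(string); ok && state == "stream-wals" {
			return nil
		}
		return fmt.Errorf("expected stream-wals replication state, got %v", data["state"])
	})
}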

func WaitForDRReplicationWorking(ctx context.Context, pri, sec VaultCluster) error {
	priClient := pri.Nodes()[0].APIClient()
	secClient := sec.Nodes()[0].APIClient()

	// Make sure we've entered stream-wals mode
	err := WaitForReplicationStatus(ctx, secClient, true, func(secret map[string]interface{}) error {
		state := secret["state"]
		if state == "stream-wals" {
			return nil
		}
		return fmt.Errorf("expected stream-wals replication state, got %v", state)
	})
	if err != nil {
		return err
	}

	// Now write some data and make sure that we see last_remote_wal nonzero, i.e.
	// at least one WAL has been streamed.
	secret, err := priClient.Auth().Token().Create(&api.TokenCreateRequest{})
	if err != nil {
		return err
	}

	// Revoke the token since some tests won't be happy to see it.
	err = priClient.Auth().Token().RevokeTree(secret.Auth.ClientToken)
	if err != nil {
		return err
	}

	err = WaitForReplicationStatus(ctx, secClient, true, func(secret map[string]interface{}) error {
		state := secret["state"]
		if state != "stream-wals" {
			return fmt.Errorf("expected stream-wals replication state, got %v", state)
		}

		if secret["last_remote_wal"] != nil {
			lastRemoteWal, _ := secret["last_remote_wal"].(json.Number).Int64()
			if lastRemoteWal <= 0 {
				return fmt.Errorf("expected last_remote_wal to be greater than zero")
			}
			return nil
		}

		return fmt.Errorf("replication still appears to be catching up; last_remote_wal not reported yet")
	})
	if err != nil {
		return err
	}
	return nil
}

func EnableDrSecondary(ctx context.Context, pri, sec VaultCluster, drToken string) error {
	err := EnableDRSecondaryNoWait(ctx, sec, drToken)
	if err != nil {
		return err
	}

	if err = WaitForMatchingMerkleRoots(ctx, "sys/replication/dr/", pri, sec); err != nil {
		return err
	}

	err = WaitForDRSecondary(ctx, pri, sec, false)
	if err != nil {
		return err
	}

	if err = WaitForDRReplicationWorking(ctx, pri, sec); err != nil {
		return err
	}
	return nil
}

func SetupTwoClusterDRReplication(ctx context.Context, pri, sec VaultCluster) error {
	if err := EnableDrPrimary(ctx, pri); err != nil {
		return err
	}

	drToken, err := GenerateDRActivationToken(pri, sec.ClusterID(), "")
	if err != nil {
		return err
	}
	err = EnableDrSecondary(ctx, pri, sec, drToken)
	if err != nil {
		return err
	}
	return nil
}
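
// demoDRSetupAndPromote is an illustrative sketch, not used by the helpers
// themselves: it establishes DR replication between two already-built clusters
// and then exercises the failover path by promoting the secondary.
func demoDRSetupAndPromote(ctx context.Context, pri, sec VaultCluster) error {
	if err := SetupTwoClusterDRReplication(ctx, pri, sec); err != nil {
		return err
	}
	return PromoteDRSecondary(ctx, sec)
}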

func DemoteDRPrimary(client *api.Client) error {
	_, err := client.Logical().Write("sys/replication/dr/primary/demote", map[string]interface{}{})
	return err
}

func createBatchToken(client *api.Client, path string) (string, error) {
	// TODO: should these be more random in case more than one batch token needs to be created?
	suffix := strings.Replace(path, "/", "", -1)
	policyName := "path-batch-policy-" + suffix
	roleName := "path-batch-role-" + suffix

	rules := fmt.Sprintf(`path "%s" { capabilities = [ "read", "update" ] }`, path)

	// create policy
	_, err := client.Logical().Write("sys/policy/"+policyName, map[string]interface{}{
		"policy": rules,
	})
	if err != nil {
		return "", err
	}

	// create a role
	_, err = client.Logical().Write("auth/token/roles/"+roleName, map[string]interface{}{
		"allowed_policies": policyName,
		"orphan":           true,
		"renewable":        false,
		"token_type":       "batch",
	})
	if err != nil {
		return "", err
	}

	// create batch token
	secret, err := client.Logical().Write("auth/token/create/"+roleName, nil)
	if err != nil {
		return "", err
	}

	return secret.Auth.ClientToken, nil
}

// PromoteDRSecondaryWithBatchToken creates a batch token for DR promotion and
// then demotes the primary cluster before promoting the secondary. The primary
// cluster needs to be functional in order to generate the batch token.
func PromoteDRSecondaryWithBatchToken(ctx context.Context, pri, sec VaultCluster) error {
	client := pri.Nodes()[0].APIClient()
	drToken, err := createBatchToken(client, "sys/replication/dr/secondary/promote")
	if err != nil {
		return err
	}

	err = DemoteDRPrimary(client)
	if err != nil {
		return err
	}

	return promoteDRSecondaryInternal(ctx, sec, drToken)
}

// PromoteDRSecondary generates a DR operation token on the secondary using
// unseal/recovery keys, so promotion works even if the primary cluster is
// out of service.
func PromoteDRSecondary(ctx context.Context, sec VaultCluster) error {
	// generate a DR operation token on the secondary using unseal/recovery
	// keys so that it can be promoted
	drToken, err := GenerateRoot(sec, GenerateRootDR)
	if err != nil {
		return err
	}
	return promoteDRSecondaryInternal(ctx, sec, drToken)
}

func promoteDRSecondaryInternal(ctx context.Context, sec VaultCluster, drToken string) error {
	secClient := sec.Nodes()[0].APIClient()

	// Allow retries of 503s, e.g.: replication is still catching up,
	// try again later or provide the "force" argument
	oldMaxRetries := secClient.MaxRetries()
	secClient.SetMaxRetries(10)
	defer secClient.SetMaxRetries(oldMaxRetries)
	resp, err := secClient.Logical().Write("sys/replication/dr/secondary/promote", map[string]interface{}{
		"dr_operation_token": drToken,
	})
	if err != nil {
		return err
	}
	if resp == nil {
		return fmt.Errorf("nil status response during DR promotion")
	}

	if _, err := WaitForActiveNode(ctx, sec); err != nil {
		return err
	}

	return WaitForDRReplicationState(ctx, sec, consts.ReplicationDRPrimary)
}

func checkClusterAddr(ctx context.Context, pri, sec VaultCluster) error {
	priClient := pri.Nodes()[0].APIClient()
	priLeader, err := priClient.Sys().LeaderWithContext(ctx)
	if err != nil {
		return err
	}
	secClient := sec.Nodes()[0].APIClient()
	endpoint := "sys/replication/dr/"
	status, err := secClient.Logical().Read(endpoint + "status")
	if err != nil {
		return err
	}
	if status == nil || status.Data == nil {
		return fmt.Errorf("got nil secret or data")
	}

	var priAddrs []string
	err = mapstructure.Decode(status.Data["known_primary_cluster_addrs"], &priAddrs)
	if err != nil {
		return err
	}
	if !strutil.StrListContains(priAddrs, priLeader.LeaderClusterAddress) {
		return fmt.Errorf("failed to find the expected primary cluster address %v in known_primary_cluster_addrs", priLeader.LeaderClusterAddress)
	}

	return nil
}

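// UpdatePrimary points the DR secondary sec at pri: it generates a DR
// operation token on sec and a secondary activation token on pri, calls
// update-primary on sec, and then waits for sec to settle as a DR secondary
// that knows about pri's cluster address.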
func UpdatePrimary(ctx context.Context, pri, sec VaultCluster) error {
	// generate DR operation token to do update primary on vC to point to
	// the new perfSec primary vD
	rootToken, err := GenerateRoot(sec, GenerateRootDR)
	if err != nil {
		return err
	}

	// secondary activation token
	drToken, err := GenerateDRActivationToken(pri, sec.ClusterID(), "")
	if err != nil {
		return err
	}

	// update-primary on vC (new perfSec Dr secondary) to point to
	// the new perfSec Dr primary
	secClient := sec.Nodes()[0].APIClient()
	resp, err := secClient.Logical().Write("sys/replication/dr/secondary/update-primary", map[string]interface{}{
		"dr_operation_token": rootToken,
		"token":              drToken,
		"ca_file":            sec.GetCACertPEMFile(),
	})
	if err != nil {
		return err
	}
	if resp == nil {
		return fmt.Errorf("nil status response during update primary")
	}

	if _, err = WaitForActiveNode(ctx, sec); err != nil {
		return err
	}

	if err = WaitForDRReplicationState(ctx, sec, consts.ReplicationDRSecondary); err != nil {
		return err
	}

	if err = checkClusterAddr(ctx, pri, sec); err != nil {
		return err
	}

	return nil
}

func SetupFourClusterReplication(ctx context.Context, pri, sec, pridr, secdr VaultCluster) error {
	err := SetupTwoClusterPerfReplication(ctx, pri, sec)
	if err != nil {
		return err
	}
	err = SetupTwoClusterDRReplication(ctx, pri, pridr)
	if err != nil {
		return err
	}
	err = SetupTwoClusterDRReplication(ctx, sec, secdr)
	if err != nil {
		return err
	}
	return nil
}

type ReplicationSet struct {
	// By convention, we recommend the following naming scheme for
	// clusters in this map:
	// A: perf primary
	// B: primary's DR
	// C: first perf secondary of A
	// D: C's DR
	// E: second perf secondary of A
	// F: E's DR
	// ... etc.
	//
	// We use generic names rather than role-specific names because
	// that's less confusing when promotions take place that result in role
	// changes. In other words, if D gets promoted to replace C as a perf
	// secondary, and C gets demoted and updated to become D's DR secondary,
	// they should maintain their initial names of D and C throughout.
	Clusters map[string]VaultCluster
	Builder  ClusterBuilder
	Logger   hclog.Logger
	CA       *CA
}

type ClusterBuilder func(ctx context.Context, name string, logger hclog.Logger) (VaultCluster, error)

func NewReplicationSet(b ClusterBuilder) (*ReplicationSet, error) {
	return &ReplicationSet{
		Clusters: map[string]VaultCluster{},
		Builder:  b,
		Logger:   hclog.NewNullLogger(),
	}, nil
}
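
// demoReplicationSet is an illustrative sketch, not part of the published
// helpers: it shows how a ClusterBuilder is wired into a ReplicationSet, the
// standard A->C performance topology established, and the clusters cleaned up.
func demoReplicationSet(ctx context.Context, b ClusterBuilder, logger hclog.Logger) error {
	r, err := NewReplicationSet(b)
	if err != nil {
		return err
	}
	defer r.Cleanup()
	r.Logger = logger
	return r.StandardPerfReplication(ctx)
}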

func (r *ReplicationSet) StandardPerfReplication(ctx context.Context) error {
	for _, name := range []string{"A", "C"} {
		if _, ok := r.Clusters[name]; !ok {
			cluster, err := r.Builder(ctx, name, r.Logger)
			if err != nil {
				return err
			}
			r.Clusters[name] = cluster
		}
	}

	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()
	err := SetupTwoClusterPerfReplication(ctx, r.Clusters["A"], r.Clusters["C"])
	if err != nil {
		return err
	}

	return nil
}

func (r *ReplicationSet) StandardDRReplication(ctx context.Context) error {
	for _, name := range []string{"A", "B"} {
		if _, ok := r.Clusters[name]; !ok {
			cluster, err := r.Builder(ctx, name, r.Logger)
			if err != nil {
				return err
			}
			r.Clusters[name] = cluster
		}
	}

	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()
	err := SetupTwoClusterDRReplication(ctx, r.Clusters["A"], r.Clusters["B"])
	if err != nil {
		return err
	}

	return nil
}

func (r *ReplicationSet) GetFourReplicationCluster(ctx context.Context) error {
	for _, name := range []string{"A", "B", "C", "D"} {
		if _, ok := r.Clusters[name]; !ok {
			cluster, err := r.Builder(ctx, name, r.Logger)
			if err != nil {
				return err
			}
			r.Clusters[name] = cluster
		}
	}

	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()
	err := SetupFourClusterReplication(ctx, r.Clusters["A"], r.Clusters["C"], r.Clusters["B"], r.Clusters["D"])
	if err != nil {
		return err
	}
	return nil
}

func (r *ReplicationSet) Cleanup() {
	for _, cluster := range r.Clusters {
		cluster.Cleanup()
	}
}

func WaitForPerfReplicationConnectionStatus(ctx context.Context, client *api.Client) error {
	type Primary struct {
		APIAddress       string `mapstructure:"api_address"`
		ConnectionStatus string `mapstructure:"connection_status"`
		ClusterAddress   string `mapstructure:"cluster_address"`
		LastHeartbeat    string `mapstructure:"last_heartbeat"`
	}
	type Status struct {
		Primaries []Primary `mapstructure:"primaries"`
	}
	return WaitForPerfReplicationStatus(ctx, client, func(m map[string]interface{}) error {
		var status Status
		err := mapstructure.Decode(m, &status)
		if err != nil {
			return err
		}
		if len(status.Primaries) == 0 {
			return fmt.Errorf("no primaries found in replication status")
		}
		for _, v := range status.Primaries {
			if v.ConnectionStatus == "connected" {
				return nil
			}
		}
		return fmt.Errorf("no primaries connected")
	})
}

func WaitForPerfReplicationStatus(ctx context.Context, client *api.Client, accept func(map[string]interface{}) error) error {
	var err error
	var secret *api.Secret
	for ctx.Err() == nil {
		secret, err = client.Logical().Read("sys/replication/performance/status")
		if err == nil && secret != nil && secret.Data != nil {
			if err = accept(secret.Data); err == nil {
				return nil
			}
		}
		time.Sleep(500 * time.Millisecond)
	}
	return fmt.Errorf("unable to get acceptable replication status within allotted time: error=%v secret=%#v", err, secret)
}