github.com/hashicorp/vault/sdk@v0.11.0/helper/testcluster/replication.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package testcluster
     5  
     6  import (
     7  	"context"
     8  	"encoding/json"
     9  	"fmt"
    10  	"reflect"
    11  	"strings"
    12  	"time"
    13  
    14  	"github.com/hashicorp/go-hclog"
    15  	"github.com/hashicorp/go-secure-stdlib/strutil"
    16  	"github.com/hashicorp/go-uuid"
    17  	"github.com/hashicorp/vault/api"
    18  	"github.com/hashicorp/vault/sdk/helper/consts"
    19  	"github.com/mitchellh/mapstructure"
    20  )
    21  
    22  func GetPerformanceToken(pri VaultCluster, id, secondaryPublicKey string) (string, error) {
    23  	client := pri.Nodes()[0].APIClient()
    24  	req := map[string]interface{}{
    25  		"id": id,
    26  	}
    27  	if secondaryPublicKey != "" {
    28  		req["secondary_public_key"] = secondaryPublicKey
    29  	}
    30  	secret, err := client.Logical().Write("sys/replication/performance/primary/secondary-token", req)
    31  	if err != nil {
    32  		return "", err
    33  	}
    34  
    35  	if secondaryPublicKey != "" {
    36  		return secret.Data["token"].(string), nil
    37  	}
    38  	return secret.WrapInfo.Token, nil
    39  }
    40  
    41  func EnablePerfPrimary(ctx context.Context, pri VaultCluster) error {
    42  	client := pri.Nodes()[0].APIClient()
    43  	_, err := client.Logical().WriteWithContext(ctx, "sys/replication/performance/primary/enable", nil)
    44  	if err != nil {
    45  		return err
    46  	}
    47  
    48  	err = WaitForPerfReplicationState(ctx, pri, consts.ReplicationPerformancePrimary)
    49  	if err != nil {
    50  		return err
    51  	}
    52  	return WaitForActiveNodeAndPerfStandbys(ctx, pri)
    53  }
    54  
    55  func WaitForPerfReplicationState(ctx context.Context, cluster VaultCluster, state consts.ReplicationState) error {
    56  	client := cluster.Nodes()[0].APIClient()
    57  	var health *api.HealthResponse
    58  	var err error
    59  	for ctx.Err() == nil {
    60  		health, err = client.Sys().HealthWithContext(ctx)
    61  		if err == nil && health.ReplicationPerformanceMode == state.GetPerformanceString() {
    62  			return nil
    63  		}
    64  		time.Sleep(500 * time.Millisecond)
    65  	}
    66  	if err == nil {
    67  		err = ctx.Err()
    68  	}
    69  	return err
    70  }
    71  
    72  func EnablePerformanceSecondaryNoWait(ctx context.Context, perfToken string, pri, sec VaultCluster, updatePrimary bool) error {
    73  	postData := map[string]interface{}{
    74  		"token":   perfToken,
    75  		"ca_file": DefaultCAFile,
    76  	}
    77  	path := "sys/replication/performance/secondary/enable"
    78  	if updatePrimary {
    79  		path = "sys/replication/performance/secondary/update-primary"
    80  	}
    81  	err := WaitForActiveNodeAndPerfStandbys(ctx, sec)
    82  	if err != nil {
    83  		return err
    84  	}
    85  	_, err = sec.Nodes()[0].APIClient().Logical().Write(path, postData)
    86  	if err != nil {
    87  		return err
    88  	}
    89  
    90  	return WaitForPerfReplicationState(ctx, sec, consts.ReplicationPerformanceSecondary)
    91  }
    92  
    93  func EnablePerformanceSecondary(ctx context.Context, perfToken string, pri, sec VaultCluster, updatePrimary, skipPoisonPill bool) (string, error) {
    94  	if err := EnablePerformanceSecondaryNoWait(ctx, perfToken, pri, sec, updatePrimary); err != nil {
    95  		return "", err
    96  	}
    97  	if err := WaitForMatchingMerkleRoots(ctx, "sys/replication/performance/", pri, sec); err != nil {
    98  		return "", err
    99  	}
   100  	root, err := WaitForPerformanceSecondary(ctx, pri, sec, skipPoisonPill)
   101  	if err != nil {
   102  		return "", err
   103  	}
   104  	if err := WaitForPerfReplicationWorking(ctx, pri, sec); err != nil {
   105  		return "", err
   106  	}
   107  	return root, nil
   108  }
   109  
   110  func WaitForMatchingMerkleRoots(ctx context.Context, endpoint string, pri, sec VaultCluster) error {
   111  	getRoot := func(mode string, cli *api.Client) (string, error) {
   112  		status, err := cli.Logical().Read(endpoint + "status")
   113  		if err != nil {
   114  			return "", err
   115  		}
   116  		if status == nil || status.Data == nil || status.Data["mode"] == nil {
   117  			return "", fmt.Errorf("got nil secret or data")
   118  		}
   119  		if status.Data["mode"].(string) != mode {
   120  			return "", fmt.Errorf("expected mode=%s, got %s", mode, status.Data["mode"].(string))
   121  		}
   122  		return status.Data["merkle_root"].(string), nil
   123  	}
   124  
   125  	secClient := sec.Nodes()[0].APIClient()
   126  	priClient := pri.Nodes()[0].APIClient()
   127  	for i := 0; i < 30; i++ {
   128  		secRoot, err := getRoot("secondary", secClient)
   129  		if err != nil {
   130  			return err
   131  		}
   132  		priRoot, err := getRoot("primary", priClient)
   133  		if err != nil {
   134  			return err
   135  		}
   136  
   137  		if reflect.DeepEqual(priRoot, secRoot) {
   138  			return nil
   139  		}
   140  		time.Sleep(time.Second)
   141  	}
   142  
   143  	return fmt.Errorf("roots did not become equal")
   144  }
   145  
   146  func WaitForPerformanceWAL(ctx context.Context, pri, sec VaultCluster) error {
   147  	endpoint := "sys/replication/performance/"
   148  	if err := WaitForMatchingMerkleRoots(ctx, endpoint, pri, sec); err != nil {
   149  		return nil
   150  	}
   151  	getWAL := func(mode, walKey string, cli *api.Client) (int64, error) {
   152  		status, err := cli.Logical().Read(endpoint + "status")
   153  		if err != nil {
   154  			return 0, err
   155  		}
   156  		if status == nil || status.Data == nil || status.Data["mode"] == nil {
   157  			return 0, fmt.Errorf("got nil secret or data")
   158  		}
   159  		if status.Data["mode"].(string) != mode {
   160  			return 0, fmt.Errorf("expected mode=%s, got %s", mode, status.Data["mode"].(string))
   161  		}
   162  		return status.Data[walKey].(json.Number).Int64()
   163  	}
   164  
   165  	secClient := sec.Nodes()[0].APIClient()
   166  	priClient := pri.Nodes()[0].APIClient()
   167  	for ctx.Err() == nil {
   168  		secLastRemoteWAL, err := getWAL("secondary", "last_remote_wal", secClient)
   169  		if err != nil {
   170  			return err
   171  		}
   172  		priLastPerfWAL, err := getWAL("primary", "last_performance_wal", priClient)
   173  		if err != nil {
   174  			return err
   175  		}
   176  
   177  		if secLastRemoteWAL >= priLastPerfWAL {
   178  			return nil
   179  		}
   180  		time.Sleep(time.Second)
   181  	}
   182  
   183  	return fmt.Errorf("performance WALs on the secondary did not catch up with the primary, context err: %w", ctx.Err())
   184  }
   185  
// WaitForPerformanceSecondary finishes bootstrapping a freshly activated
// performance secondary: it copies the primary's unseal material onto sec,
// rides out (or forces) the post-activation sealing of standby nodes,
// unseals everything, generates a fresh root token for sec, and waits for
// the cluster to become fully healthy. It returns the secondary's new root
// token.
//
// skipPoisonPill controls how multi-node secondaries are handled: when
// true the standbys are sealed explicitly; when false we wait for them to
// seal on their own.
func WaitForPerformanceSecondary(ctx context.Context, pri, sec VaultCluster, skipPoisonPill bool) (string, error) {
	// After activation the secondary uses the primary's keys; recovery keys
	// are preferred when the primary has them.
	if len(pri.GetRecoveryKeys()) > 0 {
		sec.SetBarrierKeys(pri.GetRecoveryKeys())
		sec.SetRecoveryKeys(pri.GetRecoveryKeys())
	} else {
		sec.SetBarrierKeys(pri.GetBarrierKeys())
		sec.SetRecoveryKeys(pri.GetBarrierKeys())
	}

	if len(sec.Nodes()) > 1 {
		if skipPoisonPill {
			// As part of prepareSecondary on the active node the keyring is
			// deleted from storage.  Its absence can cause standbys to seal
			// themselves. But it's not reliable, so we'll seal them
			// ourselves to force the issue.
			for i := range sec.Nodes()[1:] {
				if err := SealNode(ctx, sec, i+1); err != nil {
					return "", err
				}
			}
		} else {
			// We want to make sure we unseal all the nodes so we first need to wait
			// until two of the nodes seal due to the poison pill being written
			if err := WaitForNCoresSealed(ctx, sec, len(sec.Nodes())-1); err != nil {
				return "", err
			}
		}
	}
	if _, err := WaitForActiveNode(ctx, sec); err != nil {
		return "", err
	}
	if err := UnsealAllNodes(ctx, sec); err != nil {
		return "", err
	}

	// Mint a new root token for the secondary via the regular generate-root
	// workflow and record it on the cluster object.
	perfSecondaryRootToken, err := GenerateRoot(sec, GenerateRootRegular)
	if err != nil {
		return "", err
	}
	sec.SetRootToken(perfSecondaryRootToken)
	if err := WaitForActiveNodeAndPerfStandbys(ctx, sec); err != nil {
		return "", err
	}

	return perfSecondaryRootToken, nil
}
   232  
// WaitForPerfReplicationWorking verifies performance replication
// end-to-end: it mounts a temporary non-local KV engine on the primary,
// writes a secret to it, and polls the secondary until that secret becomes
// readable there. On success the temporary mount is removed again; if the
// secret never appears before ctx is done, an error is returned.
func WaitForPerfReplicationWorking(ctx context.Context, pri, sec VaultCluster) error {
	priActiveIdx, err := WaitForActiveNode(ctx, pri)
	if err != nil {
		return err
	}
	secActiveIdx, err := WaitForActiveNode(ctx, sec)
	if err != nil {
		return err
	}

	priClient, secClient := pri.Nodes()[priActiveIdx].APIClient(), sec.Nodes()[secActiveIdx].APIClient()
	// Random mount point so repeated or concurrent checks don't collide.
	mountPoint, err := uuid.GenerateUUID()
	if err != nil {
		return err
	}
	err = priClient.Sys().Mount(mountPoint, &api.MountInput{
		Type:  "kv",
		Local: false, // non-local so the mount replicates to the secondary
	})
	if err != nil {
		return fmt.Errorf("unable to mount KV engine on primary")
	}

	path := mountPoint + "/foo"
	_, err = priClient.Logical().Write(path, map[string]interface{}{
		"bar": 1,
	})
	if err != nil {
		return fmt.Errorf("unable to write KV on primary, path=%s", path)
	}

	// Poll the secondary until the write has been replicated.
	for ctx.Err() == nil {
		var secret *api.Secret
		secret, err = secClient.Logical().Read(path)
		if err == nil && secret != nil {
			// Replicated read succeeded; clean up the temporary mount.
			err = priClient.Sys().Unmount(mountPoint)
			if err != nil {
				return fmt.Errorf("unable to unmount KV engine on primary")
			}
			return nil
		}
		time.Sleep(100 * time.Millisecond)
	}
	if err == nil {
		err = ctx.Err()
	}
	return fmt.Errorf("unable to read replicated KV on secondary, path=%s, err=%v", path, err)
}
   281  
   282  func SetupTwoClusterPerfReplication(ctx context.Context, pri, sec VaultCluster) error {
   283  	if err := EnablePerfPrimary(ctx, pri); err != nil {
   284  		return err
   285  	}
   286  	perfToken, err := GetPerformanceToken(pri, sec.ClusterID(), "")
   287  	if err != nil {
   288  		return err
   289  	}
   290  
   291  	_, err = EnablePerformanceSecondary(ctx, perfToken, pri, sec, false, false)
   292  	return err
   293  }
   294  
   295  // PassiveWaitForActiveNodeAndPerfStandbys should be used instead of
   296  // WaitForActiveNodeAndPerfStandbys when you don't want to do any writes
   297  // as a side-effect. This returns perfStandby nodes in the cluster and
   298  // an error.
   299  func PassiveWaitForActiveNodeAndPerfStandbys(ctx context.Context, pri VaultCluster) (VaultClusterNode, []VaultClusterNode, error) {
   300  	leaderNode, standbys, err := GetActiveAndStandbys(ctx, pri)
   301  	if err != nil {
   302  		return nil, nil, fmt.Errorf("failed to derive standby nodes, %w", err)
   303  	}
   304  
   305  	for i, node := range standbys {
   306  		client := node.APIClient()
   307  		// Make sure we get perf standby nodes
   308  		if err = EnsureCoreIsPerfStandby(ctx, client); err != nil {
   309  			return nil, nil, fmt.Errorf("standby node %d is not a perfStandby, %w", i, err)
   310  		}
   311  	}
   312  
   313  	return leaderNode, standbys, nil
   314  }
   315  
   316  func GetActiveAndStandbys(ctx context.Context, cluster VaultCluster) (VaultClusterNode, []VaultClusterNode, error) {
   317  	var leaderIndex int
   318  	var err error
   319  	if leaderIndex, err = WaitForActiveNode(ctx, cluster); err != nil {
   320  		return nil, nil, err
   321  	}
   322  
   323  	var leaderNode VaultClusterNode
   324  	var nodes []VaultClusterNode
   325  	for i, node := range cluster.Nodes() {
   326  		if i == leaderIndex {
   327  			leaderNode = node
   328  			continue
   329  		}
   330  		nodes = append(nodes, node)
   331  	}
   332  
   333  	return leaderNode, nodes, nil
   334  }
   335  
   336  func EnsureCoreIsPerfStandby(ctx context.Context, client *api.Client) error {
   337  	var err error
   338  	var health *api.HealthResponse
   339  	for ctx.Err() == nil {
   340  		health, err = client.Sys().HealthWithContext(ctx)
   341  		if err == nil && health.PerformanceStandby {
   342  			return nil
   343  		}
   344  		time.Sleep(time.Millisecond * 500)
   345  	}
   346  	if err == nil {
   347  		err = ctx.Err()
   348  	}
   349  	return err
   350  }
   351  
   352  func WaitForDRReplicationState(ctx context.Context, cluster VaultCluster, state consts.ReplicationState) error {
   353  	client := cluster.Nodes()[0].APIClient()
   354  	var health *api.HealthResponse
   355  	var err error
   356  	for ctx.Err() == nil {
   357  		health, err = client.Sys().HealthWithContext(ctx)
   358  		if err == nil && health.ReplicationDRMode == state.GetDRString() {
   359  			return nil
   360  		}
   361  		time.Sleep(500 * time.Millisecond)
   362  	}
   363  	if err == nil {
   364  		err = ctx.Err()
   365  	}
   366  	return err
   367  }
   368  
   369  func EnableDrPrimary(ctx context.Context, pri VaultCluster) error {
   370  	client := pri.Nodes()[0].APIClient()
   371  	_, err := client.Logical().Write("sys/replication/dr/primary/enable", nil)
   372  	if err != nil {
   373  		return err
   374  	}
   375  
   376  	err = WaitForDRReplicationState(ctx, pri, consts.ReplicationDRPrimary)
   377  	if err != nil {
   378  		return err
   379  	}
   380  	return WaitForActiveNodeAndPerfStandbys(ctx, pri)
   381  }
   382  
   383  func GenerateDRActivationToken(pri VaultCluster, id, secondaryPublicKey string) (string, error) {
   384  	client := pri.Nodes()[0].APIClient()
   385  	req := map[string]interface{}{
   386  		"id": id,
   387  	}
   388  	if secondaryPublicKey != "" {
   389  		req["secondary_public_key"] = secondaryPublicKey
   390  	}
   391  	secret, err := client.Logical().Write("sys/replication/dr/primary/secondary-token", req)
   392  	if err != nil {
   393  		return "", err
   394  	}
   395  
   396  	if secondaryPublicKey != "" {
   397  		return secret.Data["token"].(string), nil
   398  	}
   399  	return secret.WrapInfo.Token, nil
   400  }
   401  
// WaitForDRSecondary finishes bootstrapping a freshly activated DR
// secondary: it copies the primary's unseal material onto sec, handles the
// post-activation sealing of standby nodes (forced when skipPoisonPill is
// true, otherwise awaited), unseals all nodes, and installs the primary's
// root token on sec before waiting for an active node.
func WaitForDRSecondary(ctx context.Context, pri, sec VaultCluster, skipPoisonPill bool) error {
	// After activation the secondary uses the primary's keys; recovery keys
	// are preferred when the primary has them.
	if len(pri.GetRecoveryKeys()) > 0 {
		sec.SetBarrierKeys(pri.GetRecoveryKeys())
		sec.SetRecoveryKeys(pri.GetRecoveryKeys())
	} else {
		sec.SetBarrierKeys(pri.GetBarrierKeys())
		sec.SetRecoveryKeys(pri.GetBarrierKeys())
	}

	if len(sec.Nodes()) > 1 {
		if skipPoisonPill {
			// As part of prepareSecondary on the active node the keyring is
			// deleted from storage.  Its absence can cause standbys to seal
			// themselves. But it's not reliable, so we'll seal them
			// ourselves to force the issue.
			for i := range sec.Nodes()[1:] {
				if err := SealNode(ctx, sec, i+1); err != nil {
					return err
				}
			}
		} else {
			// We want to make sure we unseal all the nodes so we first need to wait
			// until two of the nodes seal due to the poison pill being written
			if err := WaitForNCoresSealed(ctx, sec, len(sec.Nodes())-1); err != nil {
				return err
			}
		}
	}
	if _, err := WaitForActiveNode(ctx, sec); err != nil {
		return err
	}

	// unseal nodes
	for i := range sec.Nodes() {
		if err := UnsealNode(ctx, sec, i); err != nil {
			// Sometimes when we get here it's already unsealed on its own
			// and then this fails for DR secondaries so check again
			// The error is "path disabled in replication DR secondary mode".
			if healthErr := NodeHealthy(ctx, sec, i); healthErr != nil {
				// return the original error
				return err
			}
		}
	}

	// The secondary reuses the primary's root token.
	sec.SetRootToken(pri.GetRootToken())

	if _, err := WaitForActiveNode(ctx, sec); err != nil {
		return err
	}

	return nil
}
   455  
   456  func EnableDRSecondaryNoWait(ctx context.Context, sec VaultCluster, drToken string) error {
   457  	postData := map[string]interface{}{
   458  		"token":   drToken,
   459  		"ca_file": DefaultCAFile,
   460  	}
   461  
   462  	_, err := sec.Nodes()[0].APIClient().Logical().Write("sys/replication/dr/secondary/enable", postData)
   463  	if err != nil {
   464  		return err
   465  	}
   466  
   467  	return WaitForDRReplicationState(ctx, sec, consts.ReplicationDRSecondary)
   468  }
   469  
   470  func WaitForReplicationStatus(ctx context.Context, client *api.Client, dr bool, accept func(map[string]interface{}) error) error {
   471  	url := "sys/replication/performance/status"
   472  	if dr {
   473  		url = "sys/replication/dr/status"
   474  	}
   475  
   476  	var err error
   477  	var secret *api.Secret
   478  	for ctx.Err() == nil {
   479  		secret, err = client.Logical().Read(url)
   480  		if err == nil && secret != nil && secret.Data != nil {
   481  			if err = accept(secret.Data); err == nil {
   482  				return nil
   483  			}
   484  		}
   485  		time.Sleep(500 * time.Millisecond)
   486  	}
   487  	if err == nil {
   488  		err = ctx.Err()
   489  	}
   490  
   491  	return fmt.Errorf("unable to get acceptable replication status: error=%v secret=%#v", err, secret)
   492  }
   493  
// WaitForDRReplicationWorking verifies DR replication is actually flowing:
// it waits for the secondary to enter stream-wals mode, generates (and
// immediately revokes) a token on the primary to produce WAL traffic, and
// then waits until the secondary reports a nonzero last_remote_wal.
func WaitForDRReplicationWorking(ctx context.Context, pri, sec VaultCluster) error {
	priClient := pri.Nodes()[0].APIClient()
	secClient := sec.Nodes()[0].APIClient()

	// Make sure we've entered stream-wals mode
	err := WaitForReplicationStatus(ctx, secClient, true, func(secret map[string]interface{}) error {
		state := secret["state"]
		if state == string("stream-wals") {
			return nil
		}
		return fmt.Errorf("expected stream-wals replication state, got %v", state)
	})
	if err != nil {
		return err
	}

	// Now write some data and make sure that we see last_remote_wal nonzero, i.e.
	// at least one WAL has been streamed.
	secret, err := priClient.Auth().Token().Create(&api.TokenCreateRequest{})
	if err != nil {
		return err
	}

	// Revoke the token since some tests won't be happy to see it.
	err = priClient.Auth().Token().RevokeTree(secret.Auth.ClientToken)
	if err != nil {
		return err
	}

	err = WaitForReplicationStatus(ctx, secClient, true, func(secret map[string]interface{}) error {
		state := secret["state"]
		if state != string("stream-wals") {
			return fmt.Errorf("expected stream-wals replication state, got %v", state)
		}

		// last_remote_wal arrives as a json.Number; any positive value
		// means at least one WAL was streamed from the primary.
		if secret["last_remote_wal"] != nil {
			lastRemoteWal, _ := secret["last_remote_wal"].(json.Number).Int64()
			if lastRemoteWal <= 0 {
				return fmt.Errorf("expected last_remote_wal to be greater than zero")
			}
			return nil
		}

		return fmt.Errorf("replication seems to be still catching up, maybe need to wait more")
	})
	if err != nil {
		return err
	}
	return nil
}
   544  
   545  func EnableDrSecondary(ctx context.Context, pri, sec VaultCluster, drToken string) error {
   546  	err := EnableDRSecondaryNoWait(ctx, sec, drToken)
   547  	if err != nil {
   548  		return err
   549  	}
   550  
   551  	if err = WaitForMatchingMerkleRoots(ctx, "sys/replication/dr/", pri, sec); err != nil {
   552  		return err
   553  	}
   554  
   555  	err = WaitForDRSecondary(ctx, pri, sec, false)
   556  	if err != nil {
   557  		return err
   558  	}
   559  
   560  	if err = WaitForDRReplicationWorking(ctx, pri, sec); err != nil {
   561  		return err
   562  	}
   563  	return nil
   564  }
   565  
   566  func SetupTwoClusterDRReplication(ctx context.Context, pri, sec VaultCluster) error {
   567  	if err := EnableDrPrimary(ctx, pri); err != nil {
   568  		return err
   569  	}
   570  
   571  	drToken, err := GenerateDRActivationToken(pri, sec.ClusterID(), "")
   572  	if err != nil {
   573  		return err
   574  	}
   575  	err = EnableDrSecondary(ctx, pri, sec, drToken)
   576  	if err != nil {
   577  		return err
   578  	}
   579  	return nil
   580  }
   581  
   582  func DemoteDRPrimary(client *api.Client) error {
   583  	_, err := client.Logical().Write("sys/replication/dr/primary/demote", map[string]interface{}{})
   584  	return err
   585  }
   586  
   587  func createBatchToken(client *api.Client, path string) (string, error) {
   588  	// TODO: should these be more random in case more than one batch token needs to be created?
   589  	suffix := strings.Replace(path, "/", "", -1)
   590  	policyName := "path-batch-policy-" + suffix
   591  	roleName := "path-batch-role-" + suffix
   592  
   593  	rules := fmt.Sprintf(`path "%s" { capabilities = [ "read", "update" ] }`, path)
   594  
   595  	// create policy
   596  	_, err := client.Logical().Write("sys/policy/"+policyName, map[string]interface{}{
   597  		"policy": rules,
   598  	})
   599  	if err != nil {
   600  		return "", err
   601  	}
   602  
   603  	// create a role
   604  	_, err = client.Logical().Write("auth/token/roles/"+roleName, map[string]interface{}{
   605  		"allowed_policies": policyName,
   606  		"orphan":           true,
   607  		"renewable":        false,
   608  		"token_type":       "batch",
   609  	})
   610  	if err != nil {
   611  		return "", err
   612  	}
   613  
   614  	// create batch token
   615  	secret, err := client.Logical().Write("auth/token/create/"+roleName, nil)
   616  	if err != nil {
   617  		return "", err
   618  	}
   619  
   620  	return secret.Auth.ClientToken, nil
   621  }
   622  
   623  // PromoteDRSecondaryWithBatchToken creates a batch token for DR promotion
   624  // before promotion, it demotes the primary cluster. The primary cluster needs
   625  // to be functional for the generation of the batch token
   626  func PromoteDRSecondaryWithBatchToken(ctx context.Context, pri, sec VaultCluster) error {
   627  	client := pri.Nodes()[0].APIClient()
   628  	drToken, err := createBatchToken(client, "sys/replication/dr/secondary/promote")
   629  	if err != nil {
   630  		return err
   631  	}
   632  
   633  	err = DemoteDRPrimary(client)
   634  	if err != nil {
   635  		return err
   636  	}
   637  
   638  	return promoteDRSecondaryInternal(ctx, sec, drToken)
   639  }
   640  
   641  // PromoteDRSecondary generates a DR operation token on the secondary using
   642  // unseal/recovery keys. Therefore, the primary cluster could potentially
   643  // be out of service.
   644  func PromoteDRSecondary(ctx context.Context, sec VaultCluster) error {
   645  	// generate DR operation token to do update primary on vC to point to
   646  	// the new perfSec primary vD
   647  	drToken, err := GenerateRoot(sec, GenerateRootDR)
   648  	if err != nil {
   649  		return err
   650  	}
   651  	return promoteDRSecondaryInternal(ctx, sec, drToken)
   652  }
   653  
   654  func promoteDRSecondaryInternal(ctx context.Context, sec VaultCluster, drToken string) error {
   655  	secClient := sec.Nodes()[0].APIClient()
   656  
   657  	// Allow retries of 503s, e.g.: replication is still catching up,
   658  	// try again later or provide the "force" argument
   659  	oldMaxRetries := secClient.MaxRetries()
   660  	secClient.SetMaxRetries(10)
   661  	defer secClient.SetMaxRetries(oldMaxRetries)
   662  	resp, err := secClient.Logical().Write("sys/replication/dr/secondary/promote", map[string]interface{}{
   663  		"dr_operation_token": drToken,
   664  	})
   665  	if err != nil {
   666  		return err
   667  	}
   668  	if resp == nil {
   669  		return fmt.Errorf("nil status response during DR promotion")
   670  	}
   671  
   672  	if _, err := WaitForActiveNode(ctx, sec); err != nil {
   673  		return err
   674  	}
   675  
   676  	return WaitForDRReplicationState(ctx, sec, consts.ReplicationDRPrimary)
   677  }
   678  
   679  func checkClusterAddr(ctx context.Context, pri, sec VaultCluster) error {
   680  	priClient := pri.Nodes()[0].APIClient()
   681  	priLeader, err := priClient.Sys().LeaderWithContext(ctx)
   682  	if err != nil {
   683  		return err
   684  	}
   685  	secClient := sec.Nodes()[0].APIClient()
   686  	endpoint := "sys/replication/dr/"
   687  	status, err := secClient.Logical().Read(endpoint + "status")
   688  	if err != nil {
   689  		return err
   690  	}
   691  	if status == nil || status.Data == nil {
   692  		return fmt.Errorf("got nil secret or data")
   693  	}
   694  
   695  	var priAddrs []string
   696  	err = mapstructure.Decode(status.Data["known_primary_cluster_addrs"], &priAddrs)
   697  	if err != nil {
   698  		return err
   699  	}
   700  	if !strutil.StrListContains(priAddrs, priLeader.LeaderClusterAddress) {
   701  		return fmt.Errorf("failed to fine the expected primary cluster address %v in known_primary_cluster_addrs", priLeader.LeaderClusterAddress)
   702  	}
   703  
   704  	return nil
   705  }
   706  
// UpdatePrimary re-points the DR secondary sec at pri via the
// update-primary endpoint: it generates a DR operation token on sec, mints
// a fresh activation token on pri, performs the update, and then waits for
// sec to settle back into DR-secondary state and to learn pri's cluster
// address.
func UpdatePrimary(ctx context.Context, pri, sec VaultCluster) error {
	// generate DR operation token to do update primary on vC to point to
	// the new perfSec primary vD
	rootToken, err := GenerateRoot(sec, GenerateRootDR)
	if err != nil {
		return err
	}

	// secondary activation token
	drToken, err := GenerateDRActivationToken(pri, sec.ClusterID(), "")
	if err != nil {
		return err
	}

	// update-primary on vC (new perfSec Dr secondary) to point to
	// the new perfSec Dr primary
	secClient := sec.Nodes()[0].APIClient()
	resp, err := secClient.Logical().Write("sys/replication/dr/secondary/update-primary", map[string]interface{}{
		"dr_operation_token": rootToken,
		"token":              drToken,
		"ca_file":            DefaultCAFile,
	})
	if err != nil {
		return err
	}
	if resp == nil {
		return fmt.Errorf("nil status response during update primary")
	}

	if _, err = WaitForActiveNode(ctx, sec); err != nil {
		return err
	}

	if err = WaitForDRReplicationState(ctx, sec, consts.ReplicationDRSecondary); err != nil {
		return err
	}

	// Confirm the secondary now knows the primary's leader cluster address.
	if err = checkClusterAddr(ctx, pri, sec); err != nil {
		return err
	}

	return nil
}
   750  
   751  func SetupFourClusterReplication(ctx context.Context, pri, sec, pridr, secdr VaultCluster) error {
   752  	err := SetupTwoClusterPerfReplication(ctx, pri, sec)
   753  	if err != nil {
   754  		return err
   755  	}
   756  	err = SetupTwoClusterDRReplication(ctx, pri, pridr)
   757  	if err != nil {
   758  		return err
   759  	}
   760  	err = SetupTwoClusterDRReplication(ctx, sec, secdr)
   761  	if err != nil {
   762  		return err
   763  	}
   764  	return nil
   765  }
   766  
// ReplicationSet tracks a group of named test clusters participating in
// replication, along with the builder used to create them on demand.
type ReplicationSet struct {
	// By convention, we recommend the following naming scheme for
	// clusters in this map:
	// A: perf primary
	// B: primary's DR
	// C: first perf secondary of A
	// D: C's DR
	// E: second perf secondary of A
	// F: E's DR
	// ... etc.
	//
	// We use generic names rather than role-specific names because
	// that's less confusing when promotions take place that result in role
	// changes. In other words, if D gets promoted to replace C as a perf
	// secondary, and C gets demoted and updated to become D's DR secondary,
	// they should maintain their initial names of D and C throughout.
	Clusters map[string]VaultCluster
	// Builder is invoked to construct clusters that aren't yet present in
	// Clusters.
	Builder  ClusterBuilder
	Logger   hclog.Logger
	CA       *CA
}
   788  
// ClusterBuilder constructs and returns a new test cluster identified by
// name, logging through the provided logger.
type ClusterBuilder func(ctx context.Context, name string, logger hclog.Logger) (VaultCluster, error)
   790  
   791  func NewReplicationSet(b ClusterBuilder) (*ReplicationSet, error) {
   792  	return &ReplicationSet{
   793  		Clusters: map[string]VaultCluster{},
   794  		Builder:  b,
   795  		Logger:   hclog.NewNullLogger(),
   796  	}, nil
   797  }
   798  
   799  func (r *ReplicationSet) StandardPerfReplication(ctx context.Context) error {
   800  	for _, name := range []string{"A", "C"} {
   801  		if _, ok := r.Clusters[name]; !ok {
   802  			cluster, err := r.Builder(ctx, name, r.Logger)
   803  			if err != nil {
   804  				return err
   805  			}
   806  			r.Clusters[name] = cluster
   807  		}
   808  	}
   809  
   810  	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
   811  	defer cancel()
   812  	err := SetupTwoClusterPerfReplication(ctx, r.Clusters["A"], r.Clusters["C"])
   813  	if err != nil {
   814  		return err
   815  	}
   816  
   817  	return nil
   818  }
   819  
   820  func (r *ReplicationSet) StandardDRReplication(ctx context.Context) error {
   821  	for _, name := range []string{"A", "B"} {
   822  		if _, ok := r.Clusters[name]; !ok {
   823  			cluster, err := r.Builder(ctx, name, r.Logger)
   824  			if err != nil {
   825  				return err
   826  			}
   827  			r.Clusters[name] = cluster
   828  		}
   829  	}
   830  
   831  	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
   832  	defer cancel()
   833  	err := SetupTwoClusterDRReplication(ctx, r.Clusters["A"], r.Clusters["B"])
   834  	if err != nil {
   835  		return err
   836  	}
   837  
   838  	return nil
   839  }
   840  
   841  func (r *ReplicationSet) GetFourReplicationCluster(ctx context.Context) error {
   842  	for _, name := range []string{"A", "B", "C", "D"} {
   843  		if _, ok := r.Clusters[name]; !ok {
   844  			cluster, err := r.Builder(ctx, name, r.Logger)
   845  			if err != nil {
   846  				return err
   847  			}
   848  			r.Clusters[name] = cluster
   849  		}
   850  	}
   851  
   852  	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
   853  	defer cancel()
   854  	err := SetupFourClusterReplication(ctx, r.Clusters["A"], r.Clusters["C"], r.Clusters["B"], r.Clusters["D"])
   855  	if err != nil {
   856  		return err
   857  	}
   858  	return nil
   859  }
   860  
   861  func (r *ReplicationSet) Cleanup() {
   862  	for _, cluster := range r.Clusters {
   863  		cluster.Cleanup()
   864  	}
   865  }
   866  
   867  func WaitForPerfReplicationConnectionStatus(ctx context.Context, client *api.Client) error {
   868  	type Primary struct {
   869  		APIAddress       string `mapstructure:"api_address"`
   870  		ConnectionStatus string `mapstructure:"connection_status"`
   871  		ClusterAddress   string `mapstructure:"cluster_address"`
   872  		LastHeartbeat    string `mapstructure:"last_heartbeat"`
   873  	}
   874  	type Status struct {
   875  		Primaries []Primary `mapstructure:"primaries"`
   876  	}
   877  	return WaitForPerfReplicationStatus(ctx, client, func(m map[string]interface{}) error {
   878  		var status Status
   879  		err := mapstructure.Decode(m, &status)
   880  		if err != nil {
   881  			return err
   882  		}
   883  		if len(status.Primaries) == 0 {
   884  			return fmt.Errorf("primaries is zero")
   885  		}
   886  		for _, v := range status.Primaries {
   887  			if v.ConnectionStatus == "connected" {
   888  				return nil
   889  			}
   890  		}
   891  		return fmt.Errorf("no primaries connected")
   892  	})
   893  }
   894  
   895  func WaitForPerfReplicationStatus(ctx context.Context, client *api.Client, accept func(map[string]interface{}) error) error {
   896  	var err error
   897  	var secret *api.Secret
   898  	for ctx.Err() == nil {
   899  		secret, err = client.Logical().Read("sys/replication/performance/status")
   900  		if err == nil && secret != nil && secret.Data != nil {
   901  			if err = accept(secret.Data); err == nil {
   902  				return nil
   903  			}
   904  		}
   905  		time.Sleep(500 * time.Millisecond)
   906  	}
   907  	return fmt.Errorf("unable to get acceptable replication status within allotted time: error=%v secret=%#v", err, secret)
   908  }