github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/alertmanager/state_replication_test.go

package alertmanager

import (
	"context"
	"errors"
	"sort"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/go-kit/log"
	"github.com/grafana/dskit/services"
	"github.com/prometheus/alertmanager/cluster/clusterpb"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/cortexproject/cortex/pkg/alertmanager/alertspb"
	"github.com/cortexproject/cortex/pkg/alertmanager/alertstore"
)

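// fakeState is a minimal state implementation: MarshalBinary returns a canned
// payload and Merge records every payload it receives, so the tests can assert
// exactly which data was merged.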
type fakeState struct {
	binary []byte
	merges [][]byte
}

func (s *fakeState) MarshalBinary() ([]byte, error) {
	return s.binary, nil
}

func (s *fakeState) Merge(data []byte) error {
	s.merges = append(s.merges, data)
	return nil
}

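// readStateResult controls what fakeReplicator returns from ReadFullStateForUser:
// a canned set of full states, an error, or a read that blocks until the context
// is cancelled.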
type readStateResult struct {
	res      []*clusterpb.FullState
	err      error
	blocking bool
}

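// fakeReplicator records the partial state replicated for each user and serves
// full-state reads according to its configured readStateResult.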
type fakeReplicator struct {
	mtx     sync.Mutex
	results map[string]*clusterpb.Part
	read    readStateResult
}

func newFakeReplicator() *fakeReplicator {
	return &fakeReplicator{
		results: make(map[string]*clusterpb.Part),
	}
}

func (f *fakeReplicator) ReplicateStateForUser(ctx context.Context, userID string, p *clusterpb.Part) error {
	f.mtx.Lock()
	f.results[userID] = p
	f.mtx.Unlock()
	return nil
}

func (f *fakeReplicator) GetPositionForUser(_ string) int {
	return 0
}

func (f *fakeReplicator) ReadFullStateForUser(ctx context.Context, userID string) ([]*clusterpb.FullState, error) {
	if userID != "user-1" {
		return nil, errors.New("Unexpected userID")
	}

	if f.read.blocking {
		<-ctx.Done()
		return nil, ctx.Err()
	}
	return f.read.res, f.read.err
}
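
// fakeReplicator stands in for the peer-replication dependency passed to
// newReplicatedStates. Assuming the package defines a Replicator interface with
// the method set implemented above, a compile-time check could look like:
//
//	var _ Replicator = (*fakeReplicator)(nil)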

// fakeAlertStore is an in-memory AlertStore stub: GetFullState serves the
// configured states map and returns alertspb.ErrNotFound for unknown users.
type fakeAlertStore struct {
	alertstore.AlertStore

	states map[string]alertspb.FullStateDesc
}

func newFakeAlertStore() *fakeAlertStore {
	return &fakeAlertStore{
		states: make(map[string]alertspb.FullStateDesc),
	}
}

func (f *fakeAlertStore) GetFullState(ctx context.Context, user string) (alertspb.FullStateDesc, error) {
	if result, ok := f.states[user]; ok {
		return result, nil
	}
	return alertspb.FullStateDesc{}, alertspb.ErrNotFound
}

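// TestStateReplication verifies that a state update broadcast via the channel
// returned by AddState is replicated to peers only when the replication factor
// is greater than 1, and that the replication and initial-sync metrics are
// reported accordingly.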
func TestStateReplication(t *testing.T) {
	tc := []struct {
		name              string
		replicationFactor int
		message           *clusterpb.Part
		results           map[string]*clusterpb.Part
	}{
		{
			name:              "with a replication factor of <= 1, state is not replicated.",
			replicationFactor: 1,
			message:           &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
			results:           map[string]*clusterpb.Part{},
		},
		{
			name:              "with a replication factor of > 1, state is broadcasted for replication.",
			replicationFactor: 3,
			message:           &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
			results:           map[string]*clusterpb.Part{"user-1": {Key: "nflog", Data: []byte("OK")}},
		},
	}

	for _, tt := range tc {
		t.Run(tt.name, func(t *testing.T) {
			reg := prometheus.NewPedanticRegistry()
			replicator := newFakeReplicator()
			replicator.read = readStateResult{res: nil, err: nil}
			store := newFakeAlertStore()
			s := newReplicatedStates("user-1", tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)

			require.False(t, s.Ready())
			{
				ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
				defer cancel()
				require.Equal(t, context.DeadlineExceeded, s.WaitReady(ctx))
			}

			require.NoError(t, services.StartAndAwaitRunning(context.Background(), s))
			t.Cleanup(func() {
				require.NoError(t, services.StopAndAwaitTerminated(context.Background(), s))
			})

			require.True(t, s.Ready())
			{
				ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
				defer cancel()
				require.NoError(t, s.WaitReady(ctx))
			}

			ch := s.AddState("nflog", &fakeState{}, reg)

			part := tt.message
			d, err := part.Marshal()
			require.NoError(t, err)
			ch.Broadcast(d)

			require.Eventually(t, func() bool {
				replicator.mtx.Lock()
				defer replicator.mtx.Unlock()
				return len(replicator.results) == len(tt.results)
			}, time.Second, time.Millisecond)

			if tt.replicationFactor > 1 {
				assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
# HELP alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
# TYPE alertmanager_state_fetch_replica_state_failed_total counter
alertmanager_state_fetch_replica_state_failed_total 0
# HELP alertmanager_state_fetch_replica_state_total Number of times we have tried to read and merge the full state from another replica.
# TYPE alertmanager_state_fetch_replica_state_total counter
alertmanager_state_fetch_replica_state_total 1
# HELP alertmanager_partial_state_merges_failed_total Number of times we have failed to merge a partial state received for a key.
# TYPE alertmanager_partial_state_merges_failed_total counter
alertmanager_partial_state_merges_failed_total{key="nflog"} 0
# HELP alertmanager_partial_state_merges_total Number of times we have received a partial state to merge for a key.
# TYPE alertmanager_partial_state_merges_total counter
alertmanager_partial_state_merges_total{key="nflog"} 0
# HELP alertmanager_state_initial_sync_completed_total Number of times we have completed syncing initial state for each possible outcome.
# TYPE alertmanager_state_initial_sync_completed_total counter
alertmanager_state_initial_sync_completed_total{outcome="failed"} 0
alertmanager_state_initial_sync_completed_total{outcome="from-replica"} 1
alertmanager_state_initial_sync_completed_total{outcome="from-storage"} 0
alertmanager_state_initial_sync_completed_total{outcome="user-not-found"} 0
# HELP alertmanager_state_initial_sync_total Number of times we have tried to sync initial state from peers or remote storage.
# TYPE alertmanager_state_initial_sync_total counter
alertmanager_state_initial_sync_total 1
# HELP alertmanager_state_replication_failed_total Number of times we have failed to replicate a state to other alertmanagers.
# TYPE alertmanager_state_replication_failed_total counter
alertmanager_state_replication_failed_total{key="nflog"} 0
# HELP alertmanager_state_replication_total Number of times we have tried to replicate a state to other alertmanagers.
# TYPE alertmanager_state_replication_total counter
alertmanager_state_replication_total{key="nflog"} 1
	`),
					"alertmanager_state_fetch_replica_state_failed_total",
					"alertmanager_state_fetch_replica_state_total",
					"alertmanager_partial_state_merges_failed_total",
					"alertmanager_partial_state_merges_total",
					"alertmanager_state_initial_sync_completed_total",
					"alertmanager_state_initial_sync_total",
					"alertmanager_state_replication_failed_total",
					"alertmanager_state_replication_total",
				))

			}
		})
	}
}

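// TestStateReplication_Settle exercises the initial state sync: full states read
// from peers are merged per key, reads fall back to remote storage when peers
// fail, and the service still becomes ready when both sources fail or the peer
// read times out.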
func TestStateReplication_Settle(t *testing.T) {

	tc := []struct {
		name              string
		replicationFactor int
		read              readStateResult
		storeStates       map[string]alertspb.FullStateDesc
		results           map[string][][]byte
	}{
		{
			name:              "with a replication factor of <= 1, no state can be read from peers.",
			replicationFactor: 1,
			read:              readStateResult{},
			results: map[string][][]byte{
				"key1": nil,
				"key2": nil,
			},
		},
		{
			name:              "with a replication factor of > 1, state is read from all peers.",
			replicationFactor: 3,
			read: readStateResult{
				res: []*clusterpb.FullState{
					{Parts: []clusterpb.Part{{Key: "key1", Data: []byte("Datum1")}, {Key: "key2", Data: []byte("Datum2")}}},
					{Parts: []clusterpb.Part{{Key: "key1", Data: []byte("Datum3")}, {Key: "key2", Data: []byte("Datum4")}}},
				},
			},
			results: map[string][][]byte{
				"key1": {[]byte("Datum1"), []byte("Datum3")},
				"key2": {[]byte("Datum2"), []byte("Datum4")},
			},
		},
		{
			name:              "with full state having no parts, nothing is merged.",
			replicationFactor: 3,
			read: readStateResult{
				res: []*clusterpb.FullState{{Parts: []clusterpb.Part{}}},
			},
			results: map[string][][]byte{
				"key1": nil,
				"key2": nil,
			},
		},
		{
			name:              "with an unknown key, parts in the same state are merged.",
			replicationFactor: 3,
			read: readStateResult{
				res: []*clusterpb.FullState{{Parts: []clusterpb.Part{
					{Key: "unknown", Data: []byte("Wow")},
					{Key: "key1", Data: []byte("Datum1")},
				}}},
			},
			results: map[string][][]byte{
				"key1": {[]byte("Datum1")},
				"key2": nil,
			},
		},
		{
			name:              "with an unknown key, parts in other states are merged.",
			replicationFactor: 3,
			read: readStateResult{
				res: []*clusterpb.FullState{
					{Parts: []clusterpb.Part{{Key: "unknown", Data: []byte("Wow")}}},
					{Parts: []clusterpb.Part{{Key: "key1", Data: []byte("Datum1")}}},
				},
			},
			results: map[string][][]byte{
				"key1": {[]byte("Datum1")},
				"key2": nil,
			},
		},
		{
			name:              "when reading from replicas fails, state is read from storage.",
			replicationFactor: 3,
			read:              readStateResult{err: errors.New("Read Error 1")},
			storeStates: map[string]alertspb.FullStateDesc{
				"user-1": {
					State: &clusterpb.FullState{
						Parts: []clusterpb.Part{{Key: "key1", Data: []byte("Datum1")}},
					},
				},
			},
			results: map[string][][]byte{
				"key1": {[]byte("Datum1")},
				"key2": nil,
			},
		},
		{
			name:              "when reading from replicas and from storage fails, still become ready.",
			replicationFactor: 3,
			read:              readStateResult{err: errors.New("Read Error 1")},
			storeStates:       map[string]alertspb.FullStateDesc{},
			results: map[string][][]byte{
				"key1": nil,
				"key2": nil,
			},
		},
		{
			name:              "when reading the full state takes too long, hit timeout but become ready.",
			replicationFactor: 3,
			read:              readStateResult{blocking: true},
			results: map[string][][]byte{
				"key1": nil,
				"key2": nil,
			},
		},
	}

	for _, tt := range tc {
		t.Run(tt.name, func(t *testing.T) {
			reg := prometheus.NewPedanticRegistry()

			replicator := newFakeReplicator()
			replicator.read = tt.read
			store := newFakeAlertStore()
			store.states = tt.storeStates
			s := newReplicatedStates("user-1", tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)

			key1State := &fakeState{}
			key2State := &fakeState{}

			s.AddState("key1", key1State, reg)
			s.AddState("key2", key2State, reg)

			s.settleReadTimeout = 1 * time.Second

			assert.False(t, s.Ready())

			require.NoError(t, services.StartAndAwaitRunning(context.Background(), s))
			t.Cleanup(func() {
				require.NoError(t, services.StopAndAwaitTerminated(context.Background(), s))
			})

			assert.True(t, s.Ready())

			// Note: We don't actually test beyond Merge() here, just that all data is forwarded.
			assert.Equal(t, tt.results["key1"], key1State.merges)
			assert.Equal(t, tt.results["key2"], key2State.merges)
		})
	}
}

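// TestStateReplication_GetFullState verifies that GetFullState snapshots every
// registered key into a clusterpb.FullState, including keys with zero-length data.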
func TestStateReplication_GetFullState(t *testing.T) {

	tc := []struct {
		name   string
		data   map[string][]byte
		result *clusterpb.FullState
	}{
		{
			name: "no keys",
			data: map[string][]byte{},
			result: &clusterpb.FullState{
				Parts: []clusterpb.Part{},
			},
		},
		{
			name: "zero length data",
			data: map[string][]byte{
				"key1": {},
			},
			result: &clusterpb.FullState{
				Parts: []clusterpb.Part{
					{Key: "key1", Data: []byte{}},
				},
			},
		},
		{
			name: "keys with data",
			data: map[string][]byte{
				"key1": []byte("Datum1"),
				"key2": []byte("Datum2"),
			},
			result: &clusterpb.FullState{
				Parts: []clusterpb.Part{
					{Key: "key1", Data: []byte("Datum1")},
					{Key: "key2", Data: []byte("Datum2")},
				},
			},
		},
	}

	for _, tt := range tc {
		t.Run(tt.name, func(t *testing.T) {
			reg := prometheus.NewPedanticRegistry()
			s := newReplicatedStates("user-1", 1, nil, nil, log.NewNopLogger(), reg)

			for key, datum := range tt.data {
				state := &fakeState{binary: datum}
				s.AddState(key, state, reg)
			}

			result, err := s.GetFullState()
			require.NoError(t, err)

			// Key ordering is undefined for the code under test.
			sort.Slice(result.Parts, func(i, j int) bool { return result.Parts[i].Key < result.Parts[j].Key })

			assert.Equal(t, tt.result, result)
		})
	}
}