github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/alertmanager/state_replication_test.go

package alertmanager

import (
	"context"
	"errors"
	"sort"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/go-kit/log"
	"github.com/grafana/dskit/services"
	"github.com/prometheus/alertmanager/cluster/clusterpb"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/cortexproject/cortex/pkg/alertmanager/alertspb"
	"github.com/cortexproject/cortex/pkg/alertmanager/alertstore"
)

// fakeState is a minimal in-memory state: MarshalBinary returns a fixed
// payload, and Merge records every payload it receives, in order.
type fakeState struct {
	binary []byte
	merges [][]byte
}

func (s *fakeState) MarshalBinary() ([]byte, error) {
	return s.binary, nil
}

func (s *fakeState) Merge(data []byte) error {
	s.merges = append(s.merges, data)
	return nil
}

// readStateResult controls what fakeReplicator.ReadFullStateForUser returns:
// a canned result, an error, or a call that blocks until the context is done.
type readStateResult struct {
	res      []*clusterpb.FullState
	err      error
	blocking bool
}

// fakeReplicator records the last part replicated per user and serves canned
// full-state reads as configured via readStateResult.
type fakeReplicator struct {
	mtx     sync.Mutex
	results map[string]*clusterpb.Part
	read    readStateResult
}

func newFakeReplicator() *fakeReplicator {
	return &fakeReplicator{
		results: make(map[string]*clusterpb.Part),
	}
}

func (f *fakeReplicator) ReplicateStateForUser(ctx context.Context, userID string, p *clusterpb.Part) error {
	f.mtx.Lock()
	f.results[userID] = p
	f.mtx.Unlock()
	return nil
}

func (f *fakeReplicator) GetPositionForUser(_ string) int {
	return 0
}

func (f *fakeReplicator) ReadFullStateForUser(ctx context.Context, userID string) ([]*clusterpb.FullState, error) {
	if userID != "user-1" {
		return nil, errors.New("unexpected userID")
	}

	if f.read.blocking {
		<-ctx.Done()
		return nil, ctx.Err()
	}
	return f.read.res, f.read.err
}

// fakeAlertStore serves full states from an in-memory map; lookups for
// unknown users return alertspb.ErrNotFound.
type fakeAlertStore struct {
	alertstore.AlertStore

	states map[string]alertspb.FullStateDesc
}

func newFakeAlertStore() *fakeAlertStore {
	return &fakeAlertStore{
		states: make(map[string]alertspb.FullStateDesc),
	}
}

func (f *fakeAlertStore) GetFullState(ctx context.Context, user string) (alertspb.FullStateDesc, error) {
	if result, ok := f.states[user]; ok {
		return result, nil
	}
	return alertspb.FullStateDesc{}, alertspb.ErrNotFound
}
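
// Compile-time interface checks, added here as a hedged sketch (they are not
// in the original file): assuming Replicator is the package-local interface
// that newReplicatedStates consumes, these pin the fakes to their contracts
// so signature drift fails at build time instead of deep inside a test.
var (
	_ Replicator            = (*fakeReplicator)(nil)
	_ alertstore.AlertStore = (*fakeAlertStore)(nil)
)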

func TestStateReplication(t *testing.T) {
	tc := []struct {
		name              string
		replicationFactor int
		message           *clusterpb.Part
		results           map[string]*clusterpb.Part
	}{
		{
			name:              "with a replication factor of <= 1, state is not replicated.",
			replicationFactor: 1,
			message:           &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
			results:           map[string]*clusterpb.Part{},
		},
		{
			name:              "with a replication factor of > 1, state is broadcast for replication.",
			replicationFactor: 3,
			message:           &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
			results:           map[string]*clusterpb.Part{"user-1": {Key: "nflog", Data: []byte("OK")}},
		},
	}

	for _, tt := range tc {
		t.Run(tt.name, func(t *testing.T) {
			reg := prometheus.NewPedanticRegistry()
			replicator := newFakeReplicator()
			replicator.read = readStateResult{res: nil, err: nil}
			store := newFakeAlertStore()
			s := newReplicatedStates("user-1", tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)

			require.False(t, s.Ready())
			{
				ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
				defer cancel()
				require.Equal(t, context.DeadlineExceeded, s.WaitReady(ctx))
			}

			require.NoError(t, services.StartAndAwaitRunning(context.Background(), s))
			t.Cleanup(func() {
				require.NoError(t, services.StopAndAwaitTerminated(context.Background(), s))
			})

			require.True(t, s.Ready())
			{
				ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
				defer cancel()
				require.NoError(t, s.WaitReady(ctx))
			}

			ch := s.AddState("nflog", &fakeState{}, reg)

			part := tt.message
			d, err := part.Marshal()
			require.NoError(t, err)
			ch.Broadcast(d)

			require.Eventually(t, func() bool {
				replicator.mtx.Lock()
				defer replicator.mtx.Unlock()
				return len(replicator.results) == len(tt.results)
			}, time.Second, time.Millisecond)

			if tt.replicationFactor > 1 {
				assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
# HELP alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
# TYPE alertmanager_state_fetch_replica_state_failed_total counter
alertmanager_state_fetch_replica_state_failed_total 0
# HELP alertmanager_state_fetch_replica_state_total Number of times we have tried to read and merge the full state from another replica.
# TYPE alertmanager_state_fetch_replica_state_total counter
alertmanager_state_fetch_replica_state_total 1
# HELP alertmanager_partial_state_merges_failed_total Number of times we have failed to merge a partial state received for a key.
# TYPE alertmanager_partial_state_merges_failed_total counter
alertmanager_partial_state_merges_failed_total{key="nflog"} 0
# HELP alertmanager_partial_state_merges_total Number of times we have received a partial state to merge for a key.
# TYPE alertmanager_partial_state_merges_total counter
alertmanager_partial_state_merges_total{key="nflog"} 0
# HELP alertmanager_state_initial_sync_completed_total Number of times we have completed syncing initial state for each possible outcome.
# TYPE alertmanager_state_initial_sync_completed_total counter
alertmanager_state_initial_sync_completed_total{outcome="failed"} 0
alertmanager_state_initial_sync_completed_total{outcome="from-replica"} 1
alertmanager_state_initial_sync_completed_total{outcome="from-storage"} 0
alertmanager_state_initial_sync_completed_total{outcome="user-not-found"} 0
# HELP alertmanager_state_initial_sync_total Number of times we have tried to sync initial state from peers or remote storage.
# TYPE alertmanager_state_initial_sync_total counter
alertmanager_state_initial_sync_total 1
# HELP alertmanager_state_replication_failed_total Number of times we have failed to replicate a state to other alertmanagers.
# TYPE alertmanager_state_replication_failed_total counter
alertmanager_state_replication_failed_total{key="nflog"} 0
# HELP alertmanager_state_replication_total Number of times we have tried to replicate a state to other alertmanagers.
# TYPE alertmanager_state_replication_total counter
alertmanager_state_replication_total{key="nflog"} 1
`),
					"alertmanager_state_fetch_replica_state_failed_total",
					"alertmanager_state_fetch_replica_state_total",
					"alertmanager_partial_state_merges_failed_total",
					"alertmanager_partial_state_merges_total",
					"alertmanager_state_initial_sync_completed_total",
					"alertmanager_state_initial_sync_total",
					"alertmanager_state_replication_failed_total",
					"alertmanager_state_replication_total",
				))
			}
		})
	}
}
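
// TestPartMarshalRoundTrip is an added sketch (not part of the original
// suite): TestStateReplication above broadcasts a marshalled clusterpb.Part,
// so this documents the wire format by checking that a Part survives a
// marshal/unmarshal round trip with its key and data intact.
func TestPartMarshalRoundTrip(t *testing.T) {
	in := &clusterpb.Part{Key: "nflog", Data: []byte("OK")}

	d, err := in.Marshal()
	require.NoError(t, err)

	out := clusterpb.Part{}
	require.NoError(t, out.Unmarshal(d))
	assert.Equal(t, in.Key, out.Key)
	assert.Equal(t, in.Data, out.Data)
}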

func TestStateReplication_Settle(t *testing.T) {
	tc := []struct {
		name              string
		replicationFactor int
		read              readStateResult
		storeStates       map[string]alertspb.FullStateDesc
		results           map[string][][]byte
	}{
		{
			name:              "with a replication factor of <= 1, no state can be read from peers.",
			replicationFactor: 1,
			read:              readStateResult{},
			results: map[string][][]byte{
				"key1": nil,
				"key2": nil,
			},
		},
		{
			name:              "with a replication factor of > 1, state is read from all peers.",
			replicationFactor: 3,
			read: readStateResult{
				res: []*clusterpb.FullState{
					{Parts: []clusterpb.Part{{Key: "key1", Data: []byte("Datum1")}, {Key: "key2", Data: []byte("Datum2")}}},
					{Parts: []clusterpb.Part{{Key: "key1", Data: []byte("Datum3")}, {Key: "key2", Data: []byte("Datum4")}}},
				},
			},
			results: map[string][][]byte{
				"key1": {[]byte("Datum1"), []byte("Datum3")},
				"key2": {[]byte("Datum2"), []byte("Datum4")},
			},
		},
		{
			name:              "with full state having no parts, nothing is merged.",
			replicationFactor: 3,
			read: readStateResult{
				res: []*clusterpb.FullState{{Parts: []clusterpb.Part{}}},
			},
			results: map[string][][]byte{
				"key1": nil,
				"key2": nil,
			},
		},
		{
			name:              "with an unknown key, parts in the same state are merged.",
			replicationFactor: 3,
			read: readStateResult{
				res: []*clusterpb.FullState{{Parts: []clusterpb.Part{
					{Key: "unknown", Data: []byte("Wow")},
					{Key: "key1", Data: []byte("Datum1")},
				}}},
			},
			results: map[string][][]byte{
				"key1": {[]byte("Datum1")},
				"key2": nil,
			},
		},
		{
			name:              "with an unknown key, parts in other states are merged.",
			replicationFactor: 3,
			read: readStateResult{
				res: []*clusterpb.FullState{
					{Parts: []clusterpb.Part{{Key: "unknown", Data: []byte("Wow")}}},
					{Parts: []clusterpb.Part{{Key: "key1", Data: []byte("Datum1")}}},
				},
			},
			results: map[string][][]byte{
				"key1": {[]byte("Datum1")},
				"key2": nil,
			},
		},
		{
			name:              "when reading from replicas fails, state is read from storage.",
			replicationFactor: 3,
			read:              readStateResult{err: errors.New("Read Error 1")},
			storeStates: map[string]alertspb.FullStateDesc{
				"user-1": {
					State: &clusterpb.FullState{
						Parts: []clusterpb.Part{{Key: "key1", Data: []byte("Datum1")}},
					},
				},
			},
			results: map[string][][]byte{
				"key1": {[]byte("Datum1")},
				"key2": nil,
			},
		},
		{
			name:              "when reading from replicas and from storage fails, still become ready.",
			replicationFactor: 3,
			read:              readStateResult{err: errors.New("Read Error 1")},
			storeStates:       map[string]alertspb.FullStateDesc{},
			results: map[string][][]byte{
				"key1": nil,
				"key2": nil,
			},
		},
		{
			name:              "when reading the full state takes too long, hit the timeout but become ready.",
			replicationFactor: 3,
			read:              readStateResult{blocking: true},
			results: map[string][][]byte{
				"key1": nil,
				"key2": nil,
			},
		},
	}

	for _, tt := range tc {
		t.Run(tt.name, func(t *testing.T) {
			reg := prometheus.NewPedanticRegistry()

			replicator := newFakeReplicator()
			replicator.read = tt.read
			store := newFakeAlertStore()
			store.states = tt.storeStates
			s := newReplicatedStates("user-1", tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)

			key1State := &fakeState{}
			key2State := &fakeState{}

			s.AddState("key1", key1State, reg)
			s.AddState("key2", key2State, reg)

			s.settleReadTimeout = 1 * time.Second

			assert.False(t, s.Ready())

			require.NoError(t, services.StartAndAwaitRunning(context.Background(), s))
			t.Cleanup(func() {
				require.NoError(t, services.StopAndAwaitTerminated(context.Background(), s))
			})

			assert.True(t, s.Ready())

			// Note: We don't actually test beyond Merge() here, just that all data is forwarded.
			assert.Equal(t, tt.results["key1"], key1State.merges)
			assert.Equal(t, tt.results["key2"], key2State.merges)
		})
	}
}
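
// TestFakeStateMerge is an added sketch (not part of the original file): the
// settle test above asserts directly on fakeState.merges, so this pins down
// the fake's contract — Merge records every payload it is given, in order.
func TestFakeStateMerge(t *testing.T) {
	s := &fakeState{}

	require.NoError(t, s.Merge([]byte("Datum1")))
	require.NoError(t, s.Merge([]byte("Datum2")))
	assert.Equal(t, [][]byte{[]byte("Datum1"), []byte("Datum2")}, s.merges)
}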

func TestStateReplication_GetFullState(t *testing.T) {
	tc := []struct {
		name   string
		data   map[string][]byte
		result *clusterpb.FullState
	}{
		{
			name: "no keys",
			data: map[string][]byte{},
			result: &clusterpb.FullState{
				Parts: []clusterpb.Part{},
			},
		},
		{
			name: "zero length data",
			data: map[string][]byte{
				"key1": {},
			},
			result: &clusterpb.FullState{
				Parts: []clusterpb.Part{
					{Key: "key1", Data: []byte{}},
				},
			},
		},
		{
			name: "keys with data",
			data: map[string][]byte{
				"key1": []byte("Datum1"),
				"key2": []byte("Datum2"),
			},
			result: &clusterpb.FullState{
				Parts: []clusterpb.Part{
					{Key: "key1", Data: []byte("Datum1")},
					{Key: "key2", Data: []byte("Datum2")},
				},
			},
		},
	}

	for _, tt := range tc {
		t.Run(tt.name, func(t *testing.T) {
			reg := prometheus.NewPedanticRegistry()
			s := newReplicatedStates("user-1", 1, nil, nil, log.NewNopLogger(), reg)

			for key, datum := range tt.data {
				state := &fakeState{binary: datum}
				s.AddState(key, state, reg)
			}

			result, err := s.GetFullState()
			require.NoError(t, err)

			// Key ordering is undefined for the code under test.
			sort.Slice(result.Parts, func(i, j int) bool { return result.Parts[i].Key < result.Parts[j].Key })

			assert.Equal(t, tt.result, result)
		})
	}
}
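
// TestFakeStateMarshalBinary is an added sketch (not part of the original
// file): TestStateReplication_GetFullState above relies on each registered
// state's MarshalBinary output being used verbatim as the part's data, so
// this pins down that side of the fake's contract as well.
func TestFakeStateMarshalBinary(t *testing.T) {
	s := &fakeState{binary: []byte("Datum1")}

	b, err := s.MarshalBinary()
	require.NoError(t, err)
	assert.Equal(t, []byte("Datum1"), b)
}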