github.com/m3db/m3@v1.5.0/src/dbnode/storage/repair_test.go

// Copyright (c) 2019 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package storage

import (
	"errors"
	"sync"
	"testing"
	"time"

	"github.com/m3db/m3/src/dbnode/client"
	"github.com/m3db/m3/src/dbnode/namespace"
	"github.com/m3db/m3/src/dbnode/retention"
	"github.com/m3db/m3/src/dbnode/storage/block"
	"github.com/m3db/m3/src/dbnode/storage/bootstrap/result"
	"github.com/m3db/m3/src/dbnode/storage/repair"
	"github.com/m3db/m3/src/dbnode/topology"
	"github.com/m3db/m3/src/x/context"
	"github.com/m3db/m3/src/x/ident"
	xtest "github.com/m3db/m3/src/x/test"
	xtime "github.com/m3db/m3/src/x/time"

	"github.com/golang/mock/gomock"
	"github.com/stretchr/testify/require"
	"github.com/uber-go/tally"
)

func TestDatabaseRepairerStartStop(t *testing.T) {
	ctrl := xtest.NewController(t)
	defer ctrl.Finish()

	opts := DefaultTestOptions().SetRepairOptions(testRepairOptions(ctrl))
	db := NewMockdatabase(ctrl)
	db.EXPECT().Options().Return(opts).AnyTimes()

	databaseRepairer, err := newDatabaseRepairer(db, opts)
	require.NoError(t, err)
	repairer := databaseRepairer.(*dbRepairer)

	var (
		repaired bool
		lock     sync.RWMutex
	)

	repairer.repairFn = func() error {
		lock.Lock()
		repaired = true
		lock.Unlock()
		return nil
	}

	repairer.Start()

	for {
		// Wait for repair to be called
		lock.RLock()
		done := repaired
		lock.RUnlock()
		if done {
			break
		}
		time.Sleep(10 * time.Millisecond)
	}

	repairer.Stop()
	for {
		// Wait for the repairer to stop
		repairer.closedLock.Lock()
		closed := repairer.closed
		repairer.closedLock.Unlock()
		if closed {
			break
		}
		time.Sleep(10 * time.Millisecond)
	}
}

func TestDatabaseRepairerRepairNotBootstrapped(t *testing.T) {
	ctrl := xtest.NewController(t)
	defer ctrl.Finish()

	opts := DefaultTestOptions().SetRepairOptions(testRepairOptions(ctrl))
	mockDatabase := NewMockdatabase(ctrl)

	databaseRepairer, err := newDatabaseRepairer(mockDatabase, opts)
	require.NoError(t, err)
	repairer := databaseRepairer.(*dbRepairer)

	mockDatabase.EXPECT().IsBootstrapped().Return(false)
	require.Nil(t, repairer.Repair())
}

func TestDatabaseShardRepairerRepair(t *testing.T) {
	testDatabaseShardRepairerRepair(t, false)
}

func TestDatabaseShardRepairerRepairWithLimit(t *testing.T) {
	testDatabaseShardRepairerRepair(t, true)
}

func testDatabaseShardRepairerRepair(t *testing.T, withLimit bool) {
	ctrl := xtest.NewController(t)
	defer ctrl.Finish()

	session := client.NewMockAdminSession(ctrl)
	session.EXPECT().Origin().Return(topology.NewHost("0", "addr0")).AnyTimes()
	session.EXPECT().TopologyMap().AnyTimes()

	mockClient := client.NewMockAdminClient(ctrl)
	mockClient.EXPECT().DefaultAdminSession().Return(session, nil).AnyTimes()

	var (
		rpOpts = testRepairOptions(ctrl).
			SetAdminClients([]client.AdminClient{mockClient})
		now            = xtime.Now()
		nowFn          = func() time.Time { return now.ToTime() }
		opts           = DefaultTestOptions()
		copts          = opts.ClockOptions()
		iopts          = opts.InstrumentOptions()
		rtopts         = defaultTestRetentionOpts
		memTrackerOpts = NewMemoryTrackerOptions(1)
		memTracker     = NewMemoryTracker(memTrackerOpts)
	)
	if withLimit {
		opts = opts.SetMemoryTracker(memTracker)
	}

	opts = opts.
		SetClockOptions(copts.SetNowFn(nowFn)).
		SetInstrumentOptions(iopts.SetMetricsScope(tally.NoopScope))

	var (
		namespaceID     = ident.StringID("testNamespace")
		start           = now
		end             = now.Add(rtopts.BlockSize())
		repairTimeRange = xtime.Range{Start: start, End: end}
		fetchOpts       = block.FetchBlocksMetadataOptions{
			IncludeSizes:     true,
			IncludeChecksums: true,
			IncludeLastRead:  false,
		}

		sizes     = []int64{1, 2, 3, 4}
		checksums = []uint32{4, 5, 6, 7}
		lastRead  = now.Add(-time.Minute)
		shardID   = uint32(0)
		shard     = NewMockdatabaseShard(ctrl)

		numIters = 1
	)

	if withLimit {
		numIters = 2
		shard.EXPECT().LoadBlocks(gomock.Any()).Return(nil)
		shard.EXPECT().LoadBlocks(gomock.Any()).DoAndReturn(func(*result.Map) error {
			// Return an error indicating that we've hit the load limit, but also start a
			// delayed goroutine to release the throttled repair process.
			go func() {
				time.Sleep(10 * time.Millisecond)
				memTracker.DecPendingLoadedBytes()
			}()
			return ErrDatabaseLoadLimitHit
		})
		shard.EXPECT().LoadBlocks(gomock.Any()).Return(nil)
	} else {
		shard.EXPECT().LoadBlocks(gomock.Any()).Return(nil)
	}

	for i := 0; i < numIters; i++ {
		expectedResults := block.NewFetchBlocksMetadataResults()
		results := block.NewFetchBlockMetadataResults()
		results.Add(block.NewFetchBlockMetadataResult(now.Add(30*time.Minute),
			sizes[0], &checksums[0], lastRead, nil))
		results.Add(block.NewFetchBlockMetadataResult(now.Add(time.Hour),
			sizes[1], &checksums[1], lastRead, nil))
		expectedResults.Add(block.NewFetchBlocksMetadataResult(ident.StringID("foo"), nil, results))
		results = block.NewFetchBlockMetadataResults()
		results.Add(block.NewFetchBlockMetadataResult(now.Add(30*time.Minute),
			sizes[2], &checksums[2], lastRead, nil))
		expectedResults.Add(block.NewFetchBlocksMetadataResult(ident.StringID("bar"), nil, results))

		var (
			any             = gomock.Any()
			nonNilPageToken = PageToken("non-nil-page-token")
		)
		// Ensure that the Repair logic will call FetchBlocksMetadataV2 in a loop until
		// it receives a nil page token.
		shard.EXPECT().
			FetchBlocksMetadataV2(any, start, end, any, nil, fetchOpts).
			Return(nil, nonNilPageToken, nil)
		shard.EXPECT().
			FetchBlocksMetadataV2(any, start, end, any, nonNilPageToken, fetchOpts).
			Return(expectedResults, nil, nil)
		shard.EXPECT().ID().Return(shardID).AnyTimes()

		peerIter := client.NewMockPeerBlockMetadataIter(ctrl)
		inBlocks := []block.ReplicaMetadata{
			{
				Host:     topology.NewHost("1", "addr1"),
				Metadata: block.NewMetadata(ident.StringID("foo"), ident.Tags{}, now.Add(30*time.Minute), sizes[0], &checksums[0], lastRead),
			},
			{
				Host:     topology.NewHost("1", "addr1"),
				Metadata: block.NewMetadata(ident.StringID("foo"), ident.Tags{}, now.Add(time.Hour), sizes[0], &checksums[1], lastRead),
			},
			{
				Host: topology.NewHost("1", "addr1"),
				// Mismatched checksum, so this should trigger a repair of this series.
				Metadata: block.NewMetadata(ident.StringID("bar"), ident.Tags{}, now.Add(30*time.Minute), sizes[2], &checksums[3], lastRead),
			},
		}

		gomock.InOrder(
			peerIter.EXPECT().Next().Return(true),
			peerIter.EXPECT().Current().Return(inBlocks[0].Host, inBlocks[0].Metadata),
			peerIter.EXPECT().Next().Return(true),
			peerIter.EXPECT().Current().Return(inBlocks[1].Host, inBlocks[1].Metadata),
			peerIter.EXPECT().Next().Return(true),
			peerIter.EXPECT().Current().Return(inBlocks[2].Host, inBlocks[2].Metadata),
			peerIter.EXPECT().Next().Return(false),
			peerIter.EXPECT().Err().Return(nil),
		)
		session.EXPECT().
			FetchBlocksMetadataFromPeers(namespaceID, shardID, start, end,
				rpOpts.RepairConsistencyLevel(), gomock.Any()).
			Return(peerIter, nil)

		peerBlocksIter := client.NewMockPeerBlocksIter(ctrl)
		dbBlock1 := block.NewMockDatabaseBlock(ctrl)
		dbBlock1.EXPECT().StartTime().Return(inBlocks[2].Metadata.Start).AnyTimes()
		dbBlock2 := block.NewMockDatabaseBlock(ctrl)
		dbBlock2.EXPECT().StartTime().Return(inBlocks[2].Metadata.Start).AnyTimes()
		// Ensure merging logic works.
		dbBlock1.EXPECT().Merge(dbBlock2)
		gomock.InOrder(
			peerBlocksIter.EXPECT().Next().Return(true),
			peerBlocksIter.EXPECT().Current().
				Return(inBlocks[2].Host, inBlocks[2].Metadata.ID, inBlocks[2].Metadata.Tags, dbBlock1),
			peerBlocksIter.EXPECT().Next().Return(true),
			peerBlocksIter.EXPECT().Current().
				Return(inBlocks[2].Host, inBlocks[2].Metadata.ID, inBlocks[2].Metadata.Tags, dbBlock2),
			peerBlocksIter.EXPECT().Next().Return(false),
		)
		nsMeta, err := namespace.NewMetadata(namespaceID, namespace.NewOptions())
		require.NoError(t, err)
		session.EXPECT().
			FetchBlocksFromPeers(nsMeta, shardID, rpOpts.RepairConsistencyLevel(), inBlocks[2:], gomock.Any()).
			Return(peerBlocksIter, nil)

		var (
			resNamespace ident.ID
			resShard     databaseShard
			resDiff      repair.MetadataComparisonResult
		)

		databaseShardRepairer := newShardRepairer(opts, rpOpts)
		repairer := databaseShardRepairer.(shardRepairer)
		repairer.record = func(origin topology.Host, nsID ident.ID, shard databaseShard,
			diffRes repair.MetadataComparisonResult) {
			resNamespace = nsID
			resShard = shard
			resDiff = diffRes
		}

		var (
			ctx   = context.NewBackground()
			nsCtx = namespace.Context{ID: namespaceID}
		)
		require.NoError(t, err)
		repairer.Repair(ctx, nsCtx, nsMeta, repairTimeRange, shard)

		require.Equal(t, namespaceID, resNamespace)
		require.Equal(t, resShard, shard)
		require.Equal(t, int64(2), resDiff.NumSeries)
		require.Equal(t, int64(3), resDiff.NumBlocks)

		checksumDiffSeries := resDiff.ChecksumDifferences.Series()
		require.Equal(t, 1, checksumDiffSeries.Len())
		series, exists := checksumDiffSeries.Get(ident.StringID("bar"))
		require.True(t, exists)
		blocks := series.Metadata.Blocks()
		require.Equal(t, 1, len(blocks))
		currBlock, exists := blocks[now.Add(30*time.Minute)]
		require.True(t, exists)
		require.Equal(t, now.Add(30*time.Minute), currBlock.Start())
		expected := []block.ReplicaMetadata{
			// Checksum difference for series "bar".
			{Host: topology.NewHost("0", "addr0"), Metadata: block.NewMetadata(ident.StringID("bar"), ident.Tags{}, now.Add(30*time.Minute), sizes[2], &checksums[2], lastRead)},
			{Host: topology.NewHost("1", "addr1"), Metadata: inBlocks[2].Metadata},
		}
		require.Equal(t, expected, currBlock.Metadata())

		sizeDiffSeries := resDiff.SizeDifferences.Series()
		require.Equal(t, 1, sizeDiffSeries.Len())
		series, exists = sizeDiffSeries.Get(ident.StringID("foo"))
		require.True(t, exists)
		blocks = series.Metadata.Blocks()
		require.Equal(t, 1, len(blocks))
		currBlock, exists = blocks[now.Add(time.Hour)]
		require.True(t, exists)
		require.Equal(t, now.Add(time.Hour), currBlock.Start())
		expected = []block.ReplicaMetadata{
			// Size difference for series "foo".
			{Host: topology.NewHost("0", "addr0"), Metadata: block.NewMetadata(ident.StringID("foo"), ident.Tags{}, now.Add(time.Hour), sizes[1], &checksums[1], lastRead)},
			{Host: topology.NewHost("1", "addr1"), Metadata: inBlocks[1].Metadata},
		}
		require.Equal(t, expected, currBlock.Metadata())
	}
}

type multiSessionTestMock struct {
	host    topology.Host
	client  *client.MockAdminClient
	session *client.MockAdminSession
	topoMap *topology.MockMap
}

func TestDatabaseShardRepairerRepairMultiSession(t *testing.T) {
	ctrl := xtest.NewController(t)
	defer ctrl.Finish()

	// Origin is always zero (on both clients) and hosts[0] and hosts[1]
	// represent other nodes in different clusters.
	origin := topology.NewHost("0", "addr0")
	mocks := []multiSessionTestMock{
		{
			host:    topology.NewHost("1", "addr1"),
			client:  client.NewMockAdminClient(ctrl),
			session: client.NewMockAdminSession(ctrl),
			topoMap: topology.NewMockMap(ctrl),
		},
		{
			host:    topology.NewHost("2", "addr2"),
			client:  client.NewMockAdminClient(ctrl),
			session: client.NewMockAdminSession(ctrl),
			topoMap: topology.NewMockMap(ctrl),
		},
	}

	var mockClients []client.AdminClient
	var hosts []topology.Host
	for _, mock := range mocks {
		mock.session.EXPECT().Origin().Return(origin).AnyTimes()
		mock.client.EXPECT().DefaultAdminSession().Return(mock.session, nil)
		mock.session.EXPECT().TopologyMap().Return(mock.topoMap, nil)
		mockClients = append(mockClients, mock.client)
		hosts = append(hosts, mock.host)
	}

	var (
		rpOpts = testRepairOptions(ctrl).
			SetAdminClients(mockClients)
		now    = xtime.Now()
		nowFn  = func() time.Time { return now.ToTime() }
		opts   = DefaultTestOptions()
		copts  = opts.ClockOptions()
		iopts  = opts.InstrumentOptions()
		rtopts = defaultTestRetentionOpts
		scope  = tally.NewTestScope("", nil)
	)

	opts = opts.
		SetClockOptions(copts.SetNowFn(nowFn)).
		SetInstrumentOptions(iopts.SetMetricsScope(scope))

	var (
		namespaceID     = ident.StringID("testNamespace")
		start           = now
		end             = now.Add(rtopts.BlockSize())
		repairTimeRange = xtime.Range{Start: start, End: end}
		fetchOpts       = block.FetchBlocksMetadataOptions{
			IncludeSizes:     true,
			IncludeChecksums: true,
			IncludeLastRead:  false,
		}

		sizes     = []int64{3423, 987, 8463, 578}
		checksums = []uint32{4, 5, 6, 7}
		lastRead  = now.Add(-time.Minute)
		shardID   = uint32(0)
		shard     = NewMockdatabaseShard(ctrl)
	)

	expectedResults := block.NewFetchBlocksMetadataResults()
	results := block.NewFetchBlockMetadataResults()
	results.Add(block.NewFetchBlockMetadataResult(now.Add(30*time.Minute),
		sizes[0], &checksums[0], lastRead, nil))
	results.Add(block.NewFetchBlockMetadataResult(now.Add(time.Hour),
		sizes[1], &checksums[1], lastRead, nil))
	expectedResults.Add(block.NewFetchBlocksMetadataResult(ident.StringID("foo"), nil, results))
	results = block.NewFetchBlockMetadataResults()
	results.Add(block.NewFetchBlockMetadataResult(now.Add(30*time.Minute),
		sizes[2], &checksums[2], lastRead, nil))
	expectedResults.Add(block.NewFetchBlocksMetadataResult(ident.StringID("bar"), nil, results))

	var (
		any             = gomock.Any()
		nonNilPageToken = PageToken("non-nil-page-token")
	)
	// Ensure that the Repair logic will call FetchBlocksMetadataV2 in a loop until
	// it receives a nil page token.
	shard.EXPECT().
		FetchBlocksMetadataV2(any, start, end, any, nil, fetchOpts).
		Return(nil, nonNilPageToken, nil)
	shard.EXPECT().
		FetchBlocksMetadataV2(any, start, end, any, nonNilPageToken, fetchOpts).
		Return(expectedResults, nil, nil)
	shard.EXPECT().ID().Return(shardID).AnyTimes()
	shard.EXPECT().LoadBlocks(gomock.Any()).Return(nil)

	inBlocks := []block.ReplicaMetadata{
		{
			// Peer block size sizes[2] is different from origin block size sizes[0].
			Metadata: block.NewMetadata(ident.StringID("foo"), ident.Tags{}, now.Add(30*time.Minute), sizes[2], &checksums[0], lastRead),
		},
		{
			// Peer block size sizes[3] is different from origin block size sizes[1].
			Metadata: block.NewMetadata(ident.StringID("foo"), ident.Tags{}, now.Add(time.Hour), sizes[3], &checksums[1], lastRead),
		},
		{
			// Mismatched checksum, so this should trigger a repair of this series.
			Metadata: block.NewMetadata(ident.StringID("bar"), ident.Tags{}, now.Add(30*time.Minute), sizes[2], &checksums[3], lastRead),
		},
	}

	for i, mock := range mocks {
		mockTopoMap := mock.topoMap
		for _, host := range hosts {
			iClosure := i
			mockTopoMap.EXPECT().LookupHostShardSet(host.ID()).DoAndReturn(func(id string) (topology.HostShardSet, bool) {
				if iClosure == 0 && id == hosts[0].ID() {
					return nil, true
				}
				if iClosure == 1 && id == hosts[1].ID() {
					return nil, true
				}
				return nil, false
			}).AnyTimes()
		}
	}

	nsMeta, err := namespace.NewMetadata(namespaceID, namespace.NewOptions())
	for i, mock := range mocks {
		session := mock.session
		// Make a copy of the input blocks where the host is set to the host for
		// the cluster associated with the current session.
		inBlocksForSession := make([]block.ReplicaMetadata, len(inBlocks))
		copy(inBlocksForSession, inBlocks)
		for j := range inBlocksForSession {
			inBlocksForSession[j].Host = hosts[i]
		}

		peerIter := client.NewMockPeerBlockMetadataIter(ctrl)
		gomock.InOrder(
			peerIter.EXPECT().Next().Return(true),
			peerIter.EXPECT().Current().Return(inBlocksForSession[0].Host, inBlocks[0].Metadata),
			peerIter.EXPECT().Next().Return(true),
			peerIter.EXPECT().Current().Return(inBlocksForSession[1].Host, inBlocks[1].Metadata),
			peerIter.EXPECT().Next().Return(true),
			peerIter.EXPECT().Current().Return(inBlocksForSession[2].Host, inBlocks[2].Metadata),
			peerIter.EXPECT().Next().Return(false),
			peerIter.EXPECT().Err().Return(nil),
		)
		session.EXPECT().
			FetchBlocksMetadataFromPeers(namespaceID, shardID, start, end,
				rpOpts.RepairConsistencyLevel(), gomock.Any()).
			Return(peerIter, nil)

		peerBlocksIter := client.NewMockPeerBlocksIter(ctrl)
		dbBlock1 := block.NewMockDatabaseBlock(ctrl)
		dbBlock1.EXPECT().StartTime().Return(inBlocksForSession[2].Metadata.Start).AnyTimes()
		dbBlock2 := block.NewMockDatabaseBlock(ctrl)
		dbBlock2.EXPECT().StartTime().Return(inBlocksForSession[2].Metadata.Start).AnyTimes()
		// Ensure merging logic works. Need AnyTimes() because Merge() will only be called on dbBlock1
		// for the first session (all subsequent blocks from other sessions will get merged into dbBlock1
		// from the first session).
		dbBlock1.EXPECT().Merge(dbBlock2).AnyTimes()
		gomock.InOrder(
			peerBlocksIter.EXPECT().Next().Return(true),
			peerBlocksIter.EXPECT().Current().
				Return(inBlocksForSession[2].Host, inBlocks[2].Metadata.ID, inBlocks[2].Metadata.Tags, dbBlock1),
			peerBlocksIter.EXPECT().Next().Return(true),
			peerBlocksIter.EXPECT().Current().
				Return(inBlocksForSession[2].Host, inBlocks[2].Metadata.ID, inBlocks[2].Metadata.Tags, dbBlock2),
			peerBlocksIter.EXPECT().Next().Return(false),
		)
		require.NoError(t, err)
		session.EXPECT().
			FetchBlocksFromPeers(nsMeta, shardID, rpOpts.RepairConsistencyLevel(), inBlocksForSession[2:], gomock.Any()).
			Return(peerBlocksIter, nil)
	}

	databaseShardRepairer := newShardRepairer(opts, rpOpts)
	repairer := databaseShardRepairer.(shardRepairer)

	var (
		ctx   = context.NewBackground()
		nsCtx = namespace.Context{ID: namespaceID}
	)
	resDiff, err := repairer.Repair(ctx, nsCtx, nsMeta, repairTimeRange, shard)

	require.NoError(t, err)
	require.Equal(t, int64(2), resDiff.NumSeries)
	require.Equal(t, int64(3), resDiff.NumBlocks)

	checksumDiffSeries := resDiff.ChecksumDifferences.Series()
	require.Equal(t, 1, checksumDiffSeries.Len())
	series, exists := checksumDiffSeries.Get(ident.StringID("bar"))
	require.True(t, exists)
	blocks := series.Metadata.Blocks()
	require.Equal(t, 1, len(blocks))
	currBlock, exists := blocks[now.Add(30*time.Minute)]
	require.True(t, exists)
	require.Equal(t, now.Add(30*time.Minute), currBlock.Start())
	expected := []block.ReplicaMetadata{
		// Checksum difference for series "bar".
		{Host: origin, Metadata: block.NewMetadata(ident.StringID("bar"), ident.Tags{}, now.Add(30*time.Minute), sizes[2], &checksums[2], lastRead)},
		{Host: hosts[0], Metadata: inBlocks[2].Metadata},
		{Host: hosts[1], Metadata: inBlocks[2].Metadata},
	}
	require.Equal(t, expected, currBlock.Metadata())

	sizeDiffSeries := resDiff.SizeDifferences.Series()
	require.Equal(t, 1, sizeDiffSeries.Len())
	series, exists = sizeDiffSeries.Get(ident.StringID("foo"))
	require.True(t, exists)
	blocks = series.Metadata.Blocks()
	require.Equal(t, 2, len(blocks))
	// Validate first block
	currBlock, exists = blocks[now.Add(30*time.Minute)]
	require.True(t, exists)
	require.Equal(t, now.Add(30*time.Minute), currBlock.Start())
	expected = []block.ReplicaMetadata{
		// Size difference for series "foo".
		{Host: origin, Metadata: block.NewMetadata(ident.StringID("foo"), ident.Tags{}, now.Add(30*time.Minute), sizes[0], &checksums[0], lastRead)},
		{Host: hosts[0], Metadata: inBlocks[0].Metadata},
		{Host: hosts[1], Metadata: inBlocks[0].Metadata},
	}
	require.Equal(t, expected, currBlock.Metadata())
	// Validate second block
	currBlock, exists = blocks[now.Add(time.Hour)]
	require.True(t, exists)
	require.Equal(t, now.Add(time.Hour), currBlock.Start())
	expected = []block.ReplicaMetadata{
		// Size difference for series "foo".
		{Host: origin, Metadata: block.NewMetadata(ident.StringID("foo"), ident.Tags{}, now.Add(time.Hour), sizes[1], &checksums[1], lastRead)},
		{Host: hosts[0], Metadata: inBlocks[1].Metadata},
		{Host: hosts[1], Metadata: inBlocks[1].Metadata},
	}
	require.Equal(t, expected, currBlock.Metadata())

	// Validate the expected metrics were emitted.
	scopeSnapshot := scope.Snapshot()
	countersSnapshot := scopeSnapshot.Counters()
	gaugesSnapshot := scopeSnapshot.Gauges()
	require.Equal(t, int64(2),
		countersSnapshot["repair.series+namespace=testNamespace,resultType=total,shard=0"].Value())
	require.Equal(t, int64(3),
		countersSnapshot["repair.blocks+namespace=testNamespace,resultType=total,shard=0"].Value())
	// Validate that the first block's divergence is emitted instead of the second block's, because the
	// first block has diverged more from its peers than the second block.
	scopeTags := map[string]string{"namespace": "testNamespace", "resultType": "sizeDiff", "shard": "0"}
	require.Equal(t, float64(sizes[0]-sizes[2]),
		gaugesSnapshot[tally.KeyForPrefixedStringMap("repair.max-block-size-diff", scopeTags)].Value())
	require.Equal(t, float64(100*(sizes[0]-sizes[2]))/float64(sizes[0]),
		gaugesSnapshot[tally.KeyForPrefixedStringMap("repair.max-block-size-diff-as-percentage", scopeTags)].Value())
}

type expectedRepair struct {
	expectedRepairRange xtime.Range
	mockRepairResult    error
}

func TestDatabaseRepairPrioritizationLogic(t *testing.T) {
	var (
		rOpts = retention.NewOptions().
			SetRetentionPeriod(retention.NewOptions().BlockSize() * 2)
		nsOpts = namespace.NewOptions().
			SetRetentionOptions(rOpts)
		blockSize = rOpts.BlockSize()

		// Set current time such that the previous block is flushable.
		now = xtime.Now().Truncate(blockSize).Add(rOpts.BufferPast()).Add(time.Second)

		flushTimeStart = retention.FlushTimeStart(rOpts, now)
		flushTimeEnd   = retention.FlushTimeEnd(rOpts, now)
	)
	require.NoError(t, nsOpts.Validate())
	// Ensure only two flushable blocks in retention to make the test logic simpler.
	require.Equal(t, blockSize, flushTimeEnd.Sub(flushTimeStart))

	testCases := []struct {
		title              string
		strategy           repair.Strategy
		repairState        repairStatesByNs
		expectedNS1Repairs []expectedRepair
		expectedNS2Repairs []expectedRepair
	}{
		{
			title:    "repairs most recent block if no repair state",
			strategy: repair.DefaultStrategy,
			expectedNS1Repairs: []expectedRepair{
				{expectedRepairRange: xtime.Range{Start: flushTimeEnd, End: flushTimeEnd.Add(blockSize)}},
			},
			expectedNS2Repairs: []expectedRepair{
				{expectedRepairRange: xtime.Range{Start: flushTimeEnd, End: flushTimeEnd.Add(blockSize)}},
			},
		},
		{
			title:    "repairs next unrepaired block in reverse order if some (but not all) blocks have been repaired",
			strategy: repair.DefaultStrategy,
			repairState: repairStatesByNs{
				"ns1": namespaceRepairStateByTime{
					flushTimeEnd: repairState{
						Status:      repairSuccess,
						LastAttempt: 0,
					},
				},
				"ns2": namespaceRepairStateByTime{
					flushTimeEnd: repairState{
						Status:      repairSuccess,
						LastAttempt: 0,
					},
				},
			},
			expectedNS1Repairs: []expectedRepair{
				{expectedRepairRange: xtime.Range{Start: flushTimeStart, End: flushTimeStart.Add(blockSize)}},
			},
			expectedNS2Repairs: []expectedRepair{
				{expectedRepairRange: xtime.Range{Start: flushTimeStart, End: flushTimeStart.Add(blockSize)}},
			},
		},
		{
			title:    "repairs least recently repaired block if all blocks have been repaired",
			strategy: repair.DefaultStrategy,
			repairState: repairStatesByNs{
				"ns1": namespaceRepairStateByTime{
					flushTimeStart: repairState{
						Status:      repairSuccess,
						LastAttempt: 0,
					},
					flushTimeEnd: repairState{
						Status:      repairSuccess,
						LastAttempt: xtime.UnixNano(time.Second),
					},
				},
				"ns2": namespaceRepairStateByTime{
					flushTimeStart: repairState{
						Status:      repairSuccess,
						LastAttempt: 0,
					},
					flushTimeEnd: repairState{
						Status:      repairSuccess,
						LastAttempt: xtime.UnixNano(time.Second),
					},
				},
			},
			expectedNS1Repairs: []expectedRepair{
				{expectedRepairRange: xtime.Range{Start: flushTimeStart, End: flushTimeStart.Add(blockSize)}},
			},
			expectedNS2Repairs: []expectedRepair{
				{expectedRepairRange: xtime.Range{Start: flushTimeStart, End: flushTimeStart.Add(blockSize)}},
			},
		},
		{
			title:    "repairs all blocks if no repair state with full sweep strategy",
			strategy: repair.FullSweepStrategy,
			expectedNS1Repairs: []expectedRepair{
				{expectedRepairRange: xtime.Range{Start: flushTimeEnd, End: flushTimeEnd.Add(blockSize)}},
				{expectedRepairRange: xtime.Range{Start: flushTimeStart, End: flushTimeStart.Add(blockSize)}},
			},
			expectedNS2Repairs: []expectedRepair{
				{expectedRepairRange: xtime.Range{Start: flushTimeEnd, End: flushTimeEnd.Add(blockSize)}},
				{expectedRepairRange: xtime.Range{Start: flushTimeStart, End: flushTimeStart.Add(blockSize)}},
			},
		},
	}

	for _, tc := range testCases {
		tc := tc
		t.Run(tc.title, func(t *testing.T) {
			ctrl := xtest.NewController(t)
			defer ctrl.Finish()

			repairOpts := testRepairOptions(ctrl).SetStrategy(tc.strategy)
			opts := DefaultTestOptions().SetRepairOptions(repairOpts)
			mockDatabase := NewMockdatabase(ctrl)

			databaseRepairer, err := newDatabaseRepairer(mockDatabase, opts)
			require.NoError(t, err)
			repairer := databaseRepairer.(*dbRepairer)
			repairer.nowFn = func() time.Time {
				return now.ToTime()
			}
			if tc.repairState == nil {
				tc.repairState = repairStatesByNs{}
			}
			repairer.repairStatesByNs = tc.repairState

			mockDatabase.EXPECT().IsBootstrapped().Return(true)

			var (
				ns1        = NewMockdatabaseNamespace(ctrl)
				ns2        = NewMockdatabaseNamespace(ctrl)
				namespaces = []databaseNamespace{ns1, ns2}
			)
			ns1.EXPECT().Options().Return(nsOpts).AnyTimes()
			ns2.EXPECT().Options().Return(nsOpts).AnyTimes()

			ns1.EXPECT().ID().Return(ident.StringID("ns1")).AnyTimes()
			ns2.EXPECT().ID().Return(ident.StringID("ns2")).AnyTimes()

			for _, expected := range tc.expectedNS1Repairs {
				ns1.EXPECT().Repair(gomock.Any(), expected.expectedRepairRange, NamespaceRepairOptions{})
			}
			for _, expected := range tc.expectedNS2Repairs {
				ns2.EXPECT().Repair(gomock.Any(), expected.expectedRepairRange, NamespaceRepairOptions{})
			}

			mockDatabase.EXPECT().OwnedNamespaces().Return(namespaces, nil)
			require.Nil(t, repairer.Repair())
		})
	}
}

// The database repairer repairs blocks in decreasing time-range order for each namespace. If the repairer
// fails to repair a time range of a namespace, this test verifies that, instead of skipping repair of all
// earlier time ranges of that namespace, it still attempts to repair them.
func TestDatabaseRepairSkipsPoisonShard(t *testing.T) {
	ctrl := xtest.NewController(t)
	defer ctrl.Finish()

	var (
		rOpts = retention.NewOptions().
			SetRetentionPeriod(retention.NewOptions().BlockSize() * 2)
		nsOpts = namespace.NewOptions().
			SetRetentionOptions(rOpts)
		blockSize = rOpts.BlockSize()

		// Set current time such that the previous block is flushable.
		now = xtime.Now().Truncate(blockSize).Add(rOpts.BufferPast()).Add(time.Second)

		flushTimeStart = retention.FlushTimeStart(rOpts, now)
		flushTimeEnd   = retention.FlushTimeEnd(rOpts, now)
	)
	require.NoError(t, nsOpts.Validate())
	// Ensure only two flushable blocks in retention to make the test logic simpler.
	require.Equal(t, blockSize, flushTimeEnd.Sub(flushTimeStart))

	testCases := []struct {
		title              string
		repairState        repairStatesByNs
		expectedNS1Repairs []expectedRepair
		expectedNS2Repairs []expectedRepair
	}{
		{
			// Test that a corrupt ns1 time range (flushTimeEnd, flushTimeEnd + blockSize) does not prevent past
			// time ranges (flushTimeStart, flushTimeStart + blockSize) from being repaired. Also test that the
			// least recently repaired policy is honored even when repairing one of the time ranges
			// (flushTimeStart, flushTimeStart + blockSize) on ns2 fails.
779 title: "attempts to keep repairing time ranges before poison time ranges", 780 repairState: repairStatesByNs{ 781 "ns2": namespaceRepairStateByTime{ 782 flushTimeEnd: repairState{ 783 Status: repairSuccess, 784 LastAttempt: 0, 785 }, 786 }, 787 }, 788 expectedNS1Repairs: []expectedRepair{ 789 { 790 xtime.Range{Start: flushTimeEnd, End: flushTimeEnd.Add(blockSize)}, 791 errors.New("ns1 repair error"), 792 }, 793 { 794 xtime.Range{Start: flushTimeStart, End: flushTimeStart.Add(blockSize)}, 795 nil, 796 }, 797 }, 798 expectedNS2Repairs: []expectedRepair{ 799 { 800 xtime.Range{Start: flushTimeStart, End: flushTimeStart.Add(blockSize)}, 801 errors.New("ns2 repair error"), 802 }, 803 { 804 xtime.Range{Start: flushTimeEnd, End: flushTimeEnd.Add(blockSize)}, 805 nil, 806 }, 807 }, 808 }, 809 } 810 811 for _, tc := range testCases { 812 t.Run(tc.title, func(t *testing.T) { 813 opts := DefaultTestOptions().SetRepairOptions(testRepairOptions(ctrl)) 814 mockDatabase := NewMockdatabase(ctrl) 815 816 databaseRepairer, err := newDatabaseRepairer(mockDatabase, opts) 817 require.NoError(t, err) 818 repairer := databaseRepairer.(*dbRepairer) 819 repairer.nowFn = func() time.Time { 820 return now.ToTime() 821 } 822 if tc.repairState == nil { 823 tc.repairState = repairStatesByNs{} 824 } 825 repairer.repairStatesByNs = tc.repairState 826 827 mockDatabase.EXPECT().IsBootstrapped().Return(true) 828 829 var ( 830 ns1 = NewMockdatabaseNamespace(ctrl) 831 ns2 = NewMockdatabaseNamespace(ctrl) 832 namespaces = []databaseNamespace{ns1, ns2} 833 ) 834 ns1.EXPECT().Options().Return(nsOpts).AnyTimes() 835 ns2.EXPECT().Options().Return(nsOpts).AnyTimes() 836 837 ns1.EXPECT().ID().Return(ident.StringID("ns1")).AnyTimes() 838 ns2.EXPECT().ID().Return(ident.StringID("ns2")).AnyTimes() 839 840 //Setup expected ns1 repair invocations for each repaired time range 841 var ns1RepairExpectations = make([]*gomock.Call, len(tc.expectedNS1Repairs)) 842 for i, ns1Repair := range tc.expectedNS1Repairs { 843 ns1RepairExpectations[i] = ns1.EXPECT(). 844 Repair(gomock.Any(), ns1Repair.expectedRepairRange, NamespaceRepairOptions{}). 845 Return(ns1Repair.mockRepairResult) 846 } 847 gomock.InOrder(ns1RepairExpectations...) 848 849 //Setup expected ns2 repair invocations for each repaired time range 850 var ns2RepairExpectations = make([]*gomock.Call, len(tc.expectedNS2Repairs)) 851 for i, ns2Repair := range tc.expectedNS2Repairs { 852 ns2RepairExpectations[i] = ns2.EXPECT(). 853 Repair(gomock.Any(), ns2Repair.expectedRepairRange, NamespaceRepairOptions{}). 854 Return(ns2Repair.mockRepairResult) 855 } 856 gomock.InOrder(ns2RepairExpectations...) 857 858 mockDatabase.EXPECT().OwnedNamespaces().Return(namespaces, nil) 859 860 require.NotNil(t, repairer.Repair()) 861 }) 862 } 863 }