github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/ingester/wal_test.go

package ingester

import (
	"context"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"path/filepath"
	"testing"
	"time"

	"github.com/grafana/dskit/services"
	prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/pkg/labels"
	"github.com/stretchr/testify/require"
	"github.com/weaveworks/common/httpgrpc"
	"github.com/weaveworks/common/user"

	"github.com/cortexproject/cortex/pkg/cortexpb"
)

// TestWAL exercises WAL and checkpoint recovery across several ingester restarts,
// including restarts with checkpointing-on-shutdown disabled and with the WAL disabled.
func TestWAL(t *testing.T) {
	dirname, err := ioutil.TempDir("", "cortex-wal")
	require.NoError(t, err)
	defer func() {
		require.NoError(t, os.RemoveAll(dirname))
	}()

	cfg := defaultIngesterTestConfig(t)
	cfg.WALConfig.WALEnabled = true
	cfg.WALConfig.CheckpointEnabled = true
	cfg.WALConfig.Recover = true
	cfg.WALConfig.Dir = dirname
	cfg.WALConfig.CheckpointDuration = 100 * time.Minute
	cfg.WALConfig.checkpointDuringShutdown = true

	numSeries := 100
	numSamplesPerSeriesPerPush := 10
	numRestarts := 5

	// Build an ingester, add some samples, then shut it down.
	_, ing := newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)
	userIDs, testData := pushTestSamples(t, ing, numSeries, numSamplesPerSeriesPerPush, 0)
	// Checkpoint happens when stopping.
	require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))

	for r := 0; r < numRestarts; r++ {
		if r == 2 {
			// From the 3rd restart onwards, disable checkpointing during shutdown
			// to test replay from both the checkpoint and the WAL.
			cfg.WALConfig.checkpointDuringShutdown = false
		}
		if r == numRestarts-1 {
			cfg.WALConfig.WALEnabled = false
			cfg.WALConfig.CheckpointEnabled = false
		}

		// Start a new ingester and recover the WAL.
		_, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)

		for i, userID := range userIDs {
			testData[userID] = buildTestMatrix(numSeries, (r+1)*numSamplesPerSeriesPerPush, i)
		}
		// Check the samples are still there!
		retrieveTestSamples(t, ing, userIDs, testData)

		if r != numRestarts-1 {
			userIDs, testData = pushTestSamples(t, ing, numSeries, numSamplesPerSeriesPerPush, (r+1)*numSamplesPerSeriesPerPush)
		}

		require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))
	}

	cfg.WALConfig.WALEnabled = true
	cfg.WALConfig.CheckpointEnabled = true

	// Start a new ingester and recover the WAL.
	_, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)

	userID := userIDs[0]
	sampleStream := testData[userID][0]
	lastSample := sampleStream.Values[len(sampleStream.Values)-1]

	// In-order and out-of-order samples in the same request.
	metric := cortexpb.FromLabelAdaptersToLabels(cortexpb.FromMetricsToLabelAdapters(sampleStream.Metric))
	outOfOrderSample := cortexpb.Sample{TimestampMs: int64(lastSample.Timestamp - 10), Value: 99}
	inOrderSample := cortexpb.Sample{TimestampMs: int64(lastSample.Timestamp + 10), Value: 999}

	ctx := user.InjectOrgID(context.Background(), userID)
	_, err = ing.Push(ctx, cortexpb.ToWriteRequest(
		[]labels.Labels{metric, metric},
		[]cortexpb.Sample{outOfOrderSample, inOrderSample}, nil, cortexpb.API))
	require.Equal(t, httpgrpc.Errorf(http.StatusBadRequest, wrapWithUser(makeMetricValidationError(sampleOutOfOrder, metric,
		fmt.Errorf("sample timestamp out of order; last timestamp: %v, incoming timestamp: %v", lastSample.Timestamp, model.Time(outOfOrderSample.TimestampMs))), userID).Error()), err)

	// We should have logged the in-order sample.
	testData[userID][0].Values = append(testData[userID][0].Values, model.SamplePair{
		Timestamp: model.Time(inOrderSample.TimestampMs),
		Value:     model.SampleValue(inOrderSample.Value),
	})

	// Check samples after restart from WAL.
	require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))
	_, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)
	retrieveTestSamples(t, ing, userIDs, testData)
}

// TestCheckpointRepair corrupts the most recent checkpoint (when one exists) and verifies
// that the ingester detects the corruption, deletes the bad checkpoint, and still recovers
// all data from the remaining checkpoint and WAL.
func TestCheckpointRepair(t *testing.T) {
	cfg := defaultIngesterTestConfig(t)
	cfg.WALConfig.WALEnabled = true
	cfg.WALConfig.CheckpointEnabled = true
	cfg.WALConfig.Recover = true
	cfg.WALConfig.CheckpointDuration = 100 * time.Hour // Basically no automatic checkpoint.

	numSeries := 100
	numSamplesPerSeriesPerPush := 10
	for _, numCheckpoints := range []int{0, 1, 2, 3} {
		dirname, err := ioutil.TempDir("", "cortex-wal")
		require.NoError(t, err)
		defer func() {
			require.NoError(t, os.RemoveAll(dirname))
		}()
		cfg.WALConfig.Dir = dirname

		// Build an ingester, add some samples, then shut it down.
		_, ing := newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)

		w, ok := ing.wal.(*walWrapper)
		require.True(t, ok)

		var userIDs []string
		// Push some samples for the 0-checkpoints case.
		// We don't shut down the ingester in that case, else it would create a checkpoint.
		userIDs, _ = pushTestSamples(t, ing, numSeries, numSamplesPerSeriesPerPush, 0)
		for i := 0; i < numCheckpoints; i++ {
			// Push more samples before each checkpoint.
			userIDs, _ = pushTestSamples(t, ing, numSeries, numSamplesPerSeriesPerPush, (i+1)*numSamplesPerSeriesPerPush)
			if i == numCheckpoints-1 {
				// Shutdown creates a checkpoint. This is only for the last checkpoint.
				require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))
			} else {
				require.NoError(t, w.performCheckpoint(true))
			}
		}

		require.Equal(t, float64(numCheckpoints), prom_testutil.ToFloat64(w.checkpointCreationTotal))

		// Verify checkpoint dirs.
		files, err := ioutil.ReadDir(w.wal.Dir())
		require.NoError(t, err)
		numDirs := 0
		for _, f := range files {
			if f.IsDir() {
				numDirs++
			}
		}
		if numCheckpoints <= 1 {
			require.Equal(t, numCheckpoints, numDirs)
		} else {
			// At most the last 2 checkpoints are kept on disk.
			require.Equal(t, 2, numDirs)
		}

		if numCheckpoints > 0 {
			// Corrupt the last checkpoint.
			lastChDir, _, err := lastCheckpoint(w.wal.Dir())
			require.NoError(t, err)
			files, err = ioutil.ReadDir(lastChDir)
			require.NoError(t, err)

			lastFile, err := os.OpenFile(filepath.Join(lastChDir, files[len(files)-1].Name()), os.O_WRONLY, os.ModeAppend)
			require.NoError(t, err)
			n, err := lastFile.WriteAt([]byte{1, 2, 3, 4}, 2)
			require.NoError(t, err)
			require.Equal(t, 4, n)
			require.NoError(t, lastFile.Close())
		}

		// Open an ingester for the repair.
		_, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)
		w, ok = ing.wal.(*walWrapper)
		require.True(t, ok)
		// defer in case we hit an error, though we explicitly close it later.
		defer func() {
			require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))
		}()

		if numCheckpoints > 0 {
			require.Equal(t, 1.0, prom_testutil.ToFloat64(ing.metrics.walCorruptionsTotal))
		} else {
			require.Equal(t, 0.0, prom_testutil.ToFloat64(ing.metrics.walCorruptionsTotal))
		}

		// Verify checkpoint dirs after the corrupt checkpoint is deleted.
		files, err = ioutil.ReadDir(w.wal.Dir())
		require.NoError(t, err)
		numDirs = 0
		for _, f := range files {
			if f.IsDir() {
				numDirs++
			}
		}
		if numCheckpoints <= 1 {
			// The only checkpoint was removed, or there was no checkpoint at all.
			require.Equal(t, 0, numDirs)
		} else {
			// At most the last 2 checkpoints are kept, hence only 1 should remain.
			require.Equal(t, 1, numDirs)
		}

		testData := map[string]model.Matrix{}
		// Verify we did not lose any data.
		for i, userID := range userIDs {
			// '(numCheckpoints+1)*' because we ingested the data numCheckpoints+1 times.
			testData[userID] = buildTestMatrix(numSeries, (numCheckpoints+1)*numSamplesPerSeriesPerPush, i)
		}
		retrieveTestSamples(t, ing, userIDs, testData)

		require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))
	}

}

// TestCheckpointIndex verifies parsing of checkpoint directory names into their indexes.
func TestCheckpointIndex(t *testing.T) {
	tcs := []struct {
		filename    string
		includeTmp  bool
		index       int
		shouldError bool
	}{
		{
			filename:    "checkpoint.123456",
			includeTmp:  false,
			index:       123456,
			shouldError: false,
		},
		{
			filename:    "checkpoint.123456",
			includeTmp:  true,
			index:       123456,
			shouldError: false,
		},
		{
			filename:    "checkpoint.123456.tmp",
			includeTmp:  true,
			index:       123456,
			shouldError: false,
		},
		{
			filename:    "checkpoint.123456.tmp",
			includeTmp:  false,
			shouldError: true,
		},
		{
			filename:    "not-checkpoint.123456.tmp",
			includeTmp:  true,
			shouldError: true,
		},
		{
			filename:    "checkpoint.123456.tmp2",
			shouldError: true,
		},
		{
			filename:    "checkpoints123456",
			shouldError: true,
		},
		{
			filename:    "012345",
			shouldError: true,
		},
	}
	for _, tc := range tcs {
		index, err := checkpointIndex(tc.filename, tc.includeTmp)
		if tc.shouldError {
			require.Error(t, err, "filename: %s, includeTmp: %t", tc.filename, tc.includeTmp)
			continue
		}

		require.NoError(t, err, "filename: %s, includeTmp: %t", tc.filename, tc.includeTmp)
		require.Equal(t, tc.index, index)
	}
}

// BenchmarkWALReplay measures how long a new ingester takes to replay a checkpoint
// plus the WAL records written after it.
func BenchmarkWALReplay(b *testing.B) {
	dirname, err := ioutil.TempDir("", "cortex-wal")
	require.NoError(b, err)
	defer func() {
		require.NoError(b, os.RemoveAll(dirname))
	}()

	cfg := defaultIngesterTestConfig(b)
	cfg.WALConfig.WALEnabled = true
	cfg.WALConfig.CheckpointEnabled = true
	cfg.WALConfig.Recover = true
	cfg.WALConfig.Dir = dirname
	cfg.WALConfig.CheckpointDuration = 100 * time.Minute
	cfg.WALConfig.checkpointDuringShutdown = false

	numSeries := 10
	numSamplesPerSeriesPerPush := 2
	numPushes := 100000

	_, ing := newTestStore(b, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)

	// Add samples for the checkpoint.
	for r := 0; r < numPushes; r++ {
		_, _ = pushTestSamples(b, ing, numSeries, numSamplesPerSeriesPerPush, r*numSamplesPerSeriesPerPush)
	}
	w, ok := ing.wal.(*walWrapper)
	require.True(b, ok)
	require.NoError(b, w.performCheckpoint(true))

	// Add samples for the additional WAL not in the checkpoint.
	for r := 0; r < numPushes; r++ {
		_, _ = pushTestSamples(b, ing, numSeries, numSamplesPerSeriesPerPush, (numPushes+r)*numSamplesPerSeriesPerPush)
	}

	require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ing))

	var ing2 *Ingester
	b.Run("wal replay", func(b *testing.B) {
		// Replay will happen here.
		_, ing2 = newTestStore(b, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)
	})
	require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ing2))
}