github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/ingester/wal_test.go

package ingester

import (
	"context"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"path/filepath"
	"testing"
	"time"

	"github.com/grafana/dskit/services"
	prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/pkg/labels"
	"github.com/stretchr/testify/require"
	"github.com/weaveworks/common/httpgrpc"
	"github.com/weaveworks/common/user"

	"github.com/cortexproject/cortex/pkg/cortexpb"
)

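// TestWAL pushes samples, restarts the ingester several times and verifies
// after every restart that all previously pushed samples are recovered from
// the checkpoint and/or the WAL. It also checks that the in-order sample of a
// partially rejected (out-of-order) push request is persisted in the WAL.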
func TestWAL(t *testing.T) {
	dirname, err := ioutil.TempDir("", "cortex-wal")
	require.NoError(t, err)
	defer func() {
		require.NoError(t, os.RemoveAll(dirname))
	}()

	cfg := defaultIngesterTestConfig(t)
	cfg.WALConfig.WALEnabled = true
	cfg.WALConfig.CheckpointEnabled = true
	cfg.WALConfig.Recover = true
	cfg.WALConfig.Dir = dirname
	cfg.WALConfig.CheckpointDuration = 100 * time.Minute
	cfg.WALConfig.checkpointDuringShutdown = true

	numSeries := 100
	numSamplesPerSeriesPerPush := 10
	numRestarts := 5

	// Build an ingester, add some samples, then shut it down.
	_, ing := newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)
	userIDs, testData := pushTestSamples(t, ing, numSeries, numSamplesPerSeriesPerPush, 0)
	// Checkpoint happens when stopping.
	require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))

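	// Each iteration restarts the ingester, replays the WAL (and checkpoint, when
	// enabled), verifies the previously pushed samples, and, except on the last
	// restart, pushes another batch for the next round.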
	for r := 0; r < numRestarts; r++ {
		if r == 2 {
			// From the 3rd restart onwards, disable checkpointing during shutdown
			// so that replay is exercised from both the checkpoint and the WAL.
			cfg.WALConfig.checkpointDuringShutdown = false
		}
		if r == numRestarts-1 {
			cfg.WALConfig.WALEnabled = false
			cfg.WALConfig.CheckpointEnabled = false
		}

		// Start a new ingester and recover the WAL.
		_, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)

		for i, userID := range userIDs {
			testData[userID] = buildTestMatrix(numSeries, (r+1)*numSamplesPerSeriesPerPush, i)
		}
		// Check the samples are still there!
		retrieveTestSamples(t, ing, userIDs, testData)

		if r != numRestarts-1 {
			userIDs, testData = pushTestSamples(t, ing, numSeries, numSamplesPerSeriesPerPush, (r+1)*numSamplesPerSeriesPerPush)
		}

		require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))
	}

	cfg.WALConfig.WALEnabled = true
	cfg.WALConfig.CheckpointEnabled = true

	// Start a new ingester and recover the WAL.
	_, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)

	userID := userIDs[0]
	sampleStream := testData[userID][0]
	lastSample := sampleStream.Values[len(sampleStream.Values)-1]

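	// The push below must fail with a 400 because of the out-of-order sample,
	// but the in-order sample should still be written to the WAL.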
	// An in-order and an out-of-order sample in the same request.
	metric := cortexpb.FromLabelAdaptersToLabels(cortexpb.FromMetricsToLabelAdapters(sampleStream.Metric))
	outOfOrderSample := cortexpb.Sample{TimestampMs: int64(lastSample.Timestamp - 10), Value: 99}
	inOrderSample := cortexpb.Sample{TimestampMs: int64(lastSample.Timestamp + 10), Value: 999}

	ctx := user.InjectOrgID(context.Background(), userID)
	_, err = ing.Push(ctx, cortexpb.ToWriteRequest(
		[]labels.Labels{metric, metric},
		[]cortexpb.Sample{outOfOrderSample, inOrderSample}, nil, cortexpb.API))
	require.Equal(t, httpgrpc.Errorf(http.StatusBadRequest, wrapWithUser(makeMetricValidationError(sampleOutOfOrder, metric,
		fmt.Errorf("sample timestamp out of order; last timestamp: %v, incoming timestamp: %v", lastSample.Timestamp, model.Time(outOfOrderSample.TimestampMs))), userID).Error()), err)
	// We should have logged the in-order sample.
	testData[userID][0].Values = append(testData[userID][0].Values, model.SamplePair{
		Timestamp: model.Time(inOrderSample.TimestampMs),
		Value:     model.SampleValue(inOrderSample.Value),
	})

	// Check samples after restart from WAL.
	require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))
	_, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)
	retrieveTestSamples(t, ing, userIDs, testData)
}

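// TestCheckpointRepair corrupts the most recent checkpoint on disk and verifies
// that a restarting ingester detects the corruption, deletes the corrupt
// checkpoint and still recovers all samples from what remains on disk.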
func TestCheckpointRepair(t *testing.T) {
	cfg := defaultIngesterTestConfig(t)
	cfg.WALConfig.WALEnabled = true
	cfg.WALConfig.CheckpointEnabled = true
	cfg.WALConfig.Recover = true
	cfg.WALConfig.CheckpointDuration = 100 * time.Hour // Basically no automatic checkpoint.

	numSeries := 100
	numSamplesPerSeriesPerPush := 10
	for _, numCheckpoints := range []int{0, 1, 2, 3} {
		dirname, err := ioutil.TempDir("", "cortex-wal")
		require.NoError(t, err)
		defer func() {
			require.NoError(t, os.RemoveAll(dirname))
		}()
		cfg.WALConfig.Dir = dirname

		// Build an ingester, add some samples, then shut it down.
		_, ing := newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)

		w, ok := ing.wal.(*walWrapper)
		require.True(t, ok)

		var userIDs []string
		// Push some samples for the 0-checkpoints case.
		// We don't shut down the ingester in that case, else it would create a checkpoint.
		userIDs, _ = pushTestSamples(t, ing, numSeries, numSamplesPerSeriesPerPush, 0)
		for i := 0; i < numCheckpoints; i++ {
			// Push more samples, then create a checkpoint.
			userIDs, _ = pushTestSamples(t, ing, numSeries, numSamplesPerSeriesPerPush, (i+1)*numSamplesPerSeriesPerPush)
			if i == numCheckpoints-1 {
				// Shutting down creates a checkpoint; do this only for the last checkpoint.
				require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))
			} else {
				require.NoError(t, w.performCheckpoint(true))
			}
		}

		require.Equal(t, float64(numCheckpoints), prom_testutil.ToFloat64(w.checkpointCreationTotal))

		// Verify checkpoint dirs.
		files, err := ioutil.ReadDir(w.wal.Dir())
		require.NoError(t, err)
		numDirs := 0
		for _, f := range files {
			if f.IsDir() {
				numDirs++
			}
		}
		if numCheckpoints <= 1 {
			require.Equal(t, numCheckpoints, numDirs)
		} else {
			// At most the last 2 checkpoints are kept on disk.
			require.Equal(t, 2, numDirs)
		}

		if numCheckpoints > 0 {
			// Corrupt the last checkpoint.
			lastChDir, _, err := lastCheckpoint(w.wal.Dir())
			require.NoError(t, err)
			files, err = ioutil.ReadDir(lastChDir)
			require.NoError(t, err)

			lastFile, err := os.OpenFile(filepath.Join(lastChDir, files[len(files)-1].Name()), os.O_WRONLY, os.ModeAppend)
			require.NoError(t, err)
			n, err := lastFile.WriteAt([]byte{1, 2, 3, 4}, 2)
			require.NoError(t, err)
			require.Equal(t, 4, n)
			require.NoError(t, lastFile.Close())
		}

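		// Starting a new ingester should detect the corruption (when there is a
		// checkpoint to corrupt), record it in walCorruptionsTotal, delete the
		// corrupt checkpoint and recover the data from the remaining checkpoint
		// (if any) and the WAL.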
		// Open an ingester for the repair.
		_, ing = newTestStore(t, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)
		w, ok = ing.wal.(*walWrapper)
		require.True(t, ok)
		// Defer the shutdown in case we hit an error below, though we explicitly stop it later.
		defer func() {
			require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))
		}()

		if numCheckpoints > 0 {
			require.Equal(t, 1.0, prom_testutil.ToFloat64(ing.metrics.walCorruptionsTotal))
		} else {
			require.Equal(t, 0.0, prom_testutil.ToFloat64(ing.metrics.walCorruptionsTotal))
		}

		// Verify checkpoint dirs after the corrupt checkpoint is deleted.
		files, err = ioutil.ReadDir(w.wal.Dir())
		require.NoError(t, err)
		numDirs = 0
		for _, f := range files {
			if f.IsDir() {
				numDirs++
			}
		}
		if numCheckpoints <= 1 {
			// Either the only checkpoint was removed or there was no checkpoint at all.
			require.Equal(t, 0, numDirs)
		} else {
			// At most the last 2 checkpoints are kept, so only 1 should remain.
			require.Equal(t, 1, numDirs)
		}

		testData := map[string]model.Matrix{}
		// Verify we did not lose any data.
		for i, userID := range userIDs {
			// '(numCheckpoints+1)*' because the data was pushed numCheckpoints+1 times:
			// once before the loop and once per checkpoint.
			testData[userID] = buildTestMatrix(numSeries, (numCheckpoints+1)*numSamplesPerSeriesPerPush, i)
		}
		retrieveTestSamples(t, ing, userIDs, testData)

		require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))
	}

}

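// TestCheckpointIndex checks that checkpointIndex extracts the numeric index
// from checkpoint directory names, optionally accepting a ".tmp" suffix, and
// returns an error for names that do not match the expected pattern.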
func TestCheckpointIndex(t *testing.T) {
	tcs := []struct {
		filename    string
		includeTmp  bool
		index       int
		shouldError bool
	}{
		{
			filename:    "checkpoint.123456",
			includeTmp:  false,
			index:       123456,
			shouldError: false,
		},
		{
			filename:    "checkpoint.123456",
			includeTmp:  true,
			index:       123456,
			shouldError: false,
		},
		{
			filename:    "checkpoint.123456.tmp",
			includeTmp:  true,
			index:       123456,
			shouldError: false,
		},
		{
			filename:    "checkpoint.123456.tmp",
			includeTmp:  false,
			shouldError: true,
		},
		{
			filename:    "not-checkpoint.123456.tmp",
			includeTmp:  true,
			shouldError: true,
		},
		{
			filename:    "checkpoint.123456.tmp2",
			shouldError: true,
		},
		{
			filename:    "checkpoints123456",
			shouldError: true,
		},
		{
			filename:    "012345",
			shouldError: true,
		},
	}
	for _, tc := range tcs {
		index, err := checkpointIndex(tc.filename, tc.includeTmp)
		if tc.shouldError {
			require.Error(t, err, "filename: %s, includeTmp: %t", tc.filename, tc.includeTmp)
			continue
		}

		require.NoError(t, err, "filename: %s, includeTmp: %t", tc.filename, tc.includeTmp)
		require.Equal(t, tc.index, index)
	}
}

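// BenchmarkWALReplay builds a large checkpoint plus additional WAL records and
// measures how long a fresh ingester takes to replay them on startup.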
func BenchmarkWALReplay(b *testing.B) {
	dirname, err := ioutil.TempDir("", "cortex-wal")
	require.NoError(b, err)
	defer func() {
		require.NoError(b, os.RemoveAll(dirname))
	}()

	cfg := defaultIngesterTestConfig(b)
	cfg.WALConfig.WALEnabled = true
	cfg.WALConfig.CheckpointEnabled = true
	cfg.WALConfig.Recover = true
	cfg.WALConfig.Dir = dirname
	cfg.WALConfig.CheckpointDuration = 100 * time.Minute
	cfg.WALConfig.checkpointDuringShutdown = false

	numSeries := 10
	numSamplesPerSeriesPerPush := 2
	numPushes := 100000

	_, ing := newTestStore(b, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)

	// Add samples for the checkpoint.
	for r := 0; r < numPushes; r++ {
		_, _ = pushTestSamples(b, ing, numSeries, numSamplesPerSeriesPerPush, r*numSamplesPerSeriesPerPush)
	}
	w, ok := ing.wal.(*walWrapper)
	require.True(b, ok)
	require.NoError(b, w.performCheckpoint(true))

	// Add more samples that will only be in the WAL and not in the checkpoint.
	for r := 0; r < numPushes; r++ {
		_, _ = pushTestSamples(b, ing, numSeries, numSamplesPerSeriesPerPush, (numPushes+r)*numSamplesPerSeriesPerPush)
	}

	require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ing))

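	// Only the startup of the new ingester, i.e. the checkpoint + WAL replay,
	// is measured by the sub-benchmark.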
	var ing2 *Ingester
	b.Run("wal replay", func(b *testing.B) {
		// Replay will happen here.
		_, ing2 = newTestStore(b, cfg, defaultClientTestConfig(), defaultLimitsTestConfig(), nil)
	})
	require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ing2))
}