github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/ingester/flush_test.go

package ingester

import (
	"context"
	"fmt"
	"io/ioutil"
	"os"
	"testing"
	"time"

	"github.com/go-kit/log"
	"github.com/grafana/dskit/kv"
	"github.com/grafana/dskit/ring"
	"github.com/grafana/dskit/services"
	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/pkg/labels"
	"github.com/stretchr/testify/require"
	"github.com/weaveworks/common/user"
	"go.uber.org/atomic"

	"github.com/cortexproject/cortex/pkg/chunk"
	"github.com/cortexproject/cortex/pkg/cortexpb"
	"github.com/cortexproject/cortex/pkg/ingester/client"
	"github.com/cortexproject/cortex/pkg/util"
	"github.com/cortexproject/cortex/pkg/util/validation"
)

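// singleTestLabel is the single labelset used for all samples pushed in these tests.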
var singleTestLabel = []labels.Labels{[]labels.Label{{Name: "__name__", Value: "test"}}}

// This test case demonstrates the problem of losing incoming samples while chunks are flushed in "immediate" mode.
func TestSweepImmediateDropsSamples(t *testing.T) {
	cfg := emptyIngesterConfig()
	cfg.FlushCheckPeriod = 1 * time.Minute
	cfg.RetainPeriod = 10 * time.Millisecond

	st := &sleepyCountingStore{}
	ing := createTestIngester(t, cfg, st)

	samples := newSampleGenerator(t, time.Now(), time.Millisecond)

	// Generates one sample.
	pushSample(t, ing, <-samples)

	notify := make(chan struct{})
	ing.preFlushChunks = func() {
		if ing.State() == services.Running {
			pushSample(t, ing, <-samples)
			notify <- struct{}{}
		}
	}

	// Simulate /flush. This sweeps everything, but also pushes another sample (in preFlushChunks).
	ing.sweepUsers(true)
	<-notify // Wait for flushing to happen.

	// Stopping the ingester should sweep the remaining samples.
	require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))

	require.Equal(t, 2, st.samples)
}

// There are several factors in this panic:
// - The chunk is first flushed normally.
// - "/flush" is called (sweepUsers(true)), which causes a new flush of the already flushed chunks.
// - During the flush to the store (in flushChunks), the chunk is removed from the list of chunks (and its reference is nil-ed) in removeFlushedChunks.
// - After flushing to the store, the reference is nil, causing a panic.
func TestFlushPanicIssue2743(t *testing.T) {
	cfg := emptyIngesterConfig()
	cfg.FlushCheckPeriod = 50 * time.Millisecond // We want to check for flushable and removable chunks often.
	cfg.RetainPeriod = 500 * time.Millisecond    // Remove flushed chunks quickly. This triggers nil-ing. To get a panic, it must happen while the store is "writing" chunks. (We use the "sleepy store" to enforce that.)
	cfg.MaxChunkAge = 1 * time.Hour              // We don't use max chunk age for this test, as that is jittered.
	cfg.MaxChunkIdle = 200 * time.Millisecond    // Flush a chunk 200ms after adding its last sample.

	st := &sleepyCountingStore{d: 1 * time.Second} // Longer than the retain period.

	ing := createTestIngester(t, cfg, st)
	samples := newSampleGenerator(t, time.Now(), 1*time.Second)

	notifyCh := make(chan bool, 10)
	ing.preFlushChunks = func() {
		select {
		case notifyCh <- true:
		default:
		}
	}

	// Generates one sample.
	pushSample(t, ing, <-samples)

	// Wait until the regular flush kicks in (flushing due to the chunk being idle).
	<-notifyCh

	// Sweep again -- this causes the same chunks to be queued for flushing again.
	// We must hit this *before* the flushed chunk is removed from the list of chunks (RetainPeriod).
	// While the store is flushing (simulated by the sleep in the store), the previously flushed chunk is removed from memory.
	ing.sweepUsers(true)

	// Wait a bit for flushing to end. In the buggy version, we get a panic while waiting.
	time.Sleep(2 * time.Second)
}

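// pushSample pushes a single sample for the test user and fails the test if the push returns an error.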
func pushSample(t *testing.T, ing *Ingester, sample cortexpb.Sample) {
	_, err := ing.Push(user.InjectOrgID(context.Background(), userID), cortexpb.ToWriteRequest(singleTestLabel, []cortexpb.Sample{sample}, nil, cortexpb.API))
	require.NoError(t, err)
}

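// createTestIngester starts an ingester that uses the given chunk store and default limits, and registers a cleanup hook that stops it when the test finishes.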
func createTestIngester(t *testing.T, cfg Config, store ChunkStore) *Ingester {
	l := validation.Limits{}
	overrides, err := validation.NewOverrides(l, nil)
	require.NoError(t, err)

	ing, err := New(cfg, client.Config{}, overrides, store, nil, log.NewNopLogger())
	require.NoError(t, err)

	require.NoError(t, services.StartAndAwaitRunning(context.Background(), ing))
	t.Cleanup(func() {
		_ = services.StopAndAwaitTerminated(context.Background(), ing)
	})

	return ing
}

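// sleepyCountingStore is a chunk store stub that optionally sleeps in Put and counts the samples it receives.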
type sleepyCountingStore struct {
	d       time.Duration
	samples int
}

func (m *sleepyCountingStore) Put(_ context.Context, chunks []chunk.Chunk) error {
	if m.d > 0 {
		time.Sleep(m.d)
	}

	for _, c := range chunks {
		m.samples += c.Data.Len()
	}
	return nil
}

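// emptyIngesterConfig returns a minimal ingester config backed by an in-memory ring KV store; each test overrides the flush-related fields it cares about.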
func emptyIngesterConfig() Config {
	return Config{
		WALConfig: WALConfig{},
		LifecyclerConfig: ring.LifecyclerConfig{
			RingConfig: ring.Config{
				KVStore: kv.Config{
					Store: "inmemory",
				},
				ReplicationFactor: 1,
			},
			InfNames:        []string{"en0", "eth0", "lo0", "lo"},
			HeartbeatPeriod: 10 * time.Second,
		},

		ConcurrentFlushes:               1,               // Single queue only. Doesn't really matter for these tests (the same series is always flushed by the same worker), but must be positive.
		RateUpdatePeriod:                1 * time.Hour,   // Must be positive, doesn't matter for these tests.
		ActiveSeriesMetricsUpdatePeriod: 5 * time.Minute, // Must be positive.
	}
}

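// newSampleGenerator returns a channel that produces samples whose timestamps start at initTime and advance by step on every receive. The generating goroutine is stopped via t.Cleanup.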
func newSampleGenerator(t *testing.T, initTime time.Time, step time.Duration) <-chan cortexpb.Sample {
	ts := make(chan cortexpb.Sample)

	ctx, cancel := context.WithCancel(context.Background())
	t.Cleanup(cancel)

	go func(ctx context.Context) {
		c := initTime
		for {
			select {
			case ts <- cortexpb.Sample{Value: 0, TimestampMs: util.TimeToMillis(c)}:
			case <-ctx.Done():
				return
			}

			c = c.Add(step)
		}
	}(ctx)

	return ts
}

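// TestFlushReasonString checks that every defined flushReason has a non-empty string representation.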
func TestFlushReasonString(t *testing.T) {
	for fr := flushReason(0); fr < maxFlushReason; fr++ {
		require.True(t, len(fr.String()) > 0)
	}
}

// Issue 3139 depends on the timing between the immediate flush and the periodic flush, and on the fact that "immediate" chunks get queued behind "idle" chunks.
// The periodic flush may still find "idle" chunks and put them onto the queue, because the "ingester for flusher" still runs all the loops.
// When the flush of an "immediate" chunk fails (e.g. due to a storage error), it is put back onto the queue, but behind the idle chunk.
// When the idle chunks are handled, they are compared against the user limit (MinChunkLength), which panics -- because we were not setting limits.
func TestIssue3139(t *testing.T) {
	dir, err := ioutil.TempDir("", "wal")
	require.NoError(t, err)
	t.Cleanup(func() {
		_ = os.RemoveAll(dir)
	})

	cfg := emptyIngesterConfig()
	cfg.WALConfig.FlushOnShutdown = false
	cfg.WALConfig.Dir = dir
	cfg.WALConfig.WALEnabled = true

	cfg.FlushCheckPeriod = 10 * time.Millisecond
	cfg.MaxChunkAge = 1 * time.Hour // We don't want to hit the "age" check, only the idleness check.
	cfg.MaxChunkIdle = 0            // Everything is idle immediately.

	// Sleep long enough for the periodic flush to happen. We also want to return errors for the first attempts, so that
	// series are flushed again.
	st := &sleepyStoreWithErrors{d: 500 * time.Millisecond}
	st.errorsToGenerate.Store(1)

	ing := createTestIngester(t, cfg, st)

	// Generates a sample. While it is flushed for the first time (which returns an error), it will be put on the queue
	// again.
	pushSample(t, ing, cortexpb.Sample{Value: 100, TimestampMs: int64(model.Now())})

	// Stop the ingester -- no flushing should happen yet.
	require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))

	// Make sure nothing was flushed yet... the sample should still be in the WAL.
	require.Equal(t, int64(0), st.samples.Load())
	require.Equal(t, int64(1), st.errorsToGenerate.Load()) // No error was "consumed".

	// Start a new ingester, for flushing only.
	ing, err = NewForFlusher(cfg, st, nil, nil, log.NewNopLogger())
	require.NoError(t, err)
	require.NoError(t, services.StartAndAwaitRunning(context.Background(), ing))
	t.Cleanup(func() {
		// Just in case the test fails earlier, stop the ingester anyway.
		_ = services.StopAndAwaitTerminated(context.Background(), ing)
	})

	ing.Flush()
	require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))

	// Verify the sample was flushed from the WAL.
	require.Equal(t, int64(1), st.samples.Load())
}

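// sleepyStoreWithErrors is a chunk store stub that sleeps in Put, returns an error for the first errorsToGenerate calls, and counts flushed samples afterwards.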
type sleepyStoreWithErrors struct {
	d                time.Duration
	errorsToGenerate atomic.Int64
	samples          atomic.Int64
}

func (m *sleepyStoreWithErrors) Put(_ context.Context, chunks []chunk.Chunk) error {
	if m.d > 0 {
		time.Sleep(m.d)
	}

	if m.errorsToGenerate.Load() > 0 {
		m.errorsToGenerate.Dec()
		return fmt.Errorf("put error")
	}

	for _, c := range chunks {
		m.samples.Add(int64(c.Data.Len()))
	}
	return nil
}
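
// Compile-time interface checks, added as a small sketch: both store stubs are
// used where a ChunkStore is expected (createTestIngester and NewForFlusher),
// so assert explicitly that they satisfy that interface.
var (
	_ ChunkStore = (*sleepyCountingStore)(nil)
	_ ChunkStore = (*sleepyStoreWithErrors)(nil)
)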