// github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/ingester/flush_test.go

package ingester

import (
	"context"
	"fmt"
	"io/ioutil"
	"os"
	"testing"
	"time"

	"github.com/go-kit/log"
	"github.com/grafana/dskit/kv"
	"github.com/grafana/dskit/ring"
	"github.com/grafana/dskit/services"
	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/pkg/labels"
	"github.com/stretchr/testify/require"
	"github.com/weaveworks/common/user"
	"go.uber.org/atomic"

	"github.com/cortexproject/cortex/pkg/chunk"
	"github.com/cortexproject/cortex/pkg/cortexpb"
	"github.com/cortexproject/cortex/pkg/ingester/client"
	"github.com/cortexproject/cortex/pkg/util"
	"github.com/cortexproject/cortex/pkg/util/validation"
)

var singleTestLabel = []labels.Labels{[]labels.Label{{Name: "__name__", Value: "test"}}}

// This test case demonstrates the problem of losing incoming samples while chunks are flushed with "immediate" mode.
func TestSweepImmediateDropsSamples(t *testing.T) {
	cfg := emptyIngesterConfig()
	cfg.FlushCheckPeriod = 1 * time.Minute
	cfg.RetainPeriod = 10 * time.Millisecond

	st := &sleepyCountingStore{}
	ing := createTestIngester(t, cfg, st)

	samples := newSampleGenerator(t, time.Now(), time.Millisecond)

	// Generates one sample.
	pushSample(t, ing, <-samples)

	notify := make(chan struct{})
	ing.preFlushChunks = func() {
		if ing.State() == services.Running {
			pushSample(t, ing, <-samples)
			notify <- struct{}{}
		}
	}

	// Simulate /flush. Sweeps everything, but also pushes another sample (in preFlushChunks).
	ing.sweepUsers(true)
	<-notify // Wait for flushing to happen.

	// Stopping the ingester should sweep the remaining samples.
	require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))

	require.Equal(t, 2, st.samples)
}

// There are several factors in this panic:
// - The chunk is first flushed normally.
// - "/flush" is called (sweepUsers(true)), and that causes a new flush of already-flushed chunks.
// - During the flush to the store (in flushChunks), the chunk is actually removed from the list of chunks (and its reference is niled) in removeFlushedChunks.
// - After flushing to the store, the reference is nil, causing a panic.
func TestFlushPanicIssue2743(t *testing.T) {
	cfg := emptyIngesterConfig()
	cfg.FlushCheckPeriod = 50 * time.Millisecond // We want to check for flushable and removable chunks often.
	cfg.RetainPeriod = 500 * time.Millisecond    // Remove flushed chunks quickly. This triggers nil-ing. To get a panic, it should happen while the store is "writing" chunks. (We use the "sleepy store" to enforce that.)
	cfg.MaxChunkAge = 1 * time.Hour              // We don't use max chunk age for this test, as that is jittered.
	cfg.MaxChunkIdle = 200 * time.Millisecond    // Flush chunk 200ms after adding the last sample.

	st := &sleepyCountingStore{d: 1 * time.Second} // Longer than the retain period.

	ing := createTestIngester(t, cfg, st)
	samples := newSampleGenerator(t, time.Now(), 1*time.Second)

	notifyCh := make(chan bool, 10)
	ing.preFlushChunks = func() {
		select {
		case notifyCh <- true:
		default:
		}
	}

	// Generates one sample.
	pushSample(t, ing, <-samples)

	// Wait until the regular flush kicks in (flushing due to the chunk being idle).
	<-notifyCh

	// Sweep again -- this causes the same chunks to be queued for flushing again.
	// We must hit this *before* the flushed chunk is removed from the list of chunks (RetainPeriod).
	// While the store is flushing (simulated by the sleep in the store), the previously flushed chunk is removed from memory.
	ing.sweepUsers(true)

	// Wait a bit for flushing to end. In the buggy version, we get a panic while waiting.
	time.Sleep(2 * time.Second)
}

func pushSample(t *testing.T, ing *Ingester, sample cortexpb.Sample) {
	_, err := ing.Push(user.InjectOrgID(context.Background(), userID), cortexpb.ToWriteRequest(singleTestLabel, []cortexpb.Sample{sample}, nil, cortexpb.API))
	require.NoError(t, err)
}

func createTestIngester(t *testing.T, cfg Config, store ChunkStore) *Ingester {
	l := validation.Limits{}
	overrides, err := validation.NewOverrides(l, nil)
	require.NoError(t, err)

	ing, err := New(cfg, client.Config{}, overrides, store, nil, log.NewNopLogger())
	require.NoError(t, err)

	require.NoError(t, services.StartAndAwaitRunning(context.Background(), ing))
	t.Cleanup(func() {
		_ = services.StopAndAwaitTerminated(context.Background(), ing)
	})

	return ing
}

type sleepyCountingStore struct {
	d       time.Duration
	samples int
}

func (m *sleepyCountingStore) Put(_ context.Context, chunks []chunk.Chunk) error {
	if m.d > 0 {
		time.Sleep(m.d)
	}

	for _, c := range chunks {
		m.samples += c.Data.Len()
	}
	return nil
}

func emptyIngesterConfig() Config {
	return Config{
		WALConfig: WALConfig{},
		LifecyclerConfig: ring.LifecyclerConfig{
			RingConfig: ring.Config{
				KVStore: kv.Config{
					Store: "inmemory",
				},
				ReplicationFactor: 1,
			},
			InfNames:        []string{"en0", "eth0", "lo0", "lo"},
			HeartbeatPeriod: 10 * time.Second,
		},

		ConcurrentFlushes:               1,               // Single queue only. Doesn't really matter for this test (the same series is always flushed by the same worker), but must be positive.
		RateUpdatePeriod:                1 * time.Hour,   // Must be positive; doesn't matter for this test.
		ActiveSeriesMetricsUpdatePeriod: 5 * time.Minute, // Must be positive.
	}
}

func newSampleGenerator(t *testing.T, initTime time.Time, step time.Duration) <-chan cortexpb.Sample {
	ts := make(chan cortexpb.Sample)

	ctx, cancel := context.WithCancel(context.Background())
	t.Cleanup(cancel)

	go func(ctx context.Context) {
		c := initTime
		for {
			select {
			case ts <- cortexpb.Sample{Value: 0, TimestampMs: util.TimeToMillis(c)}:
			case <-ctx.Done():
				return
			}

			c = c.Add(step)
		}
	}(ctx)

	return ts
}

func TestFlushReasonString(t *testing.T) {
	for fr := flushReason(0); fr < maxFlushReason; fr++ {
		require.True(t, len(fr.String()) > 0)
	}
}
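// waitWithTimeout is an editorial sketch, not part of the original test suite:
// it shows how the bare channel receives above (e.g. <-notifyCh in
// TestFlushPanicIssue2743) could be guarded with a deadline, so a flush
// notification that never arrives fails the test with a clear message instead
// of hanging until the Go test binary's own timeout. The helper name and the
// choice of timeout are illustrative assumptions.
func waitWithTimeout(t *testing.T, ch <-chan bool, timeout time.Duration) {
	t.Helper()
	select {
	case <-ch:
		// Notification received; nothing to do.
	case <-time.After(timeout):
		t.Fatalf("timed out after %v waiting for flush notification", timeout)
	}
}
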
// Issue 3139 depends on the timing between the immediate flush and the periodic flush, and on the fact that "immediate" chunks get queued behind "idle" chunks.
// The periodic flush may still find "idle" chunks and put them onto the queue, because the "ingester for flusher" still runs all the loops.
// When the flush of an "immediate" chunk fails (e.g. due to a storage error), it is put back onto the queue, but behind the idle chunk.
// When handling idle chunks, they are then compared against the user limit (MinChunkLength), which panics -- because we were not setting limits.
func TestIssue3139(t *testing.T) {
	dir, err := ioutil.TempDir("", "wal")
	require.NoError(t, err)
	t.Cleanup(func() {
		_ = os.RemoveAll(dir)
	})

	cfg := emptyIngesterConfig()
	cfg.WALConfig.FlushOnShutdown = false
	cfg.WALConfig.Dir = dir
	cfg.WALConfig.WALEnabled = true

	cfg.FlushCheckPeriod = 10 * time.Millisecond
	cfg.MaxChunkAge = 1 * time.Hour // We don't want to hit the "age" check, but the idle-ness check.
	cfg.MaxChunkIdle = 0            // Everything is idle immediately.

	// Sleep long enough for the periodic flush to happen. We also want to return errors for the first attempts, so that
	// series are flushed again.
	st := &sleepyStoreWithErrors{d: 500 * time.Millisecond}
	st.errorsToGenerate.Store(1)

	ing := createTestIngester(t, cfg, st)

	// Generates a sample. While it is flushed for the first time (which returns an error), it will be put on the queue
	// again.
	pushSample(t, ing, cortexpb.Sample{Value: 100, TimestampMs: int64(model.Now())})

	// Stop the ingester -- no flushing should happen yet.
	require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))

	// Make sure nothing was flushed yet... the sample should be in the WAL.
	require.Equal(t, int64(0), st.samples.Load())
	require.Equal(t, int64(1), st.errorsToGenerate.Load()) // No error was "consumed".

	// Start a new ingester, for flushing only.
	ing, err = NewForFlusher(cfg, st, nil, nil, log.NewNopLogger())
	require.NoError(t, err)
	require.NoError(t, services.StartAndAwaitRunning(context.Background(), ing))
	t.Cleanup(func() {
		// Just in case the test fails earlier, stop the ingester anyway.
		_ = services.StopAndAwaitTerminated(context.Background(), ing)
	})

	ing.Flush()
	require.NoError(t, services.StopAndAwaitTerminated(context.Background(), ing))

	// Verify the sample was flushed from the WAL.
	require.Equal(t, int64(1), st.samples.Load())
}

type sleepyStoreWithErrors struct {
	d                time.Duration
	errorsToGenerate atomic.Int64
	samples          atomic.Int64
}

func (m *sleepyStoreWithErrors) Put(_ context.Context, chunks []chunk.Chunk) error {
	if m.d > 0 {
		time.Sleep(m.d)
	}

	if m.errorsToGenerate.Load() > 0 {
		m.errorsToGenerate.Dec()
		return fmt.Errorf("put error")
	}

	for _, c := range chunks {
		m.samples.Add(int64(c.Data.Len()))
	}
	return nil
}
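
// Compile-time assertions, added as an editorial sketch: both fake stores are
// passed to createTestIngester, which takes a ChunkStore, so these checks must
// already hold for this file to compile. Keeping them explicit makes the build
// fail early if either fake ever stops satisfying ChunkStore.
var (
	_ ChunkStore = (*sleepyCountingStore)(nil)
	_ ChunkStore = (*sleepyStoreWithErrors)(nil)
)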