github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/record/log_writer_test.go

// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package record

import (
	"bytes"
	"fmt"
	"math"
	"sort"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/humanize"
	"github.com/cockroachdb/pebble/vfs"
	"github.com/cockroachdb/pebble/vfs/errorfs"
	"github.com/cockroachdb/pebble/vfs/vfstest"
	"github.com/prometheus/client_golang/prometheus"
	prometheusgo "github.com/prometheus/client_model/go"
	"github.com/stretchr/testify/require"
)

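// syncErrorFile wraps a vfs.File and overrides Sync to always return the
// configured error.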
type syncErrorFile struct {
	vfs.File
	err error
}

func (f syncErrorFile) Sync() error {
	return f.err
}

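// TestSyncQueue stresses concurrent pushes onto a syncQueue (serialized by a
// mutex, since the queue is single-producer) while a background goroutine
// continuously pops and releases the waiters.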
func TestSyncQueue(t *testing.T) {
	var q syncQueue
	var closed atomic.Bool

	var flusherWG sync.WaitGroup
	flusherWG.Add(1)
	go func() {
		defer flusherWG.Done()
		for {
			if closed.Load() {
				return
			}
			head, tail, _ := q.load()
			q.pop(head, tail, nil, nil)
		}
	}()

	var commitMu sync.Mutex
	var doneWG sync.WaitGroup
	for i := 0; i < SyncConcurrency; i++ {
		doneWG.Add(1)
		go func(i int) {
			defer doneWG.Done()
			for j := 0; j < 1000; j++ {
				wg := &sync.WaitGroup{}
				wg.Add(1)
				// syncQueue is a single-producer, single-consumer queue. We need to
				// provide mutual exclusion on the producer side.
				commitMu.Lock()
				q.push(wg, new(error))
				commitMu.Unlock()
				wg.Wait()
			}
		}(i)
	}
	doneWG.Wait()

	closed.Store(true)
	flusherWG.Wait()
}

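// TestFlusherCond verifies that flusherCond signalling does not lose wakeups:
// producers push onto the syncQueue and signal the condition variable, while a
// flusher goroutine waits, pops, and releases the waiters.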
func TestFlusherCond(t *testing.T) {
	var mu sync.Mutex
	var q syncQueue
	var c flusherCond
	var closed bool

	c.init(&mu, &q)

	var flusherWG sync.WaitGroup
	flusherWG.Add(1)
	go func() {
		defer flusherWG.Done()

		mu.Lock()
		defer mu.Unlock()

		for {
			for {
				if closed {
					return
				}
				if !q.empty() {
					break
				}
				c.Wait()
			}

			head, tail, _ := q.load()
			q.pop(head, tail, nil, nil)
		}
	}()

	var commitMu sync.Mutex
	var doneWG sync.WaitGroup
	// NB: we're testing with low concurrency here, because what we want to
	// stress is that signalling of the flusherCond works
	// correctly. Specifically, we want to make sure that a signal is not
	// "lost", which would cause the test to wedge.
	for i := 0; i < 2; i++ {
		doneWG.Add(1)
		go func(i int) {
			defer doneWG.Done()
			for j := 0; j < 10000; j++ {
				wg := &sync.WaitGroup{}
				wg.Add(1)
				// syncQueue is a single-producer, single-consumer queue. We need to
				// provide mutual exclusion on the producer side.
				commitMu.Lock()
				q.push(wg, new(error))
				commitMu.Unlock()
				c.Signal()
				wg.Wait()
			}
		}(i)
	}
	doneWG.Wait()

	mu.Lock()
	closed = true
	c.Signal()
	mu.Unlock()
	flusherWG.Wait()
}

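// TestSyncError verifies that a Sync error on the underlying file is
// propagated to the waiter of the record that triggered the sync, and to all
// subsequent sync waiters.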
func TestSyncError(t *testing.T) {
	mem := vfs.NewMem()
	f, err := mem.Create("log")
	require.NoError(t, err)

	injectedErr := errors.New("injected error")
	w := NewLogWriter(syncErrorFile{f, injectedErr}, 0, LogWriterConfig{
		WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
	})

	syncRecord := func() {
		var syncErr error
		var syncWG sync.WaitGroup
		syncWG.Add(1)
		_, err = w.SyncRecord([]byte("hello"), &syncWG, &syncErr)
		require.NoError(t, err)
		syncWG.Wait()
		if injectedErr != syncErr {
			t.Fatalf("unexpected %v but found %v", injectedErr, syncErr)
		}
	}
	// First waiter receives error.
	syncRecord()
	// All subsequent waiters also receive the error.
	syncRecord()
	syncRecord()
}

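// syncFile is a minimal in-memory stand-in for a WAL file that records the
// write offset and, on Sync, the offset up to which data has been synced.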
type syncFile struct {
	writePos atomic.Int64
	syncPos  atomic.Int64
}

func (f *syncFile) Write(buf []byte) (int, error) {
	n := len(buf)
	f.writePos.Add(int64(n))
	return n, nil
}

func (f *syncFile) Sync() error {
	f.syncPos.Store(f.writePos.Load())
	return nil
}

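// TestSyncRecord writes records one at a time, waiting for each sync, and
// checks that the returned offset matches both the write and sync positions of
// the underlying file.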
func TestSyncRecord(t *testing.T) {
	f := &syncFile{}
	w := NewLogWriter(f, 0, LogWriterConfig{WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{})})

	var syncErr error
	for i := 0; i < 100000; i++ {
		var syncWG sync.WaitGroup
		syncWG.Add(1)
		offset, err := w.SyncRecord([]byte("hello"), &syncWG, &syncErr)
		require.NoError(t, err)
		syncWG.Wait()
		require.NoError(t, syncErr)
		if v := f.writePos.Load(); offset != v {
			t.Fatalf("expected write pos %d, but found %d", offset, v)
		}
		if v := f.syncPos.Load(); offset != v {
			t.Fatalf("expected sync pos %d, but found %d", offset, v)
		}
	}
}

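// TestSyncRecordWithSignalChan verifies that the LogWriter consumes one entry
// from QueueSemChan for each record that is synced.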
func TestSyncRecordWithSignalChan(t *testing.T) {
	f := &syncFile{}
	semChan := make(chan struct{}, 5)
	for i := 0; i < cap(semChan); i++ {
		semChan <- struct{}{}
	}
	w := NewLogWriter(f, 0, LogWriterConfig{
		WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
		QueueSemChan:    semChan,
	})
	require.Equal(t, cap(semChan), len(semChan))
	var syncErr error
	for i := 0; i < 5; i++ {
		var syncWG sync.WaitGroup
		syncWG.Add(1)
		_, err := w.SyncRecord([]byte("hello"), &syncWG, &syncErr)
		require.NoError(t, err)
		syncWG.Wait()
		require.NoError(t, syncErr)
		// The waitgroup is released before the channel is read, so wait if
		// necessary.
		require.Eventually(t, func() bool {
			return cap(semChan)-(i+1) == len(semChan)
		}, 10*time.Second, time.Millisecond)
	}
}

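// fakeTimer is a syncTimer that never fires on its own; tests capture the
// callback and invoke it manually to simulate the min-sync-interval timer
// firing.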
type fakeTimer struct {
	f func()
}

func (t *fakeTimer) Reset(d time.Duration) bool {
	return false
}

func (t *fakeTimer) Stop() bool {
	return false
}

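// try repeatedly invokes f with exponentially increasing sleeps until it
// succeeds or the total sleep time reaches maxTotalSleep, returning f's most
// recent result.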
func try(initialSleep, maxTotalSleep time.Duration, f func() error) error {
	totalSleep := time.Duration(0)
	for d := initialSleep; ; d *= 2 {
		time.Sleep(d)
		totalSleep += d
		if err := f(); err == nil || totalSleep >= maxTotalSleep {
			return err
		}
	}
}

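// TestMinSyncInterval verifies that with WALMinSyncInterval configured, sync
// requests queue up without being synced until the (fake) sync timer fires,
// while writes continue to be flushed in the meantime.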
func TestMinSyncInterval(t *testing.T) {
	const minSyncInterval = 100 * time.Millisecond

	f := &syncFile{}
	w := NewLogWriter(f, 0, LogWriterConfig{
		WALMinSyncInterval: func() time.Duration {
			return minSyncInterval
		},
		WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
	})

	var timer fakeTimer
	w.afterFunc = func(d time.Duration, f func()) syncTimer {
		if d != minSyncInterval {
			t.Fatalf("expected minSyncInterval %s, but found %s", minSyncInterval, d)
		}
		timer.f = f
		timer.Reset(d)
		return &timer
	}

	syncRecord := func(n int) *sync.WaitGroup {
		wg := &sync.WaitGroup{}
		wg.Add(1)
		_, err := w.SyncRecord(bytes.Repeat([]byte{'a'}, n), wg, new(error))
		require.NoError(t, err)
		return wg
	}

	// Sync one record which will cause the sync timer to kick in.
	syncRecord(1).Wait()

	startWritePos := f.writePos.Load()
	startSyncPos := f.syncPos.Load()

	// Write a bunch of large records. The sync position should not change
	// because we haven't triggered the timer. But note that the writes should
	// not block either even though syncing isn't being done.
	var wg *sync.WaitGroup
	for i := 0; i < 100; i++ {
		wg = syncRecord(10000)
		if v := f.syncPos.Load(); startSyncPos != v {
			t.Fatalf("expected syncPos %d, but found %d", startSyncPos, v)
		}
		// NB: we can't use syncQueue.load() here as that will return 0,0 while the
		// syncQueue is blocked.
		head, tail := w.flusher.syncQ.unpack(w.flusher.syncQ.headTail.Load())
		waiters := head - tail
		if waiters != uint32(i+1) {
			t.Fatalf("expected %d waiters, but found %d", i+1, waiters)
		}
	}

	err := try(time.Millisecond, 5*time.Second, func() error {
		v := f.writePos.Load()
		if v > startWritePos {
			return nil
		}
		return errors.Errorf("expected writePos > %d, but found %d", startWritePos, v)
	})
	require.NoError(t, err)

	// Fire the timer, and then wait for the last record to sync.
	timer.f()
	wg.Wait()

	if w, s := f.writePos.Load(), f.syncPos.Load(); w != s {
		t.Fatalf("expected syncPos %d, but found %d", s, w)
	}
}

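// TestMinSyncIntervalClose verifies that closing the LogWriter releases a sync
// waiter that is otherwise blocked waiting for the min-sync-interval timer to
// fire.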
func TestMinSyncIntervalClose(t *testing.T) {
	const minSyncInterval = 100 * time.Millisecond

	f := &syncFile{}
	w := NewLogWriter(f, 0, LogWriterConfig{
		WALMinSyncInterval: func() time.Duration {
			return minSyncInterval
		},
		WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
	})

	var timer fakeTimer
	w.afterFunc = func(d time.Duration, f func()) syncTimer {
		if d != minSyncInterval {
			t.Fatalf("expected minSyncInterval %s, but found %s", minSyncInterval, d)
		}
		timer.f = f
		timer.Reset(d)
		return &timer
	}

	syncRecord := func(n int) *sync.WaitGroup {
		wg := &sync.WaitGroup{}
		wg.Add(1)
		_, err := w.SyncRecord(bytes.Repeat([]byte{'a'}, n), wg, new(error))
		require.NoError(t, err)
		return wg
	}

	// Sync one record which will cause the sync timer to kick in.
	syncRecord(1).Wait()

	// Syncing another record will not complete until the timer is fired OR the
	// writer is closed.
	wg := syncRecord(1)
	require.NoError(t, w.Close())
	wg.Wait()
}

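// syncFileWithWait wraps syncFile, gating Write and Sync on the embedded
// WaitGroups so tests can stall the LogWriter's flush loop on demand.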
type syncFileWithWait struct {
	f       syncFile
	writeWG sync.WaitGroup
	syncWG  sync.WaitGroup
}

func (f *syncFileWithWait) Write(buf []byte) (int, error) {
	f.writeWG.Wait()
	return f.f.Write(buf)
}

func (f *syncFileWithWait) Sync() error {
	f.syncWG.Wait()
	return f.f.Sync()
}

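// TestMetricsWithoutSync stalls writes while queueing many records that do not
// request a sync, then checks that the pending-buffer, sync-queue, and
// write-throughput metrics reflect the backlog.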
func TestMetricsWithoutSync(t *testing.T) {
	f := &syncFileWithWait{}
	f.writeWG.Add(1)
	w := NewLogWriter(f, 0, LogWriterConfig{WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{})})
	offset, err := w.SyncRecord([]byte("hello"), nil, nil)
	require.NoError(t, err)
	const recordSize = 16
	require.EqualValues(t, recordSize, offset)
	// We have 512KB of buffer capacity, and 5 bytes + overhead = 16 bytes for
	// each record. Write 28 * 1024 records to fill it up to 87.5%. This
	// constitutes ~14 blocks (each 32KB).
	const numRecords = 28 << 10
	for i := 0; i < numRecords; i++ {
		_, err = w.SyncRecord([]byte("hello"), nil, nil)
		require.NoError(t, err)
	}
	// Unblock the flush loop. It will run once or twice to write these blocks,
	// plus may run one more time due to the Close, so up to 3 runs. So ~14
	// blocks flushed over up to 3 runs.
	f.writeWG.Done()
	w.Close()
	m := w.Metrics()
	// Mean is >= 4 filled blocks.
	require.LessOrEqual(t, float64(4), m.PendingBufferLen.Mean())
	// None of these writes asked to be synced.
	require.EqualValues(t, 0, int(m.SyncQueueLen.Mean()))
	require.Less(t, int64(numRecords*recordSize), m.WriteThroughput.Bytes)
}

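// TestMetricsWithSync stalls the file's Sync while queueing 100 sync requests,
// then checks the sync-queue length and fsync latency histogram metrics once
// the stall is released and the writer is closed.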
func TestMetricsWithSync(t *testing.T) {
	f := &syncFileWithWait{}
	f.syncWG.Add(1)
	syncLatencyMicros := prometheus.NewHistogram(prometheus.HistogramOpts{
		Buckets: []float64{0,
			float64(time.Millisecond),
			float64(2 * time.Millisecond),
			float64(3 * time.Millisecond),
			float64(4 * time.Millisecond),
			float64(5 * time.Millisecond),
			float64(6 * time.Millisecond),
			float64(7 * time.Millisecond),
			float64(8 * time.Millisecond),
			float64(9 * time.Millisecond),
			float64(10 * time.Millisecond)},
	})

	w := NewLogWriter(f, 0, LogWriterConfig{
		WALFsyncLatency: syncLatencyMicros,
	},
	)
	var wg sync.WaitGroup
	wg.Add(100)
	for i := 0; i < 100; i++ {
		var syncErr error
		_, err := w.SyncRecord([]byte("hello"), &wg, &syncErr)
		require.NoError(t, err)
	}

	const syncLatency = 100 * time.Millisecond
	go func() {
		time.Sleep(syncLatency)
		// Unblock the flush loop. It may have run once or twice for these writes,
		// plus may run one more time due to the Close, so up to 3 runs. So 100
		// elements in the sync queue, spread over up to 3 runs.
		f.syncWG.Done()
	}()

	// Close() will only return after flushing is finished.
	require.NoError(t, w.Close())

	m := w.Metrics()
	require.LessOrEqual(t, float64(30), m.SyncQueueLen.Mean())

	writeTo := &prometheusgo.Metric{}
	require.NoError(t, syncLatencyMicros.Write(writeTo))
	for i := 0; i < 100; i += 10 {
		t.Logf("%d%%: %v", i, valueAtQuantileWindowed(writeTo.Histogram, float64(i)))
	}
	// Allow for some inaccuracy in sleep and for two syncs, one of which was
	// fast.
	require.LessOrEqual(t, float64(syncLatency/(2*time.Microsecond)),
		valueAtQuantileWindowed(writeTo.Histogram, 90))
	require.LessOrEqual(t, syncLatency/2, m.WriteThroughput.WorkDuration)
}

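// valueAtQuantileWindowed computes the value at quantile q (expressed as a
// percentage) of the given prometheus histogram, linearly interpolating within
// the bucket containing the quantile and clamping to the upper bound of the
// largest explicit bucket.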
func valueAtQuantileWindowed(histogram *prometheusgo.Histogram, q float64) float64 {
	buckets := histogram.Bucket
	n := float64(*histogram.SampleCount)
	if n == 0 {
		return 0
	}

	// NB: The 0.5 is added for rounding purposes; it helps in cases where
	// SampleCount is small.
	rank := uint64(((q / 100) * n) + 0.5)

	// Since we are missing the +Inf bucket, CumulativeCounts may never exceed
	// rank. By omitting the highest bucket we have from the search, the failed
	// search will land on that last bucket and we don't have to do any special
	// checks regarding landing on a non-existent bucket.
	b := sort.Search(len(buckets)-1, func(i int) bool { return *buckets[i].CumulativeCount >= rank })

	var (
		bucketStart float64 // defaults to 0, which we assume is the lower bound of the smallest bucket
		bucketEnd   = *buckets[b].UpperBound
		count       = *buckets[b].CumulativeCount
	)

	// Calculate the linearly interpolated value within the bucket.
	if b > 0 {
		bucketStart = *buckets[b-1].UpperBound
		count -= *buckets[b-1].CumulativeCount
		rank -= *buckets[b-1].CumulativeCount
	}
	val := bucketStart + (bucketEnd-bucketStart)*(float64(rank)/float64(count))
	if math.IsNaN(val) || math.IsInf(val, -1) {
		return 0
	}

	// Should not extrapolate past the upper bound of the largest bucket.
	//
	// NB: SampleCount includes the implicit +Inf bucket but the
	// buckets[len(buckets)-1].UpperBound refers to the largest bucket defined
	// by us -- the client library doesn't give us access to the +Inf bucket
	// which Prometheus uses under the hood. With a high enough quantile, the
	// val computed above surpasses the upper bound of the largest
	// bucket. Using that interpolated value feels wrong since we'd be
	// extrapolating. Also, for specific metrics if we see our q99 values to be
	// hitting the top-most bucket boundary, that's an indication for us to
	// choose better buckets for more accuracy. It's also worth noting that the
	// prometheus client library does the same thing when the resulting value is
	// in the +Inf bucket, whereby they return the upper bound of the second
	// last bucket -- see [1].
	//
	// [1]: https://github.com/prometheus/prometheus/blob/d9162189/promql/quantile.go#L103.
	if val > *buckets[len(buckets)-1].UpperBound {
		return *buckets[len(buckets)-1].UpperBound
	}

	return val
}

// TestQueueWALBlocks tests queueing many un-flushed WAL blocks when syncing is
// blocked.
func TestQueueWALBlocks(t *testing.T) {
	blockWriteCh := make(chan struct{}, 1)
	f := errorfs.WrapFile(vfstest.DiscardFile, errorfs.InjectorFunc(func(op errorfs.Op) error {
		if op.Kind == errorfs.OpFileWrite {
			<-blockWriteCh
		}
		return nil
	}))
	w := NewLogWriter(f, 0, LogWriterConfig{
		WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
	})
	const numBlocks = 1024
	var b [blockSize]byte
	var logSize int64
	for i := 0; i < numBlocks; i++ {
		var err error
		logSize, err = w.SyncRecord(b[:], nil, nil)
		if err != nil {
			t.Fatal(err)
		}
	}
	close(blockWriteCh)
	require.NoError(t, w.Close())

	m := w.Metrics()
	t.Logf("LogSize is %s", humanize.Bytes.Int64(logSize))
	t.Logf("Mean pending buffer len is %.2f", m.PendingBufferLen.Mean())
	require.GreaterOrEqual(t, logSize, int64(numBlocks*blockSize))
}

// BenchmarkQueueWALBlocks exercises queueing within the LogWriter. It can be
// useful to measure allocations involved when flushing is slow enough to
// accumulate a large backlog of queued blocks.
func BenchmarkQueueWALBlocks(b *testing.B) {
	const dataVolume = 64 << 20 /* 64 MB */
	for _, writeSize := range []int64{64, 512, 1024, 2048, 32768} {
		b.Run(fmt.Sprintf("record-size=%s", humanize.Bytes.Int64(writeSize)), func(b *testing.B) {
			record := make([]byte, writeSize)
			numRecords := int(dataVolume / writeSize)

			for j := 0; j < b.N; j++ {
				b.StopTimer()
				blockWriteCh := make(chan struct{}, 1)
				f := errorfs.WrapFile(vfstest.DiscardFile, errorfs.InjectorFunc(func(op errorfs.Op) error {
					if op.Kind == errorfs.OpFileWrite {
						<-blockWriteCh
					}
					return nil
				}))
				w := NewLogWriter(f, 0, LogWriterConfig{
					WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
				})

				b.StartTimer()
				for n := numRecords; n > 0; n-- {
					if _, err := w.SyncRecord(record[:], nil, nil); err != nil {
						b.Fatal(err)
					}
				}
				b.StopTimer()

				b.SetBytes(dataVolume)
				close(blockWriteCh)
				require.NoError(b, w.Close())
			}
		})
	}
}