github.com/grafana/pyroscope@v1.18.0/pkg/phlaredb/query/iters_test.go (about)

     1  package query
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"fmt"
     7  	"math"
     8  	"math/rand"
     9  	"os"
    10  	"testing"
    11  
    12  	"github.com/parquet-go/parquet-go"
    13  	"github.com/prometheus/client_golang/prometheus"
    14  	"github.com/prometheus/client_golang/prometheus/testutil"
    15  	"github.com/stretchr/testify/assert"
    16  	"github.com/stretchr/testify/require"
    17  
    18  	"github.com/grafana/pyroscope/pkg/iter"
    19  )
    20  
    21  type makeTestIterFn func(pf *parquet.File, idx int, filter Predicate, selectAs string) Iterator
    22  
    23  var iterTestCases = []struct {
    24  	name     string
    25  	makeIter makeTestIterFn
    26  }{
    27  	{"sync", func(pf *parquet.File, idx int, filter Predicate, selectAs string) Iterator {
    28  		return NewSyncIterator(context.TODO(), pf.RowGroups(), idx, selectAs, 1000, filter, selectAs)
    29  	}},
    30  }
    31  
    32  // TestNext compares the unrolled Next() with the original nextSlow() to
    33  // prevent drift
    34  func TestNext(t *testing.T) {
    35  	rn1 := RowNumber{0, 0, 0, 0, 0, 0}
    36  	rn2 := RowNumber{0, 0, 0, 0, 0, 0}
    37  
    38  	for i := 0; i < 1000; i++ {
    39  		r := rand.Intn(6)
    40  		d := rand.Intn(6)
    41  
    42  		rn1.Next(r, d)
    43  		rn2.nextSlow(r, d)
    44  
    45  		require.Equal(t, rn1, rn2)
    46  	}
    47  }
    48  
    49  func TestRowNumber(t *testing.T) {
    50  	tr := EmptyRowNumber()
    51  	require.Equal(t, RowNumber{-1, -1, -1, -1, -1, -1}, tr)
    52  
    53  	steps := []struct {
    54  		repetitionLevel int
    55  		definitionLevel int
    56  		expected        RowNumber
    57  	}{
    58  		// Name.Language.Country examples from the Dremel whitepaper
    59  		{0, 3, RowNumber{0, 0, 0, 0, -1, -1}},
    60  		{2, 2, RowNumber{0, 0, 1, -1, -1, -1}},
    61  		{1, 1, RowNumber{0, 1, -1, -1, -1, -1}},
    62  		{1, 3, RowNumber{0, 2, 0, 0, -1, -1}},
    63  		{0, 1, RowNumber{1, 0, -1, -1, -1, -1}},
    64  	}
    65  
    66  	for _, step := range steps {
    67  		tr.Next(step.repetitionLevel, step.definitionLevel)
    68  		require.Equal(t, step.expected, tr)
    69  	}
    70  }
    71  
    72  func TestCompareRowNumbers(t *testing.T) {
    73  	testCases := []struct {
    74  		a, b     RowNumber
    75  		expected int
    76  	}{
    77  		{RowNumber{-1}, RowNumber{0}, -1},
    78  		{RowNumber{0}, RowNumber{0}, 0},
    79  		{RowNumber{1}, RowNumber{0}, 1},
    80  
    81  		{RowNumber{0, 1}, RowNumber{0, 2}, -1},
    82  		{RowNumber{0, 2}, RowNumber{0, 1}, 1},
    83  	}
    84  
    85  	for _, tc := range testCases {
    86  		require.Equal(t, tc.expected, CompareRowNumbers(MaxDefinitionLevel, tc.a, tc.b))
    87  	}
    88  }
    89  
    90  func TestRowNumberPreceding(t *testing.T) {
    91  	testCases := []struct {
    92  		start, preceding RowNumber
    93  	}{
    94  		{RowNumber{1000, -1, -1, -1, -1, -1}, RowNumber{999, -1, -1, -1, -1, -1}},
    95  		{RowNumber{1000, 0, 0, 0, 0, 0}, RowNumber{999, math.MaxInt64, math.MaxInt64, math.MaxInt64, math.MaxInt64, math.MaxInt64}},
    96  	}
    97  
    98  	for _, tc := range testCases {
    99  		require.Equal(t, tc.preceding, tc.start.Preceding())
   100  	}
   101  }
   102  
   103  func TestColumnIterator(t *testing.T) {
   104  	for _, tc := range iterTestCases {
   105  		t.Run(tc.name, func(t *testing.T) {
   106  			testColumnIterator(t, tc.makeIter)
   107  		})
   108  	}
   109  }
   110  
   111  func testColumnIterator(t *testing.T, makeIter makeTestIterFn) {
   112  	count := 100_000
   113  	pf := createTestFile(t, count)
   114  
   115  	idx, _ := GetColumnIndexByPath(pf.Root(), "A")
   116  	iter := makeIter(pf, idx, nil, "A")
   117  	defer iter.Close()
   118  
   119  	for i := 0; i < count; i++ {
   120  		require.True(t, iter.Next())
   121  		res := iter.At()
   122  		require.NotNil(t, res, "i=%d", i)
   123  		require.Equal(t, RowNumber{int64(i), -1, -1, -1, -1, -1}, res.RowNumber)
   124  		require.Equal(t, int64(i), res.ToMap()["A"][0].Int64())
   125  	}
   126  
   127  	require.False(t, iter.Next())
   128  	require.NoError(t, iter.Err())
   129  }
   130  
   131  func TestColumnIteratorSeek(t *testing.T) {
   132  	for _, tc := range iterTestCases {
   133  		t.Run(tc.name, func(t *testing.T) {
   134  			testColumnIteratorSeek(t, tc.makeIter)
   135  		})
   136  	}
   137  }
   138  
   139  func testColumnIteratorSeek(t *testing.T, makeIter makeTestIterFn) {
   140  	count := 10_000
   141  	pf := createTestFile(t, count)
   142  
   143  	idx, _ := GetColumnIndexByPath(pf.Root(), "A")
   144  	iter := makeIter(pf, idx, nil, "A")
   145  	defer iter.Close()
   146  
   147  	seekTos := []int64{
   148  		100,
   149  		1234,
   150  		4567,
   151  		5000,
   152  		7890,
   153  	}
   154  
   155  	for _, seekTo := range seekTos {
   156  		rn := EmptyRowNumber()
   157  		rn[0] = seekTo
   158  		require.True(t, iter.Seek(RowNumberWithDefinitionLevel{rn, 0}))
   159  		res := iter.At()
   160  		require.NotNil(t, res, "seekTo=%v", seekTo)
   161  		require.Equal(t, RowNumber{seekTo, -1, -1, -1, -1, -1}, res.RowNumber)
   162  		require.Equal(t, seekTo, res.ToMap()["A"][0].Int64())
   163  	}
   164  }
   165  
   166  func TestColumnIteratorPredicate(t *testing.T) {
   167  	for _, tc := range iterTestCases {
   168  		t.Run(tc.name, func(t *testing.T) {
   169  			testColumnIteratorPredicate(t, tc.makeIter)
   170  		})
   171  	}
   172  }
   173  
   174  func testColumnIteratorPredicate(t *testing.T, makeIter makeTestIterFn) {
   175  	count := 10_000
   176  	pf := createTestFile(t, count)
   177  
   178  	pred := NewIntBetweenPredicate(7001, 7003)
   179  
   180  	idx, _ := GetColumnIndexByPath(pf.Root(), "A")
   181  	iter := makeIter(pf, idx, pred, "A")
   182  	defer iter.Close()
   183  
   184  	expectedResults := []int64{
   185  		7001,
   186  		7002,
   187  		7003,
   188  	}
   189  
   190  	for _, expectedResult := range expectedResults {
   191  		require.True(t, iter.Next())
   192  		res := iter.At()
   193  		require.NotNil(t, res)
   194  		require.Equal(t, RowNumber{expectedResult, -1, -1, -1, -1, -1}, res.RowNumber)
   195  		require.Equal(t, expectedResult, res.ToMap()["A"][0].Int64())
   196  	}
   197  }
   198  
   199  func TestColumnIteratorExitEarly(t *testing.T) {
   200  	type T struct{ A int }
   201  
   202  	rows := []T{}
   203  	count := 10_000
   204  	for i := 0; i < count; i++ {
   205  		rows = append(rows, T{i})
   206  	}
   207  
   208  	pf := createFileWith(t, rows, 2)
   209  	idx, _ := GetColumnIndexByPath(pf.Root(), "A")
   210  	readSize := 1000
   211  
   212  	readIter := func(iter Iterator) (int, error) {
   213  		received := 0
   214  		for iter.Next() {
   215  			received++
   216  		}
   217  		return received, iter.Err()
   218  	}
   219  
   220  	t.Run("cancelledEarly", func(t *testing.T) {
   221  		// Cancel before iterating
   222  		ctx, cancel := context.WithCancel(context.TODO())
   223  		cancel()
   224  		iter := NewSyncIterator(ctx, pf.RowGroups(), idx, "", readSize, nil, "A")
   225  		count, err := readIter(iter)
   226  		require.ErrorContains(t, err, "context canceled")
   227  		require.Equal(t, 0, count)
   228  	})
   229  
   230  	t.Run("cancelledPartial", func(t *testing.T) {
   231  		ctx, cancel := context.WithCancel(context.TODO())
   232  		iter := NewSyncIterator(ctx, pf.RowGroups(), idx, "", readSize, nil, "A")
   233  
   234  		// Read some results
   235  		require.True(t, iter.Next())
   236  
   237  		// Then cancel
   238  		cancel()
   239  
   240  		// Read again = context cancelled
   241  		_, err := readIter(iter)
   242  		require.ErrorContains(t, err, "context canceled")
   243  	})
   244  
   245  	t.Run("closedEarly", func(t *testing.T) {
   246  		// Close before iterating
   247  		iter := NewSyncIterator(context.TODO(), pf.RowGroups(), idx, "", readSize, nil, "A")
   248  		iter.Close()
   249  		count, err := readIter(iter)
   250  		require.ErrorContains(t, err, "context canceled")
   251  		require.Equal(t, 0, count)
   252  	})
   253  
   254  	t.Run("closedPartial", func(t *testing.T) {
   255  		iter := NewSyncIterator(context.TODO(), pf.RowGroups(), idx, "", readSize, nil, "A")
   256  
   257  		// Read some results
   258  		require.True(t, iter.Next())
   259  
   260  		// Then close
   261  		iter.Close()
   262  
   263  		// Read again = should close early
   264  		res2, err := readIter(iter)
   265  		require.ErrorContains(t, err, "context canceled")
   266  		require.Less(t, readSize+res2, count)
   267  	})
   268  }
   269  
   270  func BenchmarkColumnIterator(b *testing.B) {
   271  	for _, tc := range iterTestCases {
   272  		b.Run(tc.name, func(b *testing.B) {
   273  			benchmarkColumnIterator(b, tc.makeIter)
   274  		})
   275  	}
   276  }
   277  
   278  func benchmarkColumnIterator(b *testing.B, makeIter makeTestIterFn) {
   279  	count := 100_000
   280  	pf := createTestFile(b, count)
   281  
   282  	idx, _ := GetColumnIndexByPath(pf.Root(), "A")
   283  
   284  	b.ResetTimer()
   285  
   286  	for i := 0; i < b.N; i++ {
   287  		iter := makeIter(pf, idx, nil, "A")
   288  		actualCount := 0
   289  		for iter.Next() {
   290  			actualCount++
   291  		}
   292  		iter.Close()
   293  		require.Equal(b, count, actualCount)
   294  		// fmt.Println(actualCount)
   295  	}
   296  }
   297  
   298  func createTestFile(t testing.TB, count int) *parquet.File {
   299  	type T struct{ A int }
   300  
   301  	rows := []T{}
   302  	for i := 0; i < count; i++ {
   303  		rows = append(rows, T{i})
   304  	}
   305  
   306  	pf := createFileWith(t, rows, 2)
   307  	return pf
   308  }
   309  
   310  func createProfileLikeFile(t testing.TB, count int) *parquet.File {
   311  	type T struct {
   312  		SeriesID  uint32
   313  		TimeNanos int64
   314  	}
   315  
   316  	// every row group is ordered by serieID and then time nanos
   317  	// time is always increasing between rowgroups
   318  
   319  	rowGroups := 10
   320  	series := 8
   321  
   322  	rows := make([]T, count)
   323  	for i := range rows {
   324  
   325  		rowsPerRowGroup := count / rowGroups
   326  		seriesPerRowGroup := rowsPerRowGroup / series
   327  		rowGroupNum := i / rowsPerRowGroup
   328  
   329  		seriesID := uint32(i % (count / rowGroups) / (rowsPerRowGroup / series))
   330  		rows[i] = T{
   331  			SeriesID:  seriesID,
   332  			TimeNanos: int64(i%seriesPerRowGroup+rowGroupNum*seriesPerRowGroup) * 1000,
   333  		}
   334  
   335  	}
   336  
   337  	return createFileWith[T](t, rows, rowGroups)
   338  }
   339  
   340  func createFileWith[T any](t testing.TB, rows []T, rowGroups int) *parquet.File {
   341  	f, err := os.CreateTemp(t.TempDir(), "data.parquet")
   342  	require.NoError(t, err)
   343  	t.Logf("Created temp file %s", f.Name())
   344  
   345  	perRG := len(rows) / rowGroups
   346  
   347  	w := parquet.NewGenericWriter[T](f)
   348  	for i := 0; i < (rowGroups - 1); i++ {
   349  		_, err = w.Write(rows[0:perRG])
   350  		require.NoError(t, err)
   351  		require.NoError(t, w.Flush())
   352  		rows = rows[perRG:]
   353  	}
   354  
   355  	_, err = w.Write(rows)
   356  	require.NoError(t, err)
   357  	require.NoError(t, w.Flush())
   358  
   359  	require.NoError(t, w.Close())
   360  
   361  	stat, err := f.Stat()
   362  	require.NoError(t, err)
   363  
   364  	pf, err := parquet.OpenFile(f, stat.Size())
   365  	require.NoError(t, err)
   366  
   367  	return pf
   368  }
   369  
   370  func TestBinaryJoinIterator(t *testing.T) {
   371  	rowCount := 1600
   372  	pf := createProfileLikeFile(t, rowCount)
   373  
   374  	for _, tc := range []struct {
   375  		name                string
   376  		seriesPredicate     Predicate
   377  		seriesPageReads     int
   378  		timePredicate       Predicate
   379  		timePageReads       int
   380  		expectedResultCount int
   381  	}{
   382  		{
   383  			name:                "no predicate",
   384  			expectedResultCount: rowCount, // expect everything
   385  			seriesPageReads:     10,
   386  			timePageReads:       10,
   387  		},
   388  		{
   389  			name:                "one series ID",
   390  			expectedResultCount: rowCount / 8, // expect an eight of the rows
   391  			seriesPredicate:     NewMapPredicate(map[int64]struct{}{0: {}}),
   392  			seriesPageReads:     10,
   393  			timePageReads:       10,
   394  		},
   395  		{
   396  			name:                "two series IDs",
   397  			expectedResultCount: rowCount / 8 * 2, // expect two eights of the rows
   398  			seriesPredicate:     NewMapPredicate(map[int64]struct{}{0: {}, 1: {}}),
   399  			seriesPageReads:     10,
   400  			timePageReads:       10,
   401  		},
   402  		{
   403  			name:                "missing series",
   404  			expectedResultCount: 0,
   405  			seriesPredicate:     NewMapPredicate(map[int64]struct{}{10: {}}),
   406  		},
   407  		{
   408  			name:                "first two time stamps each",
   409  			expectedResultCount: 2 * 8, // expect two profiles for each series
   410  			timePredicate:       NewIntBetweenPredicate(0, 1000),
   411  			seriesPageReads:     1,
   412  			timePageReads:       1,
   413  		},
   414  		{
   415  			name:                "time before results",
   416  			expectedResultCount: 0,
   417  			timePredicate:       NewIntBetweenPredicate(-10, -1),
   418  			seriesPageReads:     1,
   419  			timePageReads:       0,
   420  		},
   421  		{
   422  			name:                "time after results",
   423  			expectedResultCount: 0,
   424  			timePredicate:       NewIntBetweenPredicate(200000, 20001000),
   425  			seriesPredicate:     NewMapPredicate(map[int64]struct{}{0: {}, 1: {}}),
   426  			seriesPageReads:     1,
   427  			timePageReads:       0,
   428  		},
   429  	} {
   430  		t.Run(tc.name, func(t *testing.T) {
   431  			ctx, cancel := context.WithCancel(context.Background())
   432  			defer cancel()
   433  
   434  			reg := prometheus.NewRegistry()
   435  			metrics := NewMetrics(reg)
   436  			metrics.pageReadsTotal.WithLabelValues("ts", "SeriesId").Add(0)
   437  			metrics.pageReadsTotal.WithLabelValues("ts", "TimeNanos").Add(0)
   438  			ctx = AddMetricsToContext(ctx, metrics)
   439  
   440  			seriesIt := NewSyncIterator(ctx, pf.RowGroups(), 0, "SeriesId", 1000, tc.seriesPredicate, "SeriesId")
   441  			timeIt := NewSyncIterator(ctx, pf.RowGroups(), 1, "TimeNanos", 1000, tc.timePredicate, "TimeNanos")
   442  
   443  			it := NewBinaryJoinIterator(
   444  				0,
   445  				seriesIt,
   446  				timeIt,
   447  			)
   448  
   449  			results := 0
   450  			for it.Next() {
   451  				results++
   452  			}
   453  			require.NoError(t, it.Err())
   454  
   455  			require.NoError(t, it.Close())
   456  
   457  			require.Equal(t, tc.expectedResultCount, results)
   458  
   459  			require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewReader([]byte(fmt.Sprintf(
   460  				`
   461          # HELP pyroscopedb_page_reads_total Total number of pages read while querying
   462          # TYPE pyroscopedb_page_reads_total counter
   463          pyroscopedb_page_reads_total{column="SeriesId",table="ts"} %d
   464          pyroscopedb_page_reads_total{column="TimeNanos",table="ts"} %d
   465          `, tc.seriesPageReads, tc.timePageReads))), "pyroscopedb_page_reads_total"))
   466  		})
   467  	}
   468  }
   469  
   470  type rowGetter int64
   471  
   472  func (r rowGetter) RowNumber() int64 {
   473  	return int64(r)
   474  }
   475  
   476  func TestRowNumberIterator(t *testing.T) {
   477  	rows := []rowGetter{1, 2, 3, 50, 100, 102, 200}
   478  
   479  	t.Run("iterate over all", func(t *testing.T) {
   480  		it := NewRowNumberIterator(iter.NewSliceIterator(rows))
   481  		result := []int64{}
   482  		for it.Next() {
   483  			result = append(result, it.At().RowNumber[0])
   484  		}
   485  		require.NoError(t, it.Err())
   486  		assert.Equal(t, []int64{1, 2, 3, 50, 100, 102, 200}, result)
   487  	})
   488  
   489  	t.Run("seek into iter", func(t *testing.T) {
   490  		it := NewRowNumberIterator(iter.NewSliceIterator(rows))
   491  
   492  		to := EmptyRowNumber()
   493  		to[0] = 100
   494  		require.True(t, it.Seek(RowNumberWithDefinitionLevel{RowNumber: to}))
   495  		result := []int64{it.At().RowNumber[0]}
   496  		for it.Next() {
   497  			result = append(result, it.At().RowNumber[0])
   498  		}
   499  		require.NoError(t, it.Err())
   500  		assert.Equal(t, []int64{100, 102, 200}, result)
   501  	})
   502  
   503  	t.Run("seek to non existing value", func(t *testing.T) {
   504  		it := NewRowNumberIterator(iter.NewSliceIterator(rows))
   505  		to := EmptyRowNumber()
   506  		to[0] = 10
   507  		require.True(t, it.Seek(RowNumberWithDefinitionLevel{RowNumber: to}))
   508  		result := []int64{it.At().RowNumber[0]}
   509  		for it.Next() {
   510  			result = append(result, it.At().RowNumber[0])
   511  		}
   512  		require.NoError(t, it.Err())
   513  		assert.Equal(t, []int64{50, 100, 102, 200}, result)
   514  	})
   515  
   516  	t.Run("seek beyond rows", func(t *testing.T) {
   517  		it := NewRowNumberIterator(iter.NewSliceIterator(rows))
   518  		to := EmptyRowNumber()
   519  		to[0] = 300
   520  		require.False(t, it.Seek(RowNumberWithDefinitionLevel{RowNumber: to}))
   521  		require.NoError(t, it.Err())
   522  	})
   523  
   524  	t.Run("underlying iterator not ordered", func(t *testing.T) {
   525  		it := NewRowNumberIterator(iter.NewSliceIterator(append(rows, 300, 210, 500)))
   526  		for it.Next() {
   527  		}
   528  		require.ErrorContains(t, it.Err(), "is not sorted")
   529  	})
   530  }