github.com/grafana/pyroscope@v1.18.0/pkg/parquet/row_reader_test.go (about) 1 package parquet 2 3 import ( 4 "fmt" 5 "io" 6 "math" 7 "testing" 8 9 "github.com/parquet-go/parquet-go" 10 "github.com/stretchr/testify/require" 11 ) 12 13 var _ parquet.RowReader = (*BatchReader)(nil) 14 15 type BatchReader struct { 16 batches [][]parquet.Row 17 } 18 19 func NewBatchReader(batches [][]parquet.Row) *BatchReader { 20 return &BatchReader{batches: batches} 21 } 22 23 func (br *BatchReader) ReadRows(rows []parquet.Row) (int, error) { 24 if len(br.batches) == 0 { 25 return 0, io.EOF 26 } 27 n := copy(rows, br.batches[0]) 28 if n < len(br.batches[0]) { 29 br.batches[0] = br.batches[0][n:] 30 return n, nil 31 } 32 br.batches = br.batches[1:] 33 return n, nil 34 } 35 36 func TestBufferedRowReaderIterator(t *testing.T) { 37 testBatchSize := func(n int) func(t *testing.T) { 38 return func(t *testing.T) { 39 reader := NewBufferedRowReaderIterator( 40 NewBatchReader( 41 [][]parquet.Row{ 42 {{parquet.Int32Value(1)}}, 43 {{parquet.Int32Value(2)}, {parquet.Int32Value(3)}}, 44 {{parquet.Int32Value(4)}}, 45 }), 46 n) 47 require.True(t, reader.Next()) 48 require.Equal(t, parquet.Int32Value(1), reader.At()[0]) 49 require.True(t, reader.Next()) 50 require.Equal(t, parquet.Int32Value(2), reader.At()[0]) 51 require.True(t, reader.Next()) 52 require.Equal(t, parquet.Int32Value(3), reader.At()[0]) 53 require.True(t, reader.Next()) 54 require.Equal(t, parquet.Int32Value(4), reader.At()[0]) 55 require.False(t, reader.Next()) 56 } 57 } 58 t.Run("batch of 1", testBatchSize(1)) 59 t.Run("bigger batch", testBatchSize(100)) 60 t.Run("equal batch", testBatchSize(2)) 61 } 62 63 func TestNewMergeRowReader(t *testing.T) { 64 for _, batchSize := range []int{1, 2, 3, 4, 5, 6} { 65 bufferSize := batchSize 66 t.Run(fmt.Sprintf("%d", bufferSize), func(t *testing.T) { 67 for _, tc := range []struct { 68 name string 69 readers []parquet.RowReader 70 expected []parquet.Row 71 }{ 72 { 73 "merge 1 readers", 74 []parquet.RowReader{ 75 NewBatchReader([][]parquet.Row{ 76 {{parquet.Int32Value(1)}}, 77 {{parquet.Int32Value(3)}}, 78 {{parquet.Int32Value(5)}}, 79 }), 80 }, 81 []parquet.Row{ 82 {parquet.Int32Value(1)}, 83 {parquet.Int32Value(3)}, 84 {parquet.Int32Value(5)}, 85 }, 86 }, 87 { 88 "merge 2 readers", 89 []parquet.RowReader{ 90 NewBatchReader([][]parquet.Row{ 91 {{parquet.Int32Value(1)}}, 92 {{parquet.Int32Value(3)}}, 93 {{parquet.Int32Value(5)}}, 94 }), 95 NewBatchReader([][]parquet.Row{ 96 {{parquet.Int32Value(2)}}, 97 {{parquet.Int32Value(4)}}, 98 {{parquet.Int32Value(6)}}, 99 }), 100 }, 101 []parquet.Row{ 102 {parquet.Int32Value(1)}, 103 {parquet.Int32Value(2)}, 104 {parquet.Int32Value(3)}, 105 {parquet.Int32Value(4)}, 106 {parquet.Int32Value(5)}, 107 {parquet.Int32Value(6)}, 108 }, 109 }, 110 { 111 "merge 3 readers 1 value", 112 []parquet.RowReader{ 113 NewBatchReader([][]parquet.Row{ 114 {{parquet.Int32Value(1)}}, 115 }), 116 NewBatchReader([][]parquet.Row{ 117 {{parquet.Int32Value(2)}}, 118 }), 119 NewBatchReader([][]parquet.Row{ 120 {{parquet.Int32Value(3)}}, 121 }), 122 }, 123 []parquet.Row{ 124 {parquet.Int32Value(1)}, 125 {parquet.Int32Value(2)}, 126 {parquet.Int32Value(3)}, 127 }, 128 }, 129 } { 130 tc := tc 131 t.Run(tc.name, func(t *testing.T) { 132 reader := NewMergeRowReader(tc.readers, parquet.Row{parquet.Int32Value(math.MaxInt32)}, func(r1, r2 parquet.Row) bool { 133 return r1[0].Int32() < r2[0].Int32() 134 }) 135 136 actual, err := ReadAllWithBufferSize(reader, bufferSize) 137 require.NoError(t, err) 138 require.Equal(t, tc.expected, actual) 139 }) 140 } 141 }) 142 } 143 } 144 145 func TestIteratorRowReader(t *testing.T) { 146 it := NewIteratorRowReader( 147 NewBufferedRowReaderIterator(NewBatchReader([][]parquet.Row{ 148 {{parquet.Int32Value(1)}, {parquet.Int32Value(2)}, {parquet.Int32Value(3)}}, 149 {{parquet.Int32Value(4)}, {parquet.Int32Value(5)}, {parquet.Int32Value(6)}}, 150 {{parquet.Int32Value(7)}, {parquet.Int32Value(8)}, {parquet.Int32Value(9)}}, 151 }), 4), 152 ) 153 actual, err := ReadAllWithBufferSize(it, 3) 154 require.NoError(t, err) 155 require.Equal(t, []parquet.Row{ 156 {parquet.Int32Value(1)}, 157 {parquet.Int32Value(2)}, 158 {parquet.Int32Value(3)}, 159 {parquet.Int32Value(4)}, 160 {parquet.Int32Value(5)}, 161 {parquet.Int32Value(6)}, 162 {parquet.Int32Value(7)}, 163 {parquet.Int32Value(8)}, 164 {parquet.Int32Value(9)}, 165 }, actual) 166 } 167 168 type SomeRow struct { 169 Col1 int 170 } 171 172 func BenchmarkBufferedRowReader(b *testing.B) { 173 buff := parquet.NewGenericBuffer[SomeRow]() 174 for i := 0; i < 1000000; i++ { 175 _, err := buff.Write([]SomeRow{{Col1: (i)}}) 176 if err != nil { 177 b.Fatal(err) 178 } 179 } 180 reader := NewBufferedRowReaderIterator(buff.Rows(), 100) 181 defer reader.Close() 182 b.ResetTimer() 183 for i := 0; i < b.N; i++ { 184 for reader.Next() { 185 _ = reader.At() 186 } 187 } 188 }