github.com/grafana/pyroscope@v1.18.0/pkg/objstore/parquet/file_test.go

package parquet

import (
	"context"
	"os"
	"path/filepath"
	"testing"

	"github.com/parquet-go/parquet-go"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/grafana/pyroscope/pkg/objstore"
	"github.com/grafana/pyroscope/pkg/objstore/providers/filesystem"
	"github.com/grafana/pyroscope/pkg/phlaredb/block"
)

// readerAtCall records a single ReadAt invocation: where the read started and
// how many bytes were requested.
type readerAtCall struct {
	offset int64
	size   int64
}

// readerAtLogger wraps an objstore.ReaderAtCloser and logs every ReadAt call,
// so tests can assert how many ranged reads were issued and over which ranges.
type readerAtLogger struct {
	objstore.ReaderAtCloser
	calls []readerAtCall
}

func (r *readerAtLogger) ReadAt(p []byte, off int64) (n int, err error) {
	r.calls = append(r.calls, readerAtCall{offset: off, size: int64(len(p))})
	return r.ReaderAtCloser.ReadAt(p, off)
}

// bucketReadRangeLogger wraps an objstore.BucketReader and keeps a handle to
// the most recently created readerAtLogger for inspection by tests.
type bucketReadRangeLogger struct {
	objstore.BucketReader
	lastReaderAt *readerAtLogger
}

func (b *bucketReadRangeLogger) ReaderAt(ctx context.Context, filename string) (objstore.ReaderAtCloser, error) {
	readerAt, err := b.BucketReader.ReaderAt(ctx, filename)
	b.lastReaderAt = &readerAtLogger{
		ReaderAtCloser: readerAt,
	}
	return b.lastReaderAt, err
}

func newBucketReader(t *testing.T, path string) *bucketReadRangeLogger {
	bucketClient, err := filesystem.NewBucket(path)
	require.NoError(t, err)

	return &bucketReadRangeLogger{BucketReader: objstore.NewBucket(bucketClient)}
}

// newParquetFile writes a parquet file with rowCount rows of three int columns
// into a temporary directory, and returns its block metadata together with a
// bucket reader that records all ranged reads against it.
func newParquetFile(t *testing.T, rowCount int) (block.File, *bucketReadRangeLogger) {
	batch := 10

	type Row struct{ N, NTimes2, NTimes3 int }

	rows := make([]Row, batch)
	pos := 0

	tempDir := t.TempDir()
	fileName := "test.parquet"

	output, err := os.Create(filepath.Join(tempDir, fileName))
	require.NoError(t, err)

	writer := parquet.NewGenericWriter[Row](output)

	// Write rows in batches until rowCount rows have been produced; the final
	// batch is truncated so exactly rowCount rows end up in the file.
	for {
		for idx := range rows {
			rows[idx].N = pos
			rows[idx].NTimes2 = pos * 2
			rows[idx].NTimes3 = pos * 3
			pos++

			if pos >= rowCount {
				rows = rows[:idx+1]
				break
			}
		}

		_, err = writer.Write(rows)
		require.NoError(t, err)

		if pos >= rowCount {
			break
		}
	}

	// Closing the writer is necessary to flush buffers and write the file footer.
	require.NoError(t, writer.Close())

	// Stat the output file to record its final size in the block metadata.
	fi, err := output.Stat()
	require.NoError(t, err)

	return block.File{
		RelPath:   fileName,
		SizeBytes: uint64(fi.Size()),
		Parquet:   &block.ParquetFile{},
	}, newBucketReader(t, tempDir)
}

const (
	parquetReadBufferSize = 256 << 10 // 256KiB
)

func DefaultFileOptions() []parquet.FileOption {
	return []parquet.FileOption{
		parquet.SkipBloomFilters(true), // we don't use bloom filters
		parquet.FileReadMode(parquet.ReadModeAsync),
		parquet.ReadBufferSize(parquetReadBufferSize),
	}
}

func TestFile_Open(t *testing.T) {
	var f File

	t.Run("small parquet file, ensure single request to bucket", func(t *testing.T) {
		meta, bucketReader := newParquetFile(t, 100)

		require.NoError(t, f.Open(context.Background(), bucketReader, meta, DefaultFileOptions()...))
		require.Len(t, bucketReader.lastReaderAt.calls, 1)

		// The parquet file is small, so the cache actually holds all of it:
		// a single read covers the whole file.
		assert.Equal(t, int64(0), bucketReader.lastReaderAt.calls[0].offset)
		assert.Equal(t, int64(meta.SizeBytes), bucketReader.lastReaderAt.calls[0].size)
	})

	t.Run("bigger parquet file, ensure single request to bucket", func(t *testing.T) {
		meta, bucketReader := newParquetFile(t, 100_000)

		require.NoError(t, f.Open(context.Background(), bucketReader, meta, DefaultFileOptions()...))
		require.Len(t, bucketReader.lastReaderAt.calls, 1)

		// The file exceeds the minimum 32KiB cache size, so only the trailing
		// 32KiB of the file (which contains the footer) is fetched.
		assert.Equal(t, int64(meta.SizeBytes-(32*1024)), bucketReader.lastReaderAt.calls[0].offset)
		assert.Equal(t, int64(32*1024), bucketReader.lastReaderAt.calls[0].size)
	})
}
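
// The two subtests above pin down the read pattern File.Open is expected to
// produce. A minimal sketch of that offset arithmetic, assuming the fixed
// 32KiB minimum prefetch the tests assert; footerPrefetchRange is a
// hypothetical helper for illustration only, not part of the package under test.
func footerPrefetchRange(fileSize int64) (offset, size int64) {
	const minPrefetch = 32 * 1024 // 32KiB, the minimum cache size asserted above
	if fileSize <= minPrefetch {
		// Small files are fetched in full with a single read from offset 0.
		return 0, fileSize
	}
	// Larger files: fetch only the trailing 32KiB, which contains the footer.
	return fileSize - minPrefetch, minPrefetch
}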