github.com/grafana/pyroscope@v1.18.0/pkg/objstore/parquet/file_test.go

package parquet

import (
	"context"
	"os"
	"path/filepath"
	"testing"

	"github.com/parquet-go/parquet-go"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/grafana/pyroscope/pkg/objstore"
	"github.com/grafana/pyroscope/pkg/objstore/providers/filesystem"
	"github.com/grafana/pyroscope/pkg/phlaredb/block"
)

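// readerAtCall records the offset and length of a single ReadAt call.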
type readerAtCall struct {
	offset int64
	size   int64
}

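// readerAtLogger wraps an objstore.ReaderAtCloser and records every ReadAt call made through it.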
type readerAtLogger struct {
	objstore.ReaderAtCloser
	calls []readerAtCall
}

func (r *readerAtLogger) ReadAt(p []byte, off int64) (n int, err error) {
	r.calls = append(r.calls, readerAtCall{offset: off, size: int64(len(p))})
	return r.ReaderAtCloser.ReadAt(p, off)
}

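// bucketReadRangeLogger wraps an objstore.BucketReader and keeps the most
// recently created reader so tests can inspect the ReadAt calls made against it.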
type bucketReadRangeLogger struct {
	objstore.BucketReader
	lastReaderAt *readerAtLogger
}

func (b *bucketReadRangeLogger) ReaderAt(ctx context.Context, filename string) (objstore.ReaderAtCloser, error) {
	readerAt, err := b.BucketReader.ReaderAt(ctx, filename)
	if err != nil {
		return nil, err
	}
	b.lastReaderAt = &readerAtLogger{
		ReaderAtCloser: readerAt,
	}
	return b.lastReaderAt, nil
}

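// newBucketReader returns a logging bucket reader backed by the local filesystem rooted at path.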
func newBucketReader(t *testing.T, path string) *bucketReadRangeLogger {
	bucketClient, err := filesystem.NewBucket(path)
	require.NoError(t, err)

	return &bucketReadRangeLogger{BucketReader: objstore.NewBucket(bucketClient)}
}

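// newParquetFile writes a parquet file with rowCount rows into a temporary
// directory and returns its block metadata together with a logging bucket
// reader pointed at that directory.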
func newParquetFile(t *testing.T, rowCount int) (block.File, *bucketReadRangeLogger) {
	batch := 10

	type Row struct{ N, NTimes2, NTimes3 int }

	rows := make([]Row, batch)
	pos := 0

	tempDir := t.TempDir()
	fileName := "test.parquet"

	output, err := os.Create(filepath.Join(tempDir, fileName))
	require.NoError(t, err)

	writer := parquet.NewGenericWriter[Row](output)

	// write rows in batches until rowCount rows have been produced.
	for {
		for idx := range rows {
			rows[idx].N = pos
			rows[idx].NTimes2 = pos * 2
			rows[idx].NTimes3 = pos * 3
			pos++

			if pos >= rowCount {
				rows = rows[:idx+1]
				break
			}
		}

		_, err = writer.Write(rows)
		require.NoError(t, err)

		if pos >= rowCount {
			break
		}
	}

	// closing the writer is necessary to flush buffers and write the file footer.
	require.NoError(t, writer.Close())

	// get the file size for the block metadata.
	fi, err := output.Stat()
	require.NoError(t, err)
	require.NoError(t, output.Close())

	return block.File{
		RelPath:   fileName,
		SizeBytes: uint64(fi.Size()),
		Parquet:   &block.ParquetFile{},
	}, newBucketReader(t, tempDir)
}

const (
	parquetReadBufferSize = 256 << 10 // 256 KiB
)

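// DefaultFileOptions returns the parquet read options used by these tests:
// bloom filters are skipped, reads are asynchronous, and the read buffer is 256 KiB.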
func DefaultFileOptions() []parquet.FileOption {
	return []parquet.FileOption{
		parquet.SkipBloomFilters(true), // we don't use bloom filters
		parquet.FileReadMode(parquet.ReadModeAsync),
		parquet.ReadBufferSize(parquetReadBufferSize),
	}
}

func TestFile_Open(t *testing.T) {
	var f File

	t.Run("small parquet file, ensure single request to bucket", func(t *testing.T) {
		meta, bucketReader := newParquetFile(t, 100)

		require.NoError(t, f.Open(context.Background(), bucketReader, meta, DefaultFileOptions()...))
		require.Len(t, bucketReader.lastReaderAt.calls, 1)

		// the parquet file is small, so the cache will actually hold all of it
		assert.Equal(t, int64(0), bucketReader.lastReaderAt.calls[0].offset)
		assert.Equal(t, int64(meta.SizeBytes), bucketReader.lastReaderAt.calls[0].size)
	})

	t.Run("bigger parquet file, ensure single request to bucket", func(t *testing.T) {
		meta, bucketReader := newParquetFile(t, 100_000)

		require.NoError(t, f.Open(context.Background(), bucketReader, meta, DefaultFileOptions()...))
		require.Len(t, bucketReader.lastReaderAt.calls, 1)

		// the bigger parquet file will use the minimum 32 KiB cache size
		assert.Equal(t, int64(meta.SizeBytes-(32*1024)), bucketReader.lastReaderAt.calls[0].offset)
		assert.Equal(t, int64(32*1024), bucketReader.lastReaderAt.calls[0].size)
	})
}