github.com/grafana/pyroscope@v1.18.0/pkg/phlaredb/symdb/block_reader_parquet.go (about)

     1  //nolint:unused
     2  package symdb
     3  
     4  import (
     5  	"context"
     6  	"errors"
     7  	"fmt"
     8  	"io"
     9  
    10  	"github.com/grafana/dskit/multierror"
    11  	"github.com/opentracing/opentracing-go"
    12  	otlog "github.com/opentracing/opentracing-go/log"
    13  	"github.com/parquet-go/parquet-go"
    14  	"golang.org/x/sync/errgroup"
    15  
    16  	"github.com/grafana/pyroscope/pkg/objstore"
    17  	parquetobj "github.com/grafana/pyroscope/pkg/objstore/parquet"
    18  	"github.com/grafana/pyroscope/pkg/phlaredb/block"
    19  	schemav1 "github.com/grafana/pyroscope/pkg/phlaredb/schemas/v1"
    20  	"github.com/grafana/pyroscope/pkg/util/refctr"
    21  )
    22  
    23  // Used in v2. Left for compatibility.
    24  
// parquetTable loads a set of row ranges from one parquet file into an
// in-memory slice of models, sharing a single copy between concurrent
// users via reference counting (see fetch/release).
type parquetTable[M schemav1.Models, P schemav1.Persister[M]] struct {
	headers   []RowRangeReference // row ranges to materialize from the file
	bucket    objstore.BucketReader
	persister P // reconstructs M values from raw parquet rows

	file *parquetobj.File

	// r guards s: rows are read on the first Inc and dropped on the last Dec.
	r refctr.Counter
	s []M // materialized rows; set by fetch, cleared by release
}
    35  
const (
	// parquet.CopyRows uses hardcoded buffer size:
	// defaultRowBufferSize = 42
	// inMemoryReaderRowsBufSize is the scratch row-buffer length used by
	// readRows; far larger than parquet's default to reduce ReadRows calls.
	inMemoryReaderRowsBufSize = 1 << 10
	parquetReadBufferSize     = 256 << 10 // 256KB
)
    42  
    43  func (t *parquetTable[M, P]) fetch(ctx context.Context) (err error) {
    44  	span, _ := opentracing.StartSpanFromContext(ctx, "parquetTable.fetch", opentracing.Tags{
    45  		"table_name": t.persister.Name(),
    46  		"row_groups": len(t.headers),
    47  	})
    48  	defer span.Finish()
    49  	return t.r.Inc(func() error {
    50  		var s uint32
    51  		for _, h := range t.headers {
    52  			s += h.Rows
    53  		}
    54  		buf := make([]parquet.Row, inMemoryReaderRowsBufSize)
    55  		t.s = make([]M, s)
    56  		var offset int
    57  		// TODO(kolesnikovae): Row groups could be fetched in parallel.
    58  		rgs := t.file.RowGroups()
    59  		for _, h := range t.headers {
    60  			span.LogFields(
    61  				otlog.Uint32("row_group", h.RowGroup),
    62  				otlog.Uint32("index_row", h.Index),
    63  				otlog.Uint32("rows", h.Rows),
    64  			)
    65  			rg := rgs[h.RowGroup]
    66  			rows := rg.Rows()
    67  			if err := rows.SeekToRow(int64(h.Index)); err != nil {
    68  				return err
    69  			}
    70  			dst := t.s[offset : offset+int(h.Rows)]
    71  			if err := t.readRows(dst, buf, rows); err != nil {
    72  				return fmt.Errorf("reading row group from parquet file %q: %w", t.file.Path(), err)
    73  			}
    74  			offset += int(h.Rows)
    75  		}
    76  		return nil
    77  	})
    78  }
    79  
    80  func (t *parquetTable[M, P]) readRows(dst []M, buf []parquet.Row, rows parquet.Rows) (err error) {
    81  	defer func() {
    82  		err = multierror.New(err, rows.Close()).Err()
    83  	}()
    84  	for i := 0; i < len(dst); {
    85  		n, err := rows.ReadRows(buf)
    86  		if n > 0 {
    87  			for _, row := range buf[:n] {
    88  				if i == len(dst) {
    89  					return nil
    90  				}
    91  				v, err := t.persister.Reconstruct(row)
    92  				if err != nil {
    93  					return err
    94  				}
    95  				dst[i] = v
    96  				i++
    97  			}
    98  		}
    99  		if err != nil {
   100  			if errors.Is(err, io.EOF) {
   101  				return nil
   102  			}
   103  			return err
   104  		}
   105  	}
   106  	return nil
   107  }
   108  
   109  func (t *parquetTable[M, P]) slice() []M { return t.s }
   110  
   111  func (t *parquetTable[M, P]) release() {
   112  	t.r.Dec(func() {
   113  		t.s = nil
   114  	})
   115  }
   116  
// parquetFiles bundles the four per-table parquet files backing a v2
// symdb block; they are opened together (openParquetFiles) and closed
// together (Close).
type parquetFiles struct {
	locations parquetobj.File
	mappings  parquetobj.File
	functions parquetobj.File
	strings   parquetobj.File
}
   123  
   124  func (f *parquetFiles) Close() error {
   125  	return multierror.New(
   126  		f.locations.Close(),
   127  		f.mappings.Close(),
   128  		f.functions.Close(),
   129  		f.strings.Close()).
   130  		Err()
   131  }
   132  
   133  func openParquetFiles(ctx context.Context, r *Reader) error {
   134  	options := []parquet.FileOption{
   135  		parquet.SkipBloomFilters(true),
   136  		parquet.FileReadMode(parquet.ReadModeAsync),
   137  		parquet.ReadBufferSize(parquetReadBufferSize),
   138  	}
   139  	files := new(parquetFiles)
   140  	m := map[string]*parquetobj.File{
   141  		new(schemav1.LocationPersister).Name() + block.ParquetSuffix: &files.locations,
   142  		new(schemav1.MappingPersister).Name() + block.ParquetSuffix:  &files.mappings,
   143  		new(schemav1.FunctionPersister).Name() + block.ParquetSuffix: &files.functions,
   144  		new(schemav1.StringPersister).Name() + block.ParquetSuffix:   &files.strings,
   145  	}
   146  	g, ctx := errgroup.WithContext(ctx)
   147  	for n, fp := range m {
   148  		n := n
   149  		fp := fp
   150  		g.Go(func() error {
   151  			fm, err := r.lookupFile(n)
   152  			if err != nil {
   153  				return err
   154  			}
   155  			if err = fp.Open(ctx, r.bucket, fm, options...); err != nil {
   156  				return fmt.Errorf("opening file %q: %w", n, err)
   157  			}
   158  			return nil
   159  		})
   160  	}
   161  	if err := g.Wait(); err != nil {
   162  		return err
   163  	}
   164  	r.parquetFiles = files
   165  	return nil
   166  }