github.com/grafana/pyroscope@v1.18.0/pkg/phlaredb/symdb/block_reader_parquet.go (about) 1 //nolint:unused 2 package symdb 3 4 import ( 5 "context" 6 "errors" 7 "fmt" 8 "io" 9 10 "github.com/grafana/dskit/multierror" 11 "github.com/opentracing/opentracing-go" 12 otlog "github.com/opentracing/opentracing-go/log" 13 "github.com/parquet-go/parquet-go" 14 "golang.org/x/sync/errgroup" 15 16 "github.com/grafana/pyroscope/pkg/objstore" 17 parquetobj "github.com/grafana/pyroscope/pkg/objstore/parquet" 18 "github.com/grafana/pyroscope/pkg/phlaredb/block" 19 schemav1 "github.com/grafana/pyroscope/pkg/phlaredb/schemas/v1" 20 "github.com/grafana/pyroscope/pkg/util/refctr" 21 ) 22 23 // Used in v2. Left for compatibility. 24 25 type parquetTable[M schemav1.Models, P schemav1.Persister[M]] struct { 26 headers []RowRangeReference 27 bucket objstore.BucketReader 28 persister P 29 30 file *parquetobj.File 31 32 r refctr.Counter 33 s []M 34 } 35 36 const ( 37 // parquet.CopyRows uses hardcoded buffer size: 38 // defaultRowBufferSize = 42 39 inMemoryReaderRowsBufSize = 1 << 10 40 parquetReadBufferSize = 256 << 10 // 256KB 41 ) 42 43 func (t *parquetTable[M, P]) fetch(ctx context.Context) (err error) { 44 span, _ := opentracing.StartSpanFromContext(ctx, "parquetTable.fetch", opentracing.Tags{ 45 "table_name": t.persister.Name(), 46 "row_groups": len(t.headers), 47 }) 48 defer span.Finish() 49 return t.r.Inc(func() error { 50 var s uint32 51 for _, h := range t.headers { 52 s += h.Rows 53 } 54 buf := make([]parquet.Row, inMemoryReaderRowsBufSize) 55 t.s = make([]M, s) 56 var offset int 57 // TODO(kolesnikovae): Row groups could be fetched in parallel. 58 rgs := t.file.RowGroups() 59 for _, h := range t.headers { 60 span.LogFields( 61 otlog.Uint32("row_group", h.RowGroup), 62 otlog.Uint32("index_row", h.Index), 63 otlog.Uint32("rows", h.Rows), 64 ) 65 rg := rgs[h.RowGroup] 66 rows := rg.Rows() 67 if err := rows.SeekToRow(int64(h.Index)); err != nil { 68 return err 69 } 70 dst := t.s[offset : offset+int(h.Rows)] 71 if err := t.readRows(dst, buf, rows); err != nil { 72 return fmt.Errorf("reading row group from parquet file %q: %w", t.file.Path(), err) 73 } 74 offset += int(h.Rows) 75 } 76 return nil 77 }) 78 } 79 80 func (t *parquetTable[M, P]) readRows(dst []M, buf []parquet.Row, rows parquet.Rows) (err error) { 81 defer func() { 82 err = multierror.New(err, rows.Close()).Err() 83 }() 84 for i := 0; i < len(dst); { 85 n, err := rows.ReadRows(buf) 86 if n > 0 { 87 for _, row := range buf[:n] { 88 if i == len(dst) { 89 return nil 90 } 91 v, err := t.persister.Reconstruct(row) 92 if err != nil { 93 return err 94 } 95 dst[i] = v 96 i++ 97 } 98 } 99 if err != nil { 100 if errors.Is(err, io.EOF) { 101 return nil 102 } 103 return err 104 } 105 } 106 return nil 107 } 108 109 func (t *parquetTable[M, P]) slice() []M { return t.s } 110 111 func (t *parquetTable[M, P]) release() { 112 t.r.Dec(func() { 113 t.s = nil 114 }) 115 } 116 117 type parquetFiles struct { 118 locations parquetobj.File 119 mappings parquetobj.File 120 functions parquetobj.File 121 strings parquetobj.File 122 } 123 124 func (f *parquetFiles) Close() error { 125 return multierror.New( 126 f.locations.Close(), 127 f.mappings.Close(), 128 f.functions.Close(), 129 f.strings.Close()). 130 Err() 131 } 132 133 func openParquetFiles(ctx context.Context, r *Reader) error { 134 options := []parquet.FileOption{ 135 parquet.SkipBloomFilters(true), 136 parquet.FileReadMode(parquet.ReadModeAsync), 137 parquet.ReadBufferSize(parquetReadBufferSize), 138 } 139 files := new(parquetFiles) 140 m := map[string]*parquetobj.File{ 141 new(schemav1.LocationPersister).Name() + block.ParquetSuffix: &files.locations, 142 new(schemav1.MappingPersister).Name() + block.ParquetSuffix: &files.mappings, 143 new(schemav1.FunctionPersister).Name() + block.ParquetSuffix: &files.functions, 144 new(schemav1.StringPersister).Name() + block.ParquetSuffix: &files.strings, 145 } 146 g, ctx := errgroup.WithContext(ctx) 147 for n, fp := range m { 148 n := n 149 fp := fp 150 g.Go(func() error { 151 fm, err := r.lookupFile(n) 152 if err != nil { 153 return err 154 } 155 if err = fp.Open(ctx, r.bucket, fm, options...); err != nil { 156 return fmt.Errorf("opening file %q: %w", n, err) 157 } 158 return nil 159 }) 160 } 161 if err := g.Wait(); err != nil { 162 return err 163 } 164 r.parquetFiles = files 165 return nil 166 }