github.com/fraugster/parquet-go@v0.12.0/file_reader.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io"
     7  	"runtime"
     8  
     9  	"github.com/fraugster/parquet-go/parquet"
    10  	"github.com/fraugster/parquet-go/parquetschema"
    11  )
    12  
// FileReader is used to read data from a parquet file. Always use NewFileReader or a related
// function to create such an object.
type FileReader struct {
	meta         *parquet.FileMetaData // file metadata, read from the file or supplied via WithFileMetaData
	schemaReader *schema               // schema built from meta; row data and row counts are obtained through it
	reader       io.ReadSeeker         // the underlying data source handed to the constructor

	rowGroupPosition int   // 1-based position of the current row group (index rowGroupPosition-1); 0 until one is loaded
	currentRecord    int64 // number of rows already handed out from the current row group
	skipRowGroup     bool  // when true, the next read advances to the next row group first

	ctx context.Context // default context used by the methods that do not take a context argument

	allocTracker *allocTracker // set via WithMaximumMemorySize; nil if that option was not used
}
    28  
    29  // NewFileReaderWithOptions creates a new FileReader. You can provide a list of FileReaderOptions to configure
    30  // aspects of its behaviour, such as limiting the columns to read, the file metadata to use, or the
    31  // context to use. For a full list of options, please see the type FileReaderOption.
    32  func NewFileReaderWithOptions(r io.ReadSeeker, readerOptions ...FileReaderOption) (*FileReader, error) {
    33  	opts := newFileReaderOptions()
    34  	if err := opts.apply(readerOptions); err != nil {
    35  		return nil, err
    36  	}
    37  
    38  	var err error
    39  	if opts.metaData == nil {
    40  		opts.metaData, err = ReadFileMetaData(r, true)
    41  		if err != nil {
    42  			return nil, fmt.Errorf("reading file meta data failed: %w", err)
    43  		}
    44  	}
    45  
    46  	schema, err := makeSchema(opts.metaData, opts.validateCRC, opts.allocTracker)
    47  	if err != nil {
    48  		return nil, fmt.Errorf("creating schema failed: %w", err)
    49  	}
    50  
    51  	schema.SetSelectedColumns(opts.columns...)
    52  	// Reset the reader to the beginning of the file
    53  	if _, err := r.Seek(4, io.SeekStart); err != nil {
    54  		return nil, err
    55  	}
    56  	return &FileReader{
    57  		meta:         opts.metaData,
    58  		schemaReader: schema,
    59  		reader:       r,
    60  		ctx:          opts.ctx,
    61  		allocTracker: opts.allocTracker,
    62  	}, nil
    63  }
    64  
// FileReaderOption is an option that can be passed on to NewFileReaderWithOptions when
// creating a new parquet file reader. An option receives the options struct to modify
// and may return an error, which aborts the creation of the reader.
type FileReaderOption func(*fileReaderOptions) error
// fileReaderOptions collects the settings that FileReaderOption functions can
// modify before a FileReader is constructed.
type fileReaderOptions struct {
	metaData     *parquet.FileMetaData // caller-supplied metadata; when nil it is read from the file
	ctx          context.Context       // default context for the reader
	columns      []ColumnPath          // columns to select on the schema
	validateCRC  bool                  // passed to makeSchema; enables CRC32 page checksum validation
	allocTracker *allocTracker         // passed to makeSchema; nil unless WithMaximumMemorySize was used
}
    75  
    76  func newFileReaderOptions() *fileReaderOptions {
    77  	return &fileReaderOptions{ctx: context.Background()}
    78  }
    79  
    80  func (o *fileReaderOptions) apply(opts []FileReaderOption) error {
    81  	for _, f := range opts {
    82  		if err := f(o); err != nil {
    83  			return err
    84  		}
    85  	}
    86  	return nil
    87  }
    88  
    89  // WithReaderContext configures a custom context for the file reader. If none
    90  // is provided, context.Background() is used as a default.
    91  func WithReaderContext(ctx context.Context) FileReaderOption {
    92  	return func(opts *fileReaderOptions) error {
    93  		opts.ctx = ctx
    94  		return nil
    95  	}
    96  }
    97  
    98  // WithFileMetaData allows you to provide your own file metadata. If none
    99  // is set with this option, the file reader will read it from the parquet
   100  // file.
   101  func WithFileMetaData(metaData *parquet.FileMetaData) FileReaderOption {
   102  	return func(opts *fileReaderOptions) error {
   103  		opts.metaData = metaData
   104  		return nil
   105  	}
   106  }
   107  
   108  // WithColumns limits the columns which are read. If none are set, then
   109  // all columns will be read by the parquet file reader.
   110  //
   111  // Deprecated: use WithColumnPaths instead.
   112  func WithColumns(columns ...string) FileReaderOption {
   113  	return func(opts *fileReaderOptions) error {
   114  		parsedCols := []ColumnPath{}
   115  		for _, c := range columns {
   116  			parsedCols = append(parsedCols, parseColumnPath(c))
   117  		}
   118  		opts.columns = parsedCols
   119  		return nil
   120  	}
   121  }
   122  
   123  // WithColumnPaths limits the columns which are read. If none are set, then
   124  // all columns will be read by the parquet file reader.
   125  func WithColumnPaths(columns ...ColumnPath) FileReaderOption {
   126  	return func(opts *fileReaderOptions) error {
   127  		opts.columns = columns
   128  		return nil
   129  	}
   130  }
   131  
   132  // WithCRC32Validation allows you to configure whether CRC32 page checksums will
   133  // be validated when they're read. By default, checksum validation is disabled.
   134  func WithCRC32Validation(enable bool) FileReaderOption {
   135  	return func(opts *fileReaderOptions) error {
   136  		opts.validateCRC = enable
   137  		return nil
   138  	}
   139  }
   140  
   141  // WithMaximumMemorySize allows you to configure a maximum limit in terms of memory
   142  // that shall be allocated when reading this file. If the amount of memory gets over
   143  // this limit, further function calls will fail.
   144  func WithMaximumMemorySize(maxSizeBytes uint64) FileReaderOption {
   145  	return func(opts *fileReaderOptions) error {
   146  		opts.allocTracker = newAllocTracker(maxSizeBytes)
   147  		return nil
   148  	}
   149  }
   150  
// NewFileReader creates a new FileReader. You can limit the columns that are read by providing
// the names of the specific columns to read using dotted notation. If no columns are provided,
// then all columns are read.
//
// It is a shorthand for NewFileReaderWithOptions(r, WithColumns(columns...)).
func NewFileReader(r io.ReadSeeker, columns ...string) (*FileReader, error) {
	return NewFileReaderWithOptions(r, WithColumns(columns...))
}
   157  
// NewFileReaderWithContext creates a new FileReader. You can limit the columns that are read by providing
// the names of the specific columns to read using dotted notation. If no columns are provided,
// then all columns are read. The provided context.Context overrides the default context (which is a
// context.Background()) for use in other methods of the *FileReader type.
//
// Deprecated: use the function NewFileReaderWithOptions and the option WithReaderContext instead.
func NewFileReaderWithContext(ctx context.Context, r io.ReadSeeker, columns ...string) (*FileReader, error) {
	return NewFileReaderWithOptions(r, WithReaderContext(ctx), WithColumns(columns...))
}
   167  
// NewFileReaderWithMetaData creates a new FileReader with custom file meta data. You can limit the columns that
// are read by providing the names of the specific columns to read using dotted notation. If no columns are provided,
// then all columns are read. If meta is nil, the meta data is read from r instead.
//
// Deprecated: use the function NewFileReaderWithOptions and the option WithFileMetaData instead.
func NewFileReaderWithMetaData(r io.ReadSeeker, meta *parquet.FileMetaData, columns ...string) (*FileReader, error) {
	return NewFileReaderWithOptions(r, WithFileMetaData(meta), WithColumns(columns...))
}
   176  
   177  func (*FileReader) recover(errp *error) {
   178  	if e := recover(); e != nil {
   179  		if _, ok := e.(runtime.Error); ok {
   180  			panic(e)
   181  		}
   182  		*errp = e.(error)
   183  	}
   184  }
   185  
// SeekToRowGroup seeks to a particular row group, identified by its index,
// using the reader's default context. It delegates to SeekToRowGroupWithContext.
//
// NOTE(review): the position appears to be 1-based — after a successful seek
// to position n, CurrentRowGroup reads index n-1 — confirm against callers.
func (f *FileReader) SeekToRowGroup(rowGroupPosition int) (err error) {
	defer f.recover(&err)
	return f.SeekToRowGroupWithContext(f.ctx, rowGroupPosition)
}
   191  
   192  // SeekToRowGroupWithContext seeks to a particular row group, identified by its index.
   193  func (f *FileReader) SeekToRowGroupWithContext(ctx context.Context, rowGroupPosition int) (err error) {
   194  	defer f.recover(&err)
   195  	f.rowGroupPosition = rowGroupPosition - 1
   196  	f.currentRecord = 0
   197  	return f.readRowGroup(ctx)
   198  }
   199  
   200  // readRowGroup read the next row group into memory
   201  func (f *FileReader) readRowGroup(ctx context.Context) error {
   202  	if len(f.meta.RowGroups) <= f.rowGroupPosition {
   203  		return io.EOF
   204  	}
   205  	f.rowGroupPosition++
   206  	return f.readRowGroupData(ctx) //, f.reader, f.schemaReader, f.meta.RowGroups[f.rowGroupPosition-1])
   207  }
   208  
   209  // CurrentRowGroup returns information about the current row group.
   210  func (f *FileReader) CurrentRowGroup() *parquet.RowGroup {
   211  	if f == nil || f.meta == nil || f.meta.RowGroups == nil || f.rowGroupPosition-1 >= len(f.meta.RowGroups) {
   212  		return nil
   213  	}
   214  	return f.meta.RowGroups[f.rowGroupPosition-1]
   215  }
   216  
// RowGroupCount returns the number of row groups in the parquet file, as
// listed in the file's meta data.
func (f *FileReader) RowGroupCount() int {
	return len(f.meta.RowGroups)
}
   221  
// NumRows returns the number of rows in the parquet file. This information is taken directly
// from the file's meta data; no row data is read to compute it.
func (f *FileReader) NumRows() int64 {
	return f.meta.NumRows
}
   227  
   228  func (f *FileReader) advanceIfNeeded(ctx context.Context) error {
   229  	if f.rowGroupPosition == 0 || f.currentRecord >= f.schemaReader.rowGroupNumRecords() || f.skipRowGroup {
   230  		if err := f.readRowGroup(ctx); err != nil {
   231  			f.skipRowGroup = true
   232  			return err
   233  		}
   234  		f.currentRecord = 0
   235  		f.skipRowGroup = false
   236  	}
   237  
   238  	return nil
   239  }
   240  
// RowGroupNumRows returns the number of rows in the current RowGroup, using the
// reader's default context. It delegates to RowGroupNumRowsWithContext.
func (f *FileReader) RowGroupNumRows() (int64, error) {
	return f.RowGroupNumRowsWithContext(f.ctx)
}
   245  
   246  // RowGroupNumRowsWithContext returns the number of rows in the current RowGroup.
   247  func (f *FileReader) RowGroupNumRowsWithContext(ctx context.Context) (numRecords int64, err error) {
   248  	defer f.recover(&err)
   249  
   250  	if err := f.advanceIfNeeded(ctx); err != nil {
   251  		return 0, err
   252  	}
   253  
   254  	return f.schemaReader.rowGroupNumRecords(), nil
   255  }
   256  
// NextRow reads the next row from the parquet file, using the reader's default context.
// If required, it will load the next row group. It delegates to NextRowWithContext.
func (f *FileReader) NextRow() (map[string]interface{}, error) {
	return f.NextRowWithContext(f.ctx)
}
   261  
   262  // NextRowWithContext reads the next row from the parquet file. If required, it will load the next row group.
   263  func (f *FileReader) NextRowWithContext(ctx context.Context) (row map[string]interface{}, err error) {
   264  	defer f.recover(&err)
   265  
   266  	if err := f.advanceIfNeeded(ctx); err != nil {
   267  		return nil, err
   268  	}
   269  
   270  	f.currentRecord++
   271  	return f.schemaReader.getData()
   272  }
   273  
// SkipRowGroup skips the currently loaded row group and advances to the next row group.
// It only sets a flag; the next row group is loaded by the next call that
// needs row group data (such as NextRow or PreLoad).
func (f *FileReader) SkipRowGroup() {
	f.skipRowGroup = true
}
   278  
// PreLoad is used to load the row group if required, using the reader's default context.
// It does nothing if the row group is already loaded. It delegates to PreLoadWithContext.
func (f *FileReader) PreLoad() error {
	return f.PreLoadWithContext(f.ctx)
}
   283  
// PreLoadWithContext is used to load the row group if required. It does nothing if the row
// group is already loaded. Panics carrying an error that occur while loading are
// returned as the error result.
func (f *FileReader) PreLoadWithContext(ctx context.Context) (err error) {
	defer f.recover(&err)
	return f.advanceIfNeeded(ctx)
}
   289  
// MetaData returns a map of metadata key-value pairs stored in the parquet file.
// Entries without a value are omitted. A new map is built on every call.
func (f *FileReader) MetaData() map[string]string {
	return keyValueMetaDataToMap(f.meta.KeyValueMetadata)
}
   294  
// ColumnMetaData returns a map of metadata key-value pairs for the provided column in the current
// row group. The column name has to be provided in its dotted notation.
//
// Deprecated: use ColumnMetaDataByPath instead.
func (f *FileReader) ColumnMetaData(colName string) (map[string]string, error) {
	return f.ColumnMetaDataByPath(parseColumnPath(colName))
}
   302  
   303  // ColumnMetaData returns a map of metadata key-value pairs for the provided column in the current
   304  // row group. The column is provided as ColumnPath.
   305  func (f *FileReader) ColumnMetaDataByPath(path ColumnPath) (metaData map[string]string, err error) {
   306  	defer f.recover(&err)
   307  	for _, col := range f.CurrentRowGroup().Columns {
   308  		if path.Equal(ColumnPath(col.MetaData.PathInSchema)) {
   309  			return keyValueMetaDataToMap(col.MetaData.KeyValueMetadata), nil
   310  		}
   311  	}
   312  	return nil, fmt.Errorf("column %q not found", path.flatName())
   313  }
   314  
   315  // SetSelectedColumns sets the columns which are read. By default, all columns
   316  // will be read.
   317  //
   318  // Deprecated: use SetSelectedColumnsByPath instead.
   319  func (f *FileReader) SetSelectedColumns(cols ...string) {
   320  	parsedCols := []ColumnPath{}
   321  	for _, c := range cols {
   322  		parsedCols = append(parsedCols, parseColumnPath(c))
   323  	}
   324  	f.schemaReader.SetSelectedColumns(parsedCols...)
   325  }
   326  
// SetSelectedColumnsByPath sets the columns which are read, given as column
// paths. The selection is passed on to the schema reader unchanged.
func (f *FileReader) SetSelectedColumnsByPath(cols ...ColumnPath) {
	f.schemaReader.SetSelectedColumns(cols...)
}
   330  
// Columns returns the list of columns, as reported by the schema reader.
func (f *FileReader) Columns() []*Column {
	return f.schemaReader.Columns()
}
   335  
// GetColumnByName returns a column identified by name. If the column doesn't exist,
// the method returns nil. The lookup is delegated to the schema reader.
func (f *FileReader) GetColumnByName(name string) *Column {
	return f.schemaReader.GetColumnByName(name)
}
   341  
// GetColumnByPath returns a column identified by its path. If the column doesn't exist,
// nil is returned. The lookup is delegated to the schema reader.
func (f *FileReader) GetColumnByPath(path ColumnPath) *Column {
	return f.schemaReader.GetColumnByPath(path)
}
   347  
// GetSchemaDefinition returns the current schema definition, as held by the
// schema reader.
func (f *FileReader) GetSchemaDefinition() *parquetschema.SchemaDefinition {
	return f.schemaReader.GetSchemaDefinition()
}
   352  
   353  func keyValueMetaDataToMap(kvMetaData []*parquet.KeyValue) map[string]string {
   354  	data := make(map[string]string)
   355  	for _, kv := range kvMetaData {
   356  		if kv.Value != nil {
   357  			data[kv.Key] = *kv.Value
   358  		}
   359  	}
   360  	return data
   361  }