github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/reader.go (about)

     1  package parquet
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"io"
     7  	"reflect"
     8  )
     9  
// A Reader reads Go values from parquet files.
//
// This example showcases a typical use of parquet readers:
//
//	reader := parquet.NewReader(file)
//	rows := []RowType{}
//	for {
//		row := RowType{}
//		err := reader.Read(&row)
//		if err != nil {
//			if err == io.EOF {
//				break
//			}
//			...
//		}
//		rows = append(rows, row)
//	}
//	if err := reader.Close(); err != nil {
//		...
//	}
//
// Deprecated: For programs building with Go 1.18 or later, the GenericReader[T]
// type supersedes this one.
type Reader struct {
	// seen is the Go type most recently installed by updateReadSchema; Read
	// compares against it to avoid recomputing the read schema on every call.
	seen reflect.Type
	// file backs ReadRows, SeekToRow, Schema and NumRows.
	file reader
	// read backs Read; updateReadSchema re-initializes it when Read is called
	// with a struct type that differs from seen.
	read reader
	// rowIndex is the index of the next row to read. It is shared by Read and
	// ReadRows, which both seek their underlying reader to it before reading.
	rowIndex int64
	// rowbuf is a one-row buffer reused by Read across calls.
	rowbuf []Row
}
    41  
    42  // NewReader constructs a parquet reader reading rows from the given
    43  // io.ReaderAt.
    44  //
    45  // In order to read parquet rows, the io.ReaderAt must be converted to a
    46  // parquet.File. If r is already a parquet.File it is used directly; otherwise,
    47  // the io.ReaderAt value is expected to either have a `Size() int64` method or
    48  // implement io.Seeker in order to determine its size.
    49  //
    50  // The function panics if the reader configuration is invalid. Programs that
    51  // cannot guarantee the validity of the options passed to NewReader should
    52  // construct the reader configuration independently prior to calling this
    53  // function:
    54  //
    55  //	config, err := parquet.NewReaderConfig(options...)
    56  //	if err != nil {
    57  //		// handle the configuration error
    58  //		...
    59  //	} else {
    60  //		// this call to create a reader is guaranteed not to panic
    61  //		reader := parquet.NewReader(input, config)
    62  //		...
    63  //	}
    64  //
    65  func NewReader(input io.ReaderAt, options ...ReaderOption) *Reader {
    66  	c, err := NewReaderConfig(options...)
    67  	if err != nil {
    68  		panic(err)
    69  	}
    70  
    71  	f, err := openFile(input)
    72  	if err != nil {
    73  		panic(err)
    74  	}
    75  
    76  	r := &Reader{
    77  		file: reader{
    78  			schema:   f.schema,
    79  			rowGroup: fileRowGroupOf(f),
    80  		},
    81  	}
    82  
    83  	if c.Schema != nil {
    84  		r.file.schema = c.Schema
    85  		r.file.rowGroup = convertRowGroupTo(r.file.rowGroup, c.Schema)
    86  	}
    87  
    88  	r.read.init(r.file.schema, r.file.rowGroup)
    89  	return r
    90  }
    91  
    92  func openFile(input io.ReaderAt) (*File, error) {
    93  	f, _ := input.(*File)
    94  	if f != nil {
    95  		return f, nil
    96  	}
    97  	n, err := sizeOf(input)
    98  	if err != nil {
    99  		return nil, err
   100  	}
   101  	return OpenFile(input, n)
   102  }
   103  
   104  func fileRowGroupOf(f *File) RowGroup {
   105  	switch rowGroups := f.RowGroups(); len(rowGroups) {
   106  	case 0:
   107  		return newEmptyRowGroup(f.Schema())
   108  	case 1:
   109  		return rowGroups[0]
   110  	default:
   111  		// TODO: should we attempt to merge the row groups via MergeRowGroups
   112  		// to preserve the global order of sorting columns within the file?
   113  		return MultiRowGroup(rowGroups...)
   114  	}
   115  }
   116  
   117  // NewRowGroupReader constructs a new Reader which reads rows from the RowGroup
   118  // passed as argument.
   119  func NewRowGroupReader(rowGroup RowGroup, options ...ReaderOption) *Reader {
   120  	c, err := NewReaderConfig(options...)
   121  	if err != nil {
   122  		panic(err)
   123  	}
   124  
   125  	if c.Schema != nil {
   126  		rowGroup = convertRowGroupTo(rowGroup, c.Schema)
   127  	}
   128  
   129  	r := &Reader{
   130  		file: reader{
   131  			schema:   rowGroup.Schema(),
   132  			rowGroup: rowGroup,
   133  		},
   134  	}
   135  
   136  	r.read.init(r.file.schema, r.file.rowGroup)
   137  	return r
   138  }
   139  
   140  func convertRowGroupTo(rowGroup RowGroup, schema *Schema) RowGroup {
   141  	if rowGroupSchema := rowGroup.Schema(); !nodesAreEqual(schema, rowGroupSchema) {
   142  		conv, err := Convert(schema, rowGroupSchema)
   143  		if err != nil {
   144  			// TODO: this looks like something we should not be panicking on,
   145  			// but the current NewReader API does not offer a mechanism to
   146  			// report errors.
   147  			panic(err)
   148  		}
   149  		rowGroup = ConvertRowGroup(rowGroup, conv)
   150  	}
   151  	return rowGroup
   152  }
   153  
   154  func sizeOf(r io.ReaderAt) (int64, error) {
   155  	switch f := r.(type) {
   156  	case interface{ Size() int64 }:
   157  		return f.Size(), nil
   158  	case io.Seeker:
   159  		off, err := f.Seek(0, io.SeekCurrent)
   160  		if err != nil {
   161  			return 0, err
   162  		}
   163  		end, err := f.Seek(0, io.SeekEnd)
   164  		if err != nil {
   165  			return 0, err
   166  		}
   167  		_, err = f.Seek(off, io.SeekStart)
   168  		return end, err
   169  	default:
   170  		return 0, fmt.Errorf("cannot determine length of %T", r)
   171  	}
   172  }
   173  
   174  // Reset repositions the reader at the beginning of the underlying parquet file.
   175  func (r *Reader) Reset() {
   176  	r.file.Reset()
   177  	r.read.Reset()
   178  	r.rowIndex = 0
   179  	clearRows(r.rowbuf)
   180  }
   181  
   182  // Read reads the next row from r. The type of the row must match the schema
   183  // of the underlying parquet file or an error will be returned.
   184  //
   185  // The method returns io.EOF when no more rows can be read from r.
   186  func (r *Reader) Read(row interface{}) error {
   187  	if rowType := dereference(reflect.TypeOf(row)); rowType.Kind() == reflect.Struct {
   188  		if r.seen != rowType {
   189  			if err := r.updateReadSchema(rowType); err != nil {
   190  				return fmt.Errorf("cannot read parquet row into go value of type %T: %w", row, err)
   191  			}
   192  		}
   193  	}
   194  
   195  	if err := r.read.SeekToRow(r.rowIndex); err != nil {
   196  		if errors.Is(err, io.ErrClosedPipe) {
   197  			return io.EOF
   198  		}
   199  		return fmt.Errorf("seeking reader to row %d: %w", r.rowIndex, err)
   200  	}
   201  
   202  	if cap(r.rowbuf) == 0 {
   203  		r.rowbuf = make([]Row, 1)
   204  	} else {
   205  		r.rowbuf = r.rowbuf[:1]
   206  	}
   207  
   208  	n, err := r.read.ReadRows(r.rowbuf[:])
   209  	if n == 0 {
   210  		return err
   211  	}
   212  
   213  	r.rowIndex++
   214  	return r.read.schema.Reconstruct(row, r.rowbuf[0])
   215  }
   216  
   217  func (r *Reader) updateReadSchema(rowType reflect.Type) error {
   218  	schema := schemaOf(rowType)
   219  
   220  	if nodesAreEqual(schema, r.file.schema) {
   221  		r.read.init(schema, r.file.rowGroup)
   222  	} else {
   223  		conv, err := Convert(schema, r.file.schema)
   224  		if err != nil {
   225  			return err
   226  		}
   227  		r.read.init(schema, ConvertRowGroup(r.file.rowGroup, conv))
   228  	}
   229  
   230  	r.seen = rowType
   231  	return nil
   232  }
   233  
   234  // ReadRows reads the next rows from r into the given Row buffer.
   235  //
   236  // The returned values are laid out in the order expected by the
   237  // parquet.(*Schema).Reconstruct method.
   238  //
   239  // The method returns io.EOF when no more rows can be read from r.
   240  func (r *Reader) ReadRows(rows []Row) (int, error) {
   241  	if err := r.file.SeekToRow(r.rowIndex); err != nil {
   242  		return 0, err
   243  	}
   244  	n, err := r.file.ReadRows(rows)
   245  	r.rowIndex += int64(n)
   246  	return n, err
   247  }
   248  
   249  // Schema returns the schema of rows read by r.
   250  func (r *Reader) Schema() *Schema { return r.file.schema }
   251  
   252  // NumRows returns the number of rows that can be read from r.
   253  func (r *Reader) NumRows() int64 { return r.file.rowGroup.NumRows() }
   254  
   255  // SeekToRow positions r at the given row index.
   256  func (r *Reader) SeekToRow(rowIndex int64) error {
   257  	if err := r.file.SeekToRow(rowIndex); err != nil {
   258  		return err
   259  	}
   260  	r.rowIndex = rowIndex
   261  	return nil
   262  }
   263  
   264  // Close closes the reader, preventing more rows from being read.
   265  func (r *Reader) Close() error {
   266  	if err := r.read.Close(); err != nil {
   267  		return err
   268  	}
   269  	if err := r.file.Close(); err != nil {
   270  		return err
   271  	}
   272  	return nil
   273  }
   274  
// reader is a subtype used in the implementation of Reader to support the two
// use cases of either reading rows calling the ReadRows method (where full
// rows are read from the underlying parquet file), or calling the Read method
// to read rows into Go values, potentially doing partial reads on a subset of
// the columns due to using a converted row group view.
type reader struct {
	// schema is the schema installed by init for the rows of rowGroup.
	schema *Schema
	// rowGroup is the source of rows; Close sets it to nil, which ReadRows and
	// SeekToRow treat as the closed state.
	rowGroup RowGroup
	// rows is opened lazily from rowGroup by the first call to ReadRows.
	rows Rows
	// rowIndex is the index of the next row to read; when rows has not been
	// opened yet, it is applied by ReadRows right after opening it.
	rowIndex int64
}
   286  
   287  func (r *reader) init(schema *Schema, rowGroup RowGroup) {
   288  	r.schema = schema
   289  	r.rowGroup = rowGroup
   290  	r.Reset()
   291  }
   292  
   293  func (r *reader) Reset() {
   294  	r.rowIndex = 0
   295  
   296  	if rows, ok := r.rows.(interface{ Reset() }); ok {
   297  		// This optimization works for the common case where the underlying type
   298  		// of the Rows instance is rowGroupRows, which should be true in most
   299  		// cases since even external implementations of the RowGroup interface
   300  		// can construct values of this type via the NewRowGroupRowReader
   301  		// function.
   302  		//
   303  		// Foreign implementations of the Rows interface may also define a Reset
   304  		// method in order to participate in this optimization.
   305  		rows.Reset()
   306  		return
   307  	}
   308  
   309  	if r.rows != nil {
   310  		r.rows.Close()
   311  		r.rows = nil
   312  	}
   313  }
   314  
   315  func (r *reader) ReadRows(rows []Row) (int, error) {
   316  	if r.rowGroup == nil {
   317  		return 0, io.EOF
   318  	}
   319  	if r.rows == nil {
   320  		r.rows = r.rowGroup.Rows()
   321  		if r.rowIndex > 0 {
   322  			if err := r.rows.SeekToRow(r.rowIndex); err != nil {
   323  				return 0, err
   324  			}
   325  		}
   326  	}
   327  	n, err := r.rows.ReadRows(rows)
   328  	r.rowIndex += int64(n)
   329  	return n, err
   330  }
   331  
   332  func (r *reader) SeekToRow(rowIndex int64) error {
   333  	if r.rowGroup == nil {
   334  		return io.ErrClosedPipe
   335  	}
   336  	if rowIndex != r.rowIndex {
   337  		if r.rows != nil {
   338  			if err := r.rows.SeekToRow(rowIndex); err != nil {
   339  				return err
   340  			}
   341  		}
   342  		r.rowIndex = rowIndex
   343  	}
   344  	return nil
   345  }
   346  
   347  func (r *reader) Close() (err error) {
   348  	r.rowGroup = nil
   349  	if r.rows != nil {
   350  		err = r.rows.Close()
   351  	}
   352  	return err
   353  }
   354  
   355  var (
   356  	_ Rows                = (*Reader)(nil)
   357  	_ RowReaderWithSchema = (*Reader)(nil)
   358  
   359  	_ RowReader = (*reader)(nil)
   360  	_ RowSeeker = (*reader)(nil)
   361  )