github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/reader.go (about)

     1  package parquet
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"io"
     7  	"reflect"
     8  )
     9  
    10  // A Reader reads Go values from parquet files.
    11  //
    12  // This example showcases a typical use of parquet readers:
    13  //
    14  //	reader := parquet.NewReader(file)
    15  //	rows := []RowType{}
    16  //	for {
    17  //		row := RowType{}
    18  //		err := reader.Read(&row)
    19  //		if err != nil {
    20  //			if err == io.EOF {
    21  //				break
    22  //			}
    23  //			...
    24  //		}
    25  //		rows = append(rows, row)
    26  //	}
    27  //	if err := reader.Close(); err != nil {
    28  //		...
    29  //	}
    30  //
    31  // Deprecated: For programs building with Go 1.18 or later, the
    32  // GenericReader[T] type supersedes this one.
    33  type Reader struct {
    34  	seen     reflect.Type // struct type for which the read view was last set up by Read
    35  	file     reader       // view used by ReadRows, SeekToRow, Schema and NumRows
    36  	read     reader       // view used by Read; re-initialized when the row type changes
    37  	rowIndex int64        // index of the next row returned by Read or ReadRows
    38  	rowbuf   []Row        // one-row scratch buffer reused across calls to Read
    39  }
    40  
    41  // NewReader constructs a parquet reader reading rows from the given
    42  // io.ReaderAt.
    43  //
    44  // In order to read parquet rows, the io.ReaderAt must be converted to a
    45  // parquet.File. If r is already a parquet.File it is used directly; otherwise,
    46  // the io.ReaderAt value is expected to either have a `Size() int64` method or
    47  // implement io.Seeker in order to determine its size.
    48  //
    49  // The function panics if the reader configuration is invalid. Programs that
    50  // cannot guarantee the validity of the options passed to NewReader should
    51  // construct the reader configuration independently prior to calling this
    52  // function:
    53  //
    54  //	config, err := parquet.NewReaderConfig(options...)
    55  //	if err != nil {
    56  //		// handle the configuration error
    57  //		...
    58  //	} else {
    59  //		// this call to create a reader is guaranteed not to panic
    60  //		reader := parquet.NewReader(input, config)
    61  //		...
    62  //	}
    63  func NewReader(input io.ReaderAt, options ...ReaderOption) *Reader {
    64  	c, err := NewReaderConfig(options...)
    65  	if err != nil {
    66  		panic(err)
    67  	}
    68  
    69  	f, err := openFile(input)
    70  	if err != nil {
    71  		panic(err)
    72  	}
    73  
    74  	r := &Reader{
    75  		file: reader{
    76  			schema:   f.schema,
    77  			rowGroup: fileRowGroupOf(f),
    78  		},
    79  	}
    80  
    81  	if c.Schema != nil {
    82  		r.file.schema = c.Schema
    83  		r.file.rowGroup = convertRowGroupTo(r.file.rowGroup, c.Schema)
    84  	}
    85  
    86  	r.read.init(r.file.schema, r.file.rowGroup)
    87  	return r
    88  }
    89  
    90  func openFile(input io.ReaderAt) (*File, error) {
    91  	f, _ := input.(*File)
    92  	if f != nil {
    93  		return f, nil
    94  	}
    95  	n, err := sizeOf(input)
    96  	if err != nil {
    97  		return nil, err
    98  	}
    99  	return OpenFile(input, n)
   100  }
   101  
   102  func fileRowGroupOf(f *File) RowGroup {
   103  	switch rowGroups := f.RowGroups(); len(rowGroups) {
   104  	case 0:
   105  		return newEmptyRowGroup(f.Schema())
   106  	case 1:
   107  		return rowGroups[0]
   108  	default:
   109  		// TODO: should we attempt to merge the row groups via MergeRowGroups
   110  		// to preserve the global order of sorting columns within the file?
   111  		return newMultiRowGroup(f.config.ReadMode, rowGroups...)
   112  	}
   113  }
   114  
   115  // NewRowGroupReader constructs a new Reader which reads rows from the RowGroup
   116  // passed as argument.
   117  func NewRowGroupReader(rowGroup RowGroup, options ...ReaderOption) *Reader {
   118  	c, err := NewReaderConfig(options...)
   119  	if err != nil {
   120  		panic(err)
   121  	}
   122  
   123  	if c.Schema != nil {
   124  		rowGroup = convertRowGroupTo(rowGroup, c.Schema)
   125  	}
   126  
   127  	r := &Reader{
   128  		file: reader{
   129  			schema:   rowGroup.Schema(),
   130  			rowGroup: rowGroup,
   131  		},
   132  	}
   133  
   134  	r.read.init(r.file.schema, r.file.rowGroup)
   135  	return r
   136  }
   137  
   138  func convertRowGroupTo(rowGroup RowGroup, schema *Schema) RowGroup {
   139  	if rowGroupSchema := rowGroup.Schema(); !nodesAreEqual(schema, rowGroupSchema) {
   140  		conv, err := Convert(schema, rowGroupSchema)
   141  		if err != nil {
   142  			// TODO: this looks like something we should not be panicking on,
   143  			// but the current NewReader API does not offer a mechanism to
   144  			// report errors.
   145  			panic(err)
   146  		}
   147  		rowGroup = ConvertRowGroup(rowGroup, conv)
   148  	}
   149  	return rowGroup
   150  }
   151  
   152  func sizeOf(r io.ReaderAt) (int64, error) {
   153  	switch f := r.(type) {
   154  	case interface{ Size() int64 }:
   155  		return f.Size(), nil
   156  	case io.Seeker:
   157  		off, err := f.Seek(0, io.SeekCurrent)
   158  		if err != nil {
   159  			return 0, err
   160  		}
   161  		end, err := f.Seek(0, io.SeekEnd)
   162  		if err != nil {
   163  			return 0, err
   164  		}
   165  		_, err = f.Seek(off, io.SeekStart)
   166  		return end, err
   167  	default:
   168  		return 0, fmt.Errorf("cannot determine length of %T", r)
   169  	}
   170  }
   171  
   172  // Reset repositions the reader at the beginning of the underlying parquet file.
   173  func (r *Reader) Reset() {
   174  	r.file.Reset()
   175  	r.read.Reset()
   176  	r.rowIndex = 0
   177  	clearRows(r.rowbuf)
   178  }
   179  
   180  // Read reads the next row from r. The type of the row must match the schema
   181  // of the underlying parquet file or an error will be returned.
   182  //
   183  // The method returns io.EOF when no more rows can be read from r.
   184  func (r *Reader) Read(row interface{}) error {
   185  	if rowType := dereference(reflect.TypeOf(row)); rowType.Kind() == reflect.Struct {
   186  		if r.seen != rowType {
   187  			if err := r.updateReadSchema(rowType); err != nil {
   188  				return fmt.Errorf("cannot read parquet row into go value of type %T: %w", row, err)
   189  			}
   190  		}
   191  	}
   192  
   193  	if err := r.read.SeekToRow(r.rowIndex); err != nil {
   194  		if errors.Is(err, io.ErrClosedPipe) {
   195  			return io.EOF
   196  		}
   197  		return fmt.Errorf("seeking reader to row %d: %w", r.rowIndex, err)
   198  	}
   199  
   200  	if cap(r.rowbuf) == 0 {
   201  		r.rowbuf = make([]Row, 1)
   202  	} else {
   203  		r.rowbuf = r.rowbuf[:1]
   204  	}
   205  
   206  	n, err := r.read.ReadRows(r.rowbuf[:])
   207  	if n == 0 {
   208  		return err
   209  	}
   210  
   211  	r.rowIndex++
   212  	return r.read.schema.Reconstruct(row, r.rowbuf[0])
   213  }
   214  
   215  func (r *Reader) updateReadSchema(rowType reflect.Type) error {
   216  	schema := schemaOf(rowType)
   217  
   218  	if nodesAreEqual(schema, r.file.schema) {
   219  		r.read.init(schema, r.file.rowGroup)
   220  	} else {
   221  		conv, err := Convert(schema, r.file.schema)
   222  		if err != nil {
   223  			return err
   224  		}
   225  		r.read.init(schema, ConvertRowGroup(r.file.rowGroup, conv))
   226  	}
   227  
   228  	r.seen = rowType
   229  	return nil
   230  }
   231  
   232  // ReadRows reads the next rows from r into the given Row buffer.
   233  //
   234  // The returned values are laid out in the order expected by the
   235  // parquet.(*Schema).Reconstruct method.
   236  //
   237  // The method returns io.EOF when no more rows can be read from r.
   238  func (r *Reader) ReadRows(rows []Row) (int, error) {
   239  	if err := r.file.SeekToRow(r.rowIndex); err != nil {
   240  		return 0, err
   241  	}
   242  	n, err := r.file.ReadRows(rows)
   243  	r.rowIndex += int64(n)
   244  	return n, err
   245  }
   246  
   247  // Schema returns the schema of rows read by r.
   248  func (r *Reader) Schema() *Schema { return r.file.schema }
   249  
   250  // NumRows returns the number of rows that can be read from r.
   251  func (r *Reader) NumRows() int64 { return r.file.rowGroup.NumRows() }
   252  
   253  // SeekToRow positions r at the given row index.
   254  func (r *Reader) SeekToRow(rowIndex int64) error {
   255  	if err := r.file.SeekToRow(rowIndex); err != nil {
   256  		return err
   257  	}
   258  	r.rowIndex = rowIndex
   259  	return nil
   260  }
   261  
   262  // Close closes the reader, preventing more rows from being read.
   263  func (r *Reader) Close() error {
   264  	if err := r.read.Close(); err != nil {
   265  		return err
   266  	}
   267  	if err := r.file.Close(); err != nil {
   268  		return err
   269  	}
   270  	return nil
   271  }
   272  
   273  // reader is a subtype used in the implementation of Reader to support the two
   274  // use cases of either reading rows calling the ReadRows method (where full rows
   275  // are read from the underlying parquet file), or calling the Read method to
   276  // read rows into Go values, potentially doing partial reads on a subset of the
   277  // columns due to using a converted row group view.
   278  type reader struct {
   279  	schema   *Schema  // schema of the rows produced by this view
   280  	rowGroup RowGroup // source of the rows; set to nil by Close
   281  	rows     Rows     // row reader over rowGroup, opened lazily by ReadRows
   282  	rowIndex int64    // index of the next row to read
   283  }
   284  
   285  func (r *reader) init(schema *Schema, rowGroup RowGroup) {
   286  	r.schema = schema
   287  	r.rowGroup = rowGroup
   288  	r.Reset()
   289  }
   290  
   291  func (r *reader) Reset() {
   292  	r.rowIndex = 0
   293  
   294  	if rows, ok := r.rows.(interface{ Reset() }); ok {
   295  		// This optimization works for the common case where the underlying type
   296  		// of the Rows instance is rowGroupRows, which should be true in most
   297  		// cases since even external implementations of the RowGroup interface
   298  		// can construct values of this type via the NewRowGroupRowReader
   299  		// function.
   300  		//
   301  		// Foreign implementations of the Rows interface may also define a Reset
   302  		// method in order to participate in this optimization.
   303  		rows.Reset()
   304  		return
   305  	}
   306  
   307  	if r.rows != nil {
   308  		r.rows.Close()
   309  		r.rows = nil
   310  	}
   311  }
   312  
   313  func (r *reader) ReadRows(rows []Row) (int, error) {
   314  	if r.rowGroup == nil {
   315  		return 0, io.EOF
   316  	}
   317  	if r.rows == nil {
   318  		r.rows = r.rowGroup.Rows()
   319  		if r.rowIndex > 0 {
   320  			if err := r.rows.SeekToRow(r.rowIndex); err != nil {
   321  				return 0, err
   322  			}
   323  		}
   324  	}
   325  	n, err := r.rows.ReadRows(rows)
   326  	r.rowIndex += int64(n)
   327  	return n, err
   328  }
   329  
   330  func (r *reader) SeekToRow(rowIndex int64) error {
   331  	if r.rowGroup == nil {
   332  		return io.ErrClosedPipe
   333  	}
   334  	if rowIndex != r.rowIndex {
   335  		if r.rows != nil {
   336  			if err := r.rows.SeekToRow(rowIndex); err != nil {
   337  				return err
   338  			}
   339  		}
   340  		r.rowIndex = rowIndex
   341  	}
   342  	return nil
   343  }
   344  
   345  func (r *reader) Close() (err error) {
   346  	r.rowGroup = nil
   347  	if r.rows != nil {
   348  		err = r.rows.Close()
   349  	}
   350  	return err
   351  }
   352  
   353  var ( // compile-time checks that the types satisfy the expected interfaces
   354  	_ Rows                = (*Reader)(nil) // exported reader
   355  	_ RowReaderWithSchema = (*Reader)(nil)
   356  
   357  	_ RowReader = (*reader)(nil) // internal view type
   358  	_ RowSeeker = (*reader)(nil)
   359  )