github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/reader_go18.go (about)

     1  //go:build go1.18
     2  
     3  package parquet
     4  
     5  import (
     6  	"io"
     7  	"reflect"
     8  )
     9  
    10  // GenericReader is similar to a Reader but uses a type parameter to define the
    11  // Go type representing the schema of rows being read.
    12  //
    13  // See GenericWriter for details about the benefits over the classic Reader API.
    14  type GenericReader[T any] struct {
    15  	base Reader
    16  	read readFunc[T]
    17  }
    18  
    19  // NewGenericReader is like NewReader but returns GenericReader[T] suited to write
    20  // rows of Go type T.
    21  //
    22  // The type parameter T should be a map, struct, or any. Any other types will
    23  // cause a panic at runtime. Type checking is a lot more effective when the
    24  // generic parameter is a struct type, using map and interface types is somewhat
    25  // similar to using a Writer.
    26  //
    27  // If the option list may explicitly declare a schema, it must be compatible
    28  // with the schema generated from T.
    29  func NewGenericReader[T any](input io.ReaderAt, options ...ReaderOption) *GenericReader[T] {
    30  	c, err := NewReaderConfig(options...)
    31  	if err != nil {
    32  		panic(err)
    33  	}
    34  
    35  	f, err := openFile(input)
    36  	if err != nil {
    37  		panic(err)
    38  	}
    39  
    40  	rowGroup := fileRowGroupOf(f)
    41  
    42  	t := typeOf[T]()
    43  	if c.Schema == nil {
    44  		if t == nil {
    45  			c.Schema = rowGroup.Schema()
    46  		} else {
    47  			c.Schema = schemaOf(dereference(t))
    48  		}
    49  	}
    50  
    51  	r := &GenericReader[T]{
    52  		base: Reader{
    53  			file: reader{
    54  				schema:   c.Schema,
    55  				rowGroup: rowGroup,
    56  			},
    57  		},
    58  	}
    59  
    60  	if !nodesAreEqual(c.Schema, f.schema) {
    61  		r.base.file.rowGroup = convertRowGroupTo(r.base.file.rowGroup, c.Schema)
    62  	}
    63  
    64  	r.base.read.init(r.base.file.schema, r.base.file.rowGroup)
    65  	r.read = readFuncOf[T](t, r.base.file.schema)
    66  	return r
    67  }
    68  
    69  func NewGenericRowGroupReader[T any](rowGroup RowGroup, options ...ReaderOption) *GenericReader[T] {
    70  	c, err := NewReaderConfig(options...)
    71  	if err != nil {
    72  		panic(err)
    73  	}
    74  
    75  	t := typeOf[T]()
    76  	if c.Schema == nil {
    77  		if t == nil {
    78  			c.Schema = rowGroup.Schema()
    79  		} else {
    80  			c.Schema = schemaOf(dereference(t))
    81  		}
    82  	}
    83  
    84  	r := &GenericReader[T]{
    85  		base: Reader{
    86  			file: reader{
    87  				schema:   c.Schema,
    88  				rowGroup: rowGroup,
    89  			},
    90  		},
    91  	}
    92  
    93  	if !nodesAreEqual(c.Schema, rowGroup.Schema()) {
    94  		r.base.file.rowGroup = convertRowGroupTo(r.base.file.rowGroup, c.Schema)
    95  	}
    96  
    97  	r.base.read.init(r.base.file.schema, r.base.file.rowGroup)
    98  	r.read = readFuncOf[T](t, r.base.file.schema)
    99  	return r
   100  }
   101  
   102  func (r *GenericReader[T]) Reset() {
   103  	r.base.Reset()
   104  }
   105  
   106  // Read reads the next rows from the reader into the given rows slice up to len(rows).
   107  //
   108  // The returned values are safe to reuse across Read calls and do not share
   109  // memory with the reader's underlying page buffers.
   110  //
   111  // The method returns the number of rows read and io.EOF when no more rows
   112  // can be read from the reader.
   113  func (r *GenericReader[T]) Read(rows []T) (int, error) {
   114  	return r.read(r, rows)
   115  }
   116  
   117  func (r *GenericReader[T]) ReadRows(rows []Row) (int, error) {
   118  	return r.base.ReadRows(rows)
   119  }
   120  
   121  func (r *GenericReader[T]) Schema() *Schema {
   122  	return r.base.Schema()
   123  }
   124  
   125  func (r *GenericReader[T]) NumRows() int64 {
   126  	return r.base.NumRows()
   127  }
   128  
   129  func (r *GenericReader[T]) SeekToRow(rowIndex int64) error {
   130  	return r.base.SeekToRow(rowIndex)
   131  }
   132  
   133  func (r *GenericReader[T]) Close() error {
   134  	return r.base.Close()
   135  }
   136  
   137  // readRows reads the next rows from the reader into the given rows slice up to len(rows).
   138  //
   139  // The returned values are safe to reuse across readRows calls and do not share
   140  // memory with the reader's underlying page buffers.
   141  //
   142  // The method returns the number of rows read and io.EOF when no more rows
   143  // can be read from the reader.
   144  func (r *GenericReader[T]) readRows(rows []T) (int, error) {
   145  	nRequest := len(rows)
   146  	if cap(r.base.rowbuf) < nRequest {
   147  		r.base.rowbuf = make([]Row, nRequest)
   148  	} else {
   149  		r.base.rowbuf = r.base.rowbuf[:nRequest]
   150  	}
   151  
   152  	var n, nTotal int
   153  	var err error
   154  	for {
   155  		// ReadRows reads the minimum remaining rows in a column page across all columns
   156  		// of the underlying reader, unless the length of the slice passed to it is smaller.
   157  		// In that case, ReadRows will read the number of rows equal to the length of the
   158  		// given slice argument. We limit that length to never be more than requested
   159  		// because sequential reads can cross page boundaries.
   160  		n, err = r.base.ReadRows(r.base.rowbuf[:nRequest-nTotal])
   161  		if n > 0 {
   162  			schema := r.base.Schema()
   163  
   164  			for i, row := range r.base.rowbuf[:n] {
   165  				if err2 := schema.Reconstruct(&rows[nTotal+i], row); err2 != nil {
   166  					return nTotal + i, err2
   167  				}
   168  			}
   169  		}
   170  		nTotal += n
   171  		if n == 0 || nTotal == nRequest || err != nil {
   172  			break
   173  		}
   174  	}
   175  
   176  	return nTotal, err
   177  }
   178  
   179  var (
   180  	_ Rows                = (*GenericReader[any])(nil)
   181  	_ RowReaderWithSchema = (*Reader)(nil)
   182  
   183  	_ Rows                = (*GenericReader[struct{}])(nil)
   184  	_ RowReaderWithSchema = (*GenericReader[struct{}])(nil)
   185  
   186  	_ Rows                = (*GenericReader[map[struct{}]struct{}])(nil)
   187  	_ RowReaderWithSchema = (*GenericReader[map[struct{}]struct{}])(nil)
   188  )
   189  
   190  type readFunc[T any] func(*GenericReader[T], []T) (int, error)
   191  
   192  func readFuncOf[T any](t reflect.Type, schema *Schema) readFunc[T] {
   193  	if t == nil {
   194  		return (*GenericReader[T]).readRows
   195  	}
   196  	switch t.Kind() {
   197  	case reflect.Interface, reflect.Map:
   198  		return (*GenericReader[T]).readRows
   199  
   200  	case reflect.Struct:
   201  		return (*GenericReader[T]).readRows
   202  
   203  	case reflect.Pointer:
   204  		if e := t.Elem(); e.Kind() == reflect.Struct {
   205  			return (*GenericReader[T]).readRows
   206  		}
   207  	}
   208  	panic("cannot create reader for values of type " + t.String())
   209  }