github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/reader_go18.go (about) 1 //go:build go1.18 2 3 package parquet 4 5 import ( 6 "io" 7 "reflect" 8 ) 9 10 // GenericReader is similar to a Reader but uses a type parameter to define the 11 // Go type representing the schema of rows being read. 12 // 13 // See GenericWriter for details about the benefits over the classic Reader API. 14 type GenericReader[T any] struct { 15 base Reader 16 read readFunc[T] 17 } 18 19 // NewGenericReader is like NewReader but returns GenericReader[T] suited to write 20 // rows of Go type T. 21 // 22 // The type parameter T should be a map, struct, or any. Any other types will 23 // cause a panic at runtime. Type checking is a lot more effective when the 24 // generic parameter is a struct type, using map and interface types is somewhat 25 // similar to using a Writer. 26 // 27 // If the option list may explicitly declare a schema, it must be compatible 28 // with the schema generated from T. 29 func NewGenericReader[T any](input io.ReaderAt, options ...ReaderOption) *GenericReader[T] { 30 c, err := NewReaderConfig(options...) 31 if err != nil { 32 panic(err) 33 } 34 35 f, err := openFile(input) 36 if err != nil { 37 panic(err) 38 } 39 40 rowGroup := fileRowGroupOf(f) 41 42 t := typeOf[T]() 43 if c.Schema == nil { 44 if t == nil { 45 c.Schema = rowGroup.Schema() 46 } else { 47 c.Schema = schemaOf(dereference(t)) 48 } 49 } 50 51 r := &GenericReader[T]{ 52 base: Reader{ 53 file: reader{ 54 schema: c.Schema, 55 rowGroup: rowGroup, 56 }, 57 }, 58 } 59 60 if !nodesAreEqual(c.Schema, f.schema) { 61 r.base.file.rowGroup = convertRowGroupTo(r.base.file.rowGroup, c.Schema) 62 } 63 64 r.base.read.init(r.base.file.schema, r.base.file.rowGroup) 65 r.read = readFuncOf[T](t, r.base.file.schema) 66 return r 67 } 68 69 func NewGenericRowGroupReader[T any](rowGroup RowGroup, options ...ReaderOption) *GenericReader[T] { 70 c, err := NewReaderConfig(options...) 71 if err != nil { 72 panic(err) 73 } 74 75 t := typeOf[T]() 76 if c.Schema == nil { 77 if t == nil { 78 c.Schema = rowGroup.Schema() 79 } else { 80 c.Schema = schemaOf(dereference(t)) 81 } 82 } 83 84 r := &GenericReader[T]{ 85 base: Reader{ 86 file: reader{ 87 schema: c.Schema, 88 rowGroup: rowGroup, 89 }, 90 }, 91 } 92 93 if !nodesAreEqual(c.Schema, rowGroup.Schema()) { 94 r.base.file.rowGroup = convertRowGroupTo(r.base.file.rowGroup, c.Schema) 95 } 96 97 r.base.read.init(r.base.file.schema, r.base.file.rowGroup) 98 r.read = readFuncOf[T](t, r.base.file.schema) 99 return r 100 } 101 102 func (r *GenericReader[T]) Reset() { 103 r.base.Reset() 104 } 105 106 // Read reads the next rows from the reader into the given rows slice up to len(rows). 107 // 108 // The returned values are safe to reuse across Read calls and do not share 109 // memory with the reader's underlying page buffers. 110 // 111 // The method returns the number of rows read and io.EOF when no more rows 112 // can be read from the reader. 113 func (r *GenericReader[T]) Read(rows []T) (int, error) { 114 return r.read(r, rows) 115 } 116 117 func (r *GenericReader[T]) ReadRows(rows []Row) (int, error) { 118 return r.base.ReadRows(rows) 119 } 120 121 func (r *GenericReader[T]) Schema() *Schema { 122 return r.base.Schema() 123 } 124 125 func (r *GenericReader[T]) NumRows() int64 { 126 return r.base.NumRows() 127 } 128 129 func (r *GenericReader[T]) SeekToRow(rowIndex int64) error { 130 return r.base.SeekToRow(rowIndex) 131 } 132 133 func (r *GenericReader[T]) Close() error { 134 return r.base.Close() 135 } 136 137 // readRows reads the next rows from the reader into the given rows slice up to len(rows). 138 // 139 // The returned values are safe to reuse across readRows calls and do not share 140 // memory with the reader's underlying page buffers. 141 // 142 // The method returns the number of rows read and io.EOF when no more rows 143 // can be read from the reader. 144 func (r *GenericReader[T]) readRows(rows []T) (int, error) { 145 nRequest := len(rows) 146 if cap(r.base.rowbuf) < nRequest { 147 r.base.rowbuf = make([]Row, nRequest) 148 } else { 149 r.base.rowbuf = r.base.rowbuf[:nRequest] 150 } 151 152 var n, nTotal int 153 var err error 154 for { 155 // ReadRows reads the minimum remaining rows in a column page across all columns 156 // of the underlying reader, unless the length of the slice passed to it is smaller. 157 // In that case, ReadRows will read the number of rows equal to the length of the 158 // given slice argument. We limit that length to never be more than requested 159 // because sequential reads can cross page boundaries. 160 n, err = r.base.ReadRows(r.base.rowbuf[:nRequest-nTotal]) 161 if n > 0 { 162 schema := r.base.Schema() 163 164 for i, row := range r.base.rowbuf[:n] { 165 if err2 := schema.Reconstruct(&rows[nTotal+i], row); err2 != nil { 166 return nTotal + i, err2 167 } 168 } 169 } 170 nTotal += n 171 if n == 0 || nTotal == nRequest || err != nil { 172 break 173 } 174 } 175 176 return nTotal, err 177 } 178 179 var ( 180 _ Rows = (*GenericReader[any])(nil) 181 _ RowReaderWithSchema = (*Reader)(nil) 182 183 _ Rows = (*GenericReader[struct{}])(nil) 184 _ RowReaderWithSchema = (*GenericReader[struct{}])(nil) 185 186 _ Rows = (*GenericReader[map[struct{}]struct{}])(nil) 187 _ RowReaderWithSchema = (*GenericReader[map[struct{}]struct{}])(nil) 188 ) 189 190 type readFunc[T any] func(*GenericReader[T], []T) (int, error) 191 192 func readFuncOf[T any](t reflect.Type, schema *Schema) readFunc[T] { 193 if t == nil { 194 return (*GenericReader[T]).readRows 195 } 196 switch t.Kind() { 197 case reflect.Interface, reflect.Map: 198 return (*GenericReader[T]).readRows 199 200 case reflect.Struct: 201 return (*GenericReader[T]).readRows 202 203 case reflect.Pointer: 204 if e := t.Elem(); e.Kind() == reflect.Struct { 205 return (*GenericReader[T]).readRows 206 } 207 } 208 panic("cannot create reader for values of type " + t.String()) 209 }