github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/reader.go (about) 1 package parquet 2 3 import ( 4 "errors" 5 "fmt" 6 "io" 7 "reflect" 8 ) 9 10 // Deprecated: A Reader reads Go values from parquet files. 11 // 12 // This example showcases a typical use of parquet readers: 13 // 14 // reader := parquet.NewReader(file) 15 // rows := []RowType{} 16 // for { 17 // row := RowType{} 18 // err := reader.Read(&row) 19 // if err != nil { 20 // if err == io.EOF { 21 // break 22 // } 23 // ... 24 // } 25 // rows = append(rows, row) 26 // } 27 // if err := reader.Close(); err != nil { 28 // ... 29 // } 30 // 31 // 32 // For programs building with Go 1.18 or later, the GenericReader[T] type 33 // supersedes this one. 34 type Reader struct { 35 seen reflect.Type 36 file reader 37 read reader 38 rowIndex int64 39 rowbuf []Row 40 } 41 42 // NewReader constructs a parquet reader reading rows from the given 43 // io.ReaderAt. 44 // 45 // In order to read parquet rows, the io.ReaderAt must be converted to a 46 // parquet.File. If r is already a parquet.File it is used directly; otherwise, 47 // the io.ReaderAt value is expected to either have a `Size() int64` method or 48 // implement io.Seeker in order to determine its size. 49 // 50 // The function panics if the reader configuration is invalid. Programs that 51 // cannot guarantee the validity of the options passed to NewReader should 52 // construct the reader configuration independently prior to calling this 53 // function: 54 // 55 // config, err := parquet.NewReaderConfig(options...) 56 // if err != nil { 57 // // handle the configuration error 58 // ... 59 // } else { 60 // // this call to create a reader is guaranteed not to panic 61 // reader := parquet.NewReader(input, config) 62 // ... 63 // } 64 // 65 func NewReader(input io.ReaderAt, options ...ReaderOption) *Reader { 66 c, err := NewReaderConfig(options...) 67 if err != nil { 68 panic(err) 69 } 70 71 f, err := openFile(input) 72 if err != nil { 73 panic(err) 74 } 75 76 r := &Reader{ 77 file: reader{ 78 schema: f.schema, 79 rowGroup: fileRowGroupOf(f), 80 }, 81 } 82 83 if c.Schema != nil { 84 r.file.schema = c.Schema 85 r.file.rowGroup = convertRowGroupTo(r.file.rowGroup, c.Schema) 86 } 87 88 r.read.init(r.file.schema, r.file.rowGroup) 89 return r 90 } 91 92 func openFile(input io.ReaderAt) (*File, error) { 93 f, _ := input.(*File) 94 if f != nil { 95 return f, nil 96 } 97 n, err := sizeOf(input) 98 if err != nil { 99 return nil, err 100 } 101 return OpenFile(input, n) 102 } 103 104 func fileRowGroupOf(f *File) RowGroup { 105 switch rowGroups := f.RowGroups(); len(rowGroups) { 106 case 0: 107 return newEmptyRowGroup(f.Schema()) 108 case 1: 109 return rowGroups[0] 110 default: 111 // TODO: should we attempt to merge the row groups via MergeRowGroups 112 // to preserve the global order of sorting columns within the file? 113 return MultiRowGroup(rowGroups...) 114 } 115 } 116 117 // NewRowGroupReader constructs a new Reader which reads rows from the RowGroup 118 // passed as argument. 119 func NewRowGroupReader(rowGroup RowGroup, options ...ReaderOption) *Reader { 120 c, err := NewReaderConfig(options...) 121 if err != nil { 122 panic(err) 123 } 124 125 if c.Schema != nil { 126 rowGroup = convertRowGroupTo(rowGroup, c.Schema) 127 } 128 129 r := &Reader{ 130 file: reader{ 131 schema: rowGroup.Schema(), 132 rowGroup: rowGroup, 133 }, 134 } 135 136 r.read.init(r.file.schema, r.file.rowGroup) 137 return r 138 } 139 140 func convertRowGroupTo(rowGroup RowGroup, schema *Schema) RowGroup { 141 if rowGroupSchema := rowGroup.Schema(); !nodesAreEqual(schema, rowGroupSchema) { 142 conv, err := Convert(schema, rowGroupSchema) 143 if err != nil { 144 // TODO: this looks like something we should not be panicking on, 145 // but the current NewReader API does not offer a mechanism to 146 // report errors. 147 panic(err) 148 } 149 rowGroup = ConvertRowGroup(rowGroup, conv) 150 } 151 return rowGroup 152 } 153 154 func sizeOf(r io.ReaderAt) (int64, error) { 155 switch f := r.(type) { 156 case interface{ Size() int64 }: 157 return f.Size(), nil 158 case io.Seeker: 159 off, err := f.Seek(0, io.SeekCurrent) 160 if err != nil { 161 return 0, err 162 } 163 end, err := f.Seek(0, io.SeekEnd) 164 if err != nil { 165 return 0, err 166 } 167 _, err = f.Seek(off, io.SeekStart) 168 return end, err 169 default: 170 return 0, fmt.Errorf("cannot determine length of %T", r) 171 } 172 } 173 174 // Reset repositions the reader at the beginning of the underlying parquet file. 175 func (r *Reader) Reset() { 176 r.file.Reset() 177 r.read.Reset() 178 r.rowIndex = 0 179 clearRows(r.rowbuf) 180 } 181 182 // Read reads the next row from r. The type of the row must match the schema 183 // of the underlying parquet file or an error will be returned. 184 // 185 // The method returns io.EOF when no more rows can be read from r. 186 func (r *Reader) Read(row interface{}) error { 187 if rowType := dereference(reflect.TypeOf(row)); rowType.Kind() == reflect.Struct { 188 if r.seen != rowType { 189 if err := r.updateReadSchema(rowType); err != nil { 190 return fmt.Errorf("cannot read parquet row into go value of type %T: %w", row, err) 191 } 192 } 193 } 194 195 if err := r.read.SeekToRow(r.rowIndex); err != nil { 196 if errors.Is(err, io.ErrClosedPipe) { 197 return io.EOF 198 } 199 return fmt.Errorf("seeking reader to row %d: %w", r.rowIndex, err) 200 } 201 202 if cap(r.rowbuf) == 0 { 203 r.rowbuf = make([]Row, 1) 204 } else { 205 r.rowbuf = r.rowbuf[:1] 206 } 207 208 n, err := r.read.ReadRows(r.rowbuf[:]) 209 if n == 0 { 210 return err 211 } 212 213 r.rowIndex++ 214 return r.read.schema.Reconstruct(row, r.rowbuf[0]) 215 } 216 217 func (r *Reader) updateReadSchema(rowType reflect.Type) error { 218 schema := schemaOf(rowType) 219 220 if nodesAreEqual(schema, r.file.schema) { 221 r.read.init(schema, r.file.rowGroup) 222 } else { 223 conv, err := Convert(schema, r.file.schema) 224 if err != nil { 225 return err 226 } 227 r.read.init(schema, ConvertRowGroup(r.file.rowGroup, conv)) 228 } 229 230 r.seen = rowType 231 return nil 232 } 233 234 // ReadRows reads the next rows from r into the given Row buffer. 235 // 236 // The returned values are laid out in the order expected by the 237 // parquet.(*Schema).Reconstruct method. 238 // 239 // The method returns io.EOF when no more rows can be read from r. 240 func (r *Reader) ReadRows(rows []Row) (int, error) { 241 if err := r.file.SeekToRow(r.rowIndex); err != nil { 242 return 0, err 243 } 244 n, err := r.file.ReadRows(rows) 245 r.rowIndex += int64(n) 246 return n, err 247 } 248 249 // Schema returns the schema of rows read by r. 250 func (r *Reader) Schema() *Schema { return r.file.schema } 251 252 // NumRows returns the number of rows that can be read from r. 253 func (r *Reader) NumRows() int64 { return r.file.rowGroup.NumRows() } 254 255 // SeekToRow positions r at the given row index. 256 func (r *Reader) SeekToRow(rowIndex int64) error { 257 if err := r.file.SeekToRow(rowIndex); err != nil { 258 return err 259 } 260 r.rowIndex = rowIndex 261 return nil 262 } 263 264 // Close closes the reader, preventing more rows from being read. 265 func (r *Reader) Close() error { 266 if err := r.read.Close(); err != nil { 267 return err 268 } 269 if err := r.file.Close(); err != nil { 270 return err 271 } 272 return nil 273 } 274 275 // reader is a subtype used in the implementation of Reader to support the two 276 // use cases of either reading rows calling the ReadRow method (where full rows 277 // are read from the underlying parquet file), or calling the Read method to 278 // read rows into Go values, potentially doing partial reads on a subset of the 279 // columns due to using a converted row group view. 280 type reader struct { 281 schema *Schema 282 rowGroup RowGroup 283 rows Rows 284 rowIndex int64 285 } 286 287 func (r *reader) init(schema *Schema, rowGroup RowGroup) { 288 r.schema = schema 289 r.rowGroup = rowGroup 290 r.Reset() 291 } 292 293 func (r *reader) Reset() { 294 r.rowIndex = 0 295 296 if rows, ok := r.rows.(interface{ Reset() }); ok { 297 // This optimization works for the common case where the underlying type 298 // of the Rows instance is rowGroupRows, which should be true in most 299 // cases since even external implementations of the RowGroup interface 300 // can construct values of this type via the NewRowGroupRowReader 301 // function. 302 // 303 // Foreign implementations of the Rows interface may also define a Reset 304 // method in order to participate in this optimization. 305 rows.Reset() 306 return 307 } 308 309 if r.rows != nil { 310 r.rows.Close() 311 r.rows = nil 312 } 313 } 314 315 func (r *reader) ReadRows(rows []Row) (int, error) { 316 if r.rowGroup == nil { 317 return 0, io.EOF 318 } 319 if r.rows == nil { 320 r.rows = r.rowGroup.Rows() 321 if r.rowIndex > 0 { 322 if err := r.rows.SeekToRow(r.rowIndex); err != nil { 323 return 0, err 324 } 325 } 326 } 327 n, err := r.rows.ReadRows(rows) 328 r.rowIndex += int64(n) 329 return n, err 330 } 331 332 func (r *reader) SeekToRow(rowIndex int64) error { 333 if r.rowGroup == nil { 334 return io.ErrClosedPipe 335 } 336 if rowIndex != r.rowIndex { 337 if r.rows != nil { 338 if err := r.rows.SeekToRow(rowIndex); err != nil { 339 return err 340 } 341 } 342 r.rowIndex = rowIndex 343 } 344 return nil 345 } 346 347 func (r *reader) Close() (err error) { 348 r.rowGroup = nil 349 if r.rows != nil { 350 err = r.rows.Close() 351 } 352 return err 353 } 354 355 var ( 356 _ Rows = (*Reader)(nil) 357 _ RowReaderWithSchema = (*Reader)(nil) 358 359 _ RowReader = (*reader)(nil) 360 _ RowSeeker = (*reader)(nil) 361 )