github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/reader.go (about) 1 package parquet 2 3 import ( 4 "errors" 5 "fmt" 6 "io" 7 "reflect" 8 ) 9 10 // GenericReader is similar to a Reader but uses a type parameter to define the 11 // Go type representing the schema of rows being read. 12 // 13 // See GenericWriter for details about the benefits over the classic Reader API. 14 type GenericReader[T any] struct { 15 base Reader 16 read readFunc[T] 17 } 18 19 // NewGenericReader is like NewReader but returns GenericReader[T] suited to write 20 // rows of Go type T. 21 // 22 // The type parameter T should be a map, struct, or any. Any other types will 23 // cause a panic at runtime. Type checking is a lot more effective when the 24 // generic parameter is a struct type, using map and interface types is somewhat 25 // similar to using a Writer. 26 // 27 // If the option list may explicitly declare a schema, it must be compatible 28 // with the schema generated from T. 29 func NewGenericReader[T any](input io.ReaderAt, options ...ReaderOption) *GenericReader[T] { 30 c, err := NewReaderConfig(options...) 31 if err != nil { 32 panic(err) 33 } 34 35 f, err := openFile(input) 36 if err != nil { 37 panic(err) 38 } 39 40 rowGroup := fileRowGroupOf(f) 41 42 t := typeOf[T]() 43 if c.Schema == nil { 44 if t == nil { 45 c.Schema = rowGroup.Schema() 46 } else { 47 c.Schema = schemaOf(dereference(t)) 48 } 49 } 50 51 r := &GenericReader[T]{ 52 base: Reader{ 53 file: reader{ 54 schema: c.Schema, 55 rowGroup: rowGroup, 56 }, 57 }, 58 } 59 60 if !nodesAreEqual(c.Schema, f.schema) { 61 r.base.file.rowGroup = convertRowGroupTo(r.base.file.rowGroup, c.Schema) 62 } 63 64 r.base.read.init(r.base.file.schema, r.base.file.rowGroup) 65 r.read = readFuncOf[T](t, r.base.file.schema) 66 return r 67 } 68 69 func NewGenericRowGroupReader[T any](rowGroup RowGroup, options ...ReaderOption) *GenericReader[T] { 70 c, err := NewReaderConfig(options...) 71 if err != nil { 72 panic(err) 73 } 74 75 t := typeOf[T]() 76 if c.Schema == nil { 77 if t == nil { 78 c.Schema = rowGroup.Schema() 79 } else { 80 c.Schema = schemaOf(dereference(t)) 81 } 82 } 83 84 r := &GenericReader[T]{ 85 base: Reader{ 86 file: reader{ 87 schema: c.Schema, 88 rowGroup: rowGroup, 89 }, 90 }, 91 } 92 93 if !nodesAreEqual(c.Schema, rowGroup.Schema()) { 94 r.base.file.rowGroup = convertRowGroupTo(r.base.file.rowGroup, c.Schema) 95 } 96 97 r.base.read.init(r.base.file.schema, r.base.file.rowGroup) 98 r.read = readFuncOf[T](t, r.base.file.schema) 99 return r 100 } 101 102 func (r *GenericReader[T]) Reset() { 103 r.base.Reset() 104 } 105 106 // Read reads the next rows from the reader into the given rows slice up to len(rows). 107 // 108 // The returned values are safe to reuse across Read calls and do not share 109 // memory with the reader's underlying page buffers. 110 // 111 // The method returns the number of rows read and io.EOF when no more rows 112 // can be read from the reader. 113 func (r *GenericReader[T]) Read(rows []T) (int, error) { 114 return r.read(r, rows) 115 } 116 117 func (r *GenericReader[T]) ReadRows(rows []Row) (int, error) { 118 return r.base.ReadRows(rows) 119 } 120 121 func (r *GenericReader[T]) Schema() *Schema { 122 return r.base.Schema() 123 } 124 125 func (r *GenericReader[T]) NumRows() int64 { 126 return r.base.NumRows() 127 } 128 129 func (r *GenericReader[T]) SeekToRow(rowIndex int64) error { 130 return r.base.SeekToRow(rowIndex) 131 } 132 133 func (r *GenericReader[T]) Close() error { 134 return r.base.Close() 135 } 136 137 // readRows reads the next rows from the reader into the given rows slice up to len(rows). 138 // 139 // The returned values are safe to reuse across readRows calls and do not share 140 // memory with the reader's underlying page buffers. 141 // 142 // The method returns the number of rows read and io.EOF when no more rows 143 // can be read from the reader. 144 func (r *GenericReader[T]) readRows(rows []T) (int, error) { 145 nRequest := len(rows) 146 if cap(r.base.rowbuf) < nRequest { 147 r.base.rowbuf = make([]Row, nRequest) 148 } else { 149 r.base.rowbuf = r.base.rowbuf[:nRequest] 150 } 151 152 var n, nTotal int 153 var err error 154 for { 155 // ReadRows reads the minimum remaining rows in a column page across all columns 156 // of the underlying reader, unless the length of the slice passed to it is smaller. 157 // In that case, ReadRows will read the number of rows equal to the length of the 158 // given slice argument. We limit that length to never be more than requested 159 // because sequential reads can cross page boundaries. 160 n, err = r.base.ReadRows(r.base.rowbuf[:nRequest-nTotal]) 161 if n > 0 { 162 schema := r.base.Schema() 163 164 for i, row := range r.base.rowbuf[:n] { 165 if err2 := schema.Reconstruct(&rows[nTotal+i], row); err2 != nil { 166 return nTotal + i, err2 167 } 168 } 169 } 170 nTotal += n 171 if n == 0 || nTotal == nRequest || err != nil { 172 break 173 } 174 } 175 176 return nTotal, err 177 } 178 179 var ( 180 _ Rows = (*GenericReader[any])(nil) 181 _ RowReaderWithSchema = (*Reader)(nil) 182 183 _ Rows = (*GenericReader[struct{}])(nil) 184 _ RowReaderWithSchema = (*GenericReader[struct{}])(nil) 185 186 _ Rows = (*GenericReader[map[struct{}]struct{}])(nil) 187 _ RowReaderWithSchema = (*GenericReader[map[struct{}]struct{}])(nil) 188 ) 189 190 type readFunc[T any] func(*GenericReader[T], []T) (int, error) 191 192 func readFuncOf[T any](t reflect.Type, schema *Schema) readFunc[T] { 193 if t == nil { 194 return (*GenericReader[T]).readRows 195 } 196 switch t.Kind() { 197 case reflect.Interface, reflect.Map: 198 return (*GenericReader[T]).readRows 199 200 case reflect.Struct: 201 return (*GenericReader[T]).readRows 202 203 case reflect.Pointer: 204 if e := t.Elem(); e.Kind() == reflect.Struct { 205 return (*GenericReader[T]).readRows 206 } 207 } 208 panic("cannot create reader for values of type " + t.String()) 209 } 210 211 // Deprecated: A Reader reads Go values from parquet files. 212 // 213 // This example showcases a typical use of parquet readers: 214 // 215 // reader := parquet.NewReader(file) 216 // rows := []RowType{} 217 // for { 218 // row := RowType{} 219 // err := reader.Read(&row) 220 // if err != nil { 221 // if err == io.EOF { 222 // break 223 // } 224 // ... 225 // } 226 // rows = append(rows, row) 227 // } 228 // if err := reader.Close(); err != nil { 229 // ... 230 // } 231 // 232 // For programs building with Go 1.18 or later, the GenericReader[T] type 233 // supersedes this one. 234 type Reader struct { 235 seen reflect.Type 236 file reader 237 read reader 238 rowIndex int64 239 rowbuf []Row 240 } 241 242 // NewReader constructs a parquet reader reading rows from the given 243 // io.ReaderAt. 244 // 245 // In order to read parquet rows, the io.ReaderAt must be converted to a 246 // parquet.File. If r is already a parquet.File it is used directly; otherwise, 247 // the io.ReaderAt value is expected to either have a `Size() int64` method or 248 // implement io.Seeker in order to determine its size. 249 // 250 // The function panics if the reader configuration is invalid. Programs that 251 // cannot guarantee the validity of the options passed to NewReader should 252 // construct the reader configuration independently prior to calling this 253 // function: 254 // 255 // config, err := parquet.NewReaderConfig(options...) 256 // if err != nil { 257 // // handle the configuration error 258 // ... 259 // } else { 260 // // this call to create a reader is guaranteed not to panic 261 // reader := parquet.NewReader(input, config) 262 // ... 263 // } 264 func NewReader(input io.ReaderAt, options ...ReaderOption) *Reader { 265 c, err := NewReaderConfig(options...) 266 if err != nil { 267 panic(err) 268 } 269 270 f, err := openFile(input) 271 if err != nil { 272 panic(err) 273 } 274 275 r := &Reader{ 276 file: reader{ 277 schema: f.schema, 278 rowGroup: fileRowGroupOf(f), 279 }, 280 } 281 282 if c.Schema != nil { 283 r.file.schema = c.Schema 284 r.file.rowGroup = convertRowGroupTo(r.file.rowGroup, c.Schema) 285 } 286 287 r.read.init(r.file.schema, r.file.rowGroup) 288 return r 289 } 290 291 func openFile(input io.ReaderAt) (*File, error) { 292 f, _ := input.(*File) 293 if f != nil { 294 return f, nil 295 } 296 n, err := sizeOf(input) 297 if err != nil { 298 return nil, err 299 } 300 return OpenFile(input, n) 301 } 302 303 func fileRowGroupOf(f *File) RowGroup { 304 switch rowGroups := f.RowGroups(); len(rowGroups) { 305 case 0: 306 return newEmptyRowGroup(f.Schema()) 307 case 1: 308 return rowGroups[0] 309 default: 310 // TODO: should we attempt to merge the row groups via MergeRowGroups 311 // to preserve the global order of sorting columns within the file? 312 return newMultiRowGroup(f.config.ReadMode, rowGroups...) 313 } 314 } 315 316 // NewRowGroupReader constructs a new Reader which reads rows from the RowGroup 317 // passed as argument. 318 func NewRowGroupReader(rowGroup RowGroup, options ...ReaderOption) *Reader { 319 c, err := NewReaderConfig(options...) 320 if err != nil { 321 panic(err) 322 } 323 324 if c.Schema != nil { 325 rowGroup = convertRowGroupTo(rowGroup, c.Schema) 326 } 327 328 r := &Reader{ 329 file: reader{ 330 schema: rowGroup.Schema(), 331 rowGroup: rowGroup, 332 }, 333 } 334 335 r.read.init(r.file.schema, r.file.rowGroup) 336 return r 337 } 338 339 func convertRowGroupTo(rowGroup RowGroup, schema *Schema) RowGroup { 340 if rowGroupSchema := rowGroup.Schema(); !nodesAreEqual(schema, rowGroupSchema) { 341 conv, err := Convert(schema, rowGroupSchema) 342 if err != nil { 343 // TODO: this looks like something we should not be panicking on, 344 // but the current NewReader API does not offer a mechanism to 345 // report errors. 346 panic(err) 347 } 348 rowGroup = ConvertRowGroup(rowGroup, conv) 349 } 350 return rowGroup 351 } 352 353 func sizeOf(r io.ReaderAt) (int64, error) { 354 switch f := r.(type) { 355 case interface{ Size() int64 }: 356 return f.Size(), nil 357 case io.Seeker: 358 off, err := f.Seek(0, io.SeekCurrent) 359 if err != nil { 360 return 0, err 361 } 362 end, err := f.Seek(0, io.SeekEnd) 363 if err != nil { 364 return 0, err 365 } 366 _, err = f.Seek(off, io.SeekStart) 367 return end, err 368 default: 369 return 0, fmt.Errorf("cannot determine length of %T", r) 370 } 371 } 372 373 // Reset repositions the reader at the beginning of the underlying parquet file. 374 func (r *Reader) Reset() { 375 r.file.Reset() 376 r.read.Reset() 377 r.rowIndex = 0 378 clearRows(r.rowbuf) 379 } 380 381 // Read reads the next row from r. The type of the row must match the schema 382 // of the underlying parquet file or an error will be returned. 383 // 384 // The method returns io.EOF when no more rows can be read from r. 385 func (r *Reader) Read(row interface{}) error { 386 if rowType := dereference(reflect.TypeOf(row)); rowType.Kind() == reflect.Struct { 387 if r.seen != rowType { 388 if err := r.updateReadSchema(rowType); err != nil { 389 return fmt.Errorf("cannot read parquet row into go value of type %T: %w", row, err) 390 } 391 } 392 } 393 394 if err := r.read.SeekToRow(r.rowIndex); err != nil { 395 if errors.Is(err, io.ErrClosedPipe) { 396 return io.EOF 397 } 398 return fmt.Errorf("seeking reader to row %d: %w", r.rowIndex, err) 399 } 400 401 if cap(r.rowbuf) == 0 { 402 r.rowbuf = make([]Row, 1) 403 } else { 404 r.rowbuf = r.rowbuf[:1] 405 } 406 407 n, err := r.read.ReadRows(r.rowbuf[:]) 408 if n == 0 { 409 return err 410 } 411 412 r.rowIndex++ 413 return r.read.schema.Reconstruct(row, r.rowbuf[0]) 414 } 415 416 func (r *Reader) updateReadSchema(rowType reflect.Type) error { 417 schema := schemaOf(rowType) 418 419 if nodesAreEqual(schema, r.file.schema) { 420 r.read.init(schema, r.file.rowGroup) 421 } else { 422 conv, err := Convert(schema, r.file.schema) 423 if err != nil { 424 return err 425 } 426 r.read.init(schema, ConvertRowGroup(r.file.rowGroup, conv)) 427 } 428 429 r.seen = rowType 430 return nil 431 } 432 433 // ReadRows reads the next rows from r into the given Row buffer. 434 // 435 // The returned values are laid out in the order expected by the 436 // parquet.(*Schema).Reconstruct method. 437 // 438 // The method returns io.EOF when no more rows can be read from r. 439 func (r *Reader) ReadRows(rows []Row) (int, error) { 440 if err := r.file.SeekToRow(r.rowIndex); err != nil { 441 return 0, err 442 } 443 n, err := r.file.ReadRows(rows) 444 r.rowIndex += int64(n) 445 return n, err 446 } 447 448 // Schema returns the schema of rows read by r. 449 func (r *Reader) Schema() *Schema { return r.file.schema } 450 451 // NumRows returns the number of rows that can be read from r. 452 func (r *Reader) NumRows() int64 { return r.file.rowGroup.NumRows() } 453 454 // SeekToRow positions r at the given row index. 455 func (r *Reader) SeekToRow(rowIndex int64) error { 456 if err := r.file.SeekToRow(rowIndex); err != nil { 457 return err 458 } 459 r.rowIndex = rowIndex 460 return nil 461 } 462 463 // Close closes the reader, preventing more rows from being read. 464 func (r *Reader) Close() error { 465 if err := r.read.Close(); err != nil { 466 return err 467 } 468 if err := r.file.Close(); err != nil { 469 return err 470 } 471 return nil 472 } 473 474 // reader is a subtype used in the implementation of Reader to support the two 475 // use cases of either reading rows calling the ReadRow method (where full rows 476 // are read from the underlying parquet file), or calling the Read method to 477 // read rows into Go values, potentially doing partial reads on a subset of the 478 // columns due to using a converted row group view. 479 type reader struct { 480 schema *Schema 481 rowGroup RowGroup 482 rows Rows 483 rowIndex int64 484 } 485 486 func (r *reader) init(schema *Schema, rowGroup RowGroup) { 487 r.schema = schema 488 r.rowGroup = rowGroup 489 r.Reset() 490 } 491 492 func (r *reader) Reset() { 493 r.rowIndex = 0 494 495 if rows, ok := r.rows.(interface{ Reset() }); ok { 496 // This optimization works for the common case where the underlying type 497 // of the Rows instance is rowGroupRows, which should be true in most 498 // cases since even external implementations of the RowGroup interface 499 // can construct values of this type via the NewRowGroupRowReader 500 // function. 501 // 502 // Foreign implementations of the Rows interface may also define a Reset 503 // method in order to participate in this optimization. 504 rows.Reset() 505 return 506 } 507 508 if r.rows != nil { 509 r.rows.Close() 510 r.rows = nil 511 } 512 } 513 514 func (r *reader) ReadRows(rows []Row) (int, error) { 515 if r.rowGroup == nil { 516 return 0, io.EOF 517 } 518 if r.rows == nil { 519 r.rows = r.rowGroup.Rows() 520 if r.rowIndex > 0 { 521 if err := r.rows.SeekToRow(r.rowIndex); err != nil { 522 return 0, err 523 } 524 } 525 } 526 n, err := r.rows.ReadRows(rows) 527 r.rowIndex += int64(n) 528 return n, err 529 } 530 531 func (r *reader) SeekToRow(rowIndex int64) error { 532 if r.rowGroup == nil { 533 return io.ErrClosedPipe 534 } 535 if rowIndex != r.rowIndex { 536 if r.rows != nil { 537 if err := r.rows.SeekToRow(rowIndex); err != nil { 538 return err 539 } 540 } 541 r.rowIndex = rowIndex 542 } 543 return nil 544 } 545 546 func (r *reader) Close() (err error) { 547 r.rowGroup = nil 548 if r.rows != nil { 549 err = r.rows.Close() 550 } 551 return err 552 } 553 554 var ( 555 _ Rows = (*Reader)(nil) 556 _ RowReaderWithSchema = (*Reader)(nil) 557 558 _ RowReader = (*reader)(nil) 559 _ RowSeeker = (*reader)(nil) 560 )