github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/reader.go (about) 1 package parquet 2 3 import ( 4 "errors" 5 "fmt" 6 "io" 7 "reflect" 8 ) 9 10 // Deprecated: A Reader reads Go values from parquet files. 11 // 12 // This example showcases a typical use of parquet readers: 13 // 14 // reader := parquet.NewReader(file) 15 // rows := []RowType{} 16 // for { 17 // row := RowType{} 18 // err := reader.Read(&row) 19 // if err != nil { 20 // if err == io.EOF { 21 // break 22 // } 23 // ... 24 // } 25 // rows = append(rows, row) 26 // } 27 // if err := reader.Close(); err != nil { 28 // ... 29 // } 30 // 31 // For programs building with Go 1.18 or later, the GenericReader[T] type 32 // supersedes this one. 33 type Reader struct { 34 seen reflect.Type 35 file reader 36 read reader 37 rowIndex int64 38 rowbuf []Row 39 } 40 41 // NewReader constructs a parquet reader reading rows from the given 42 // io.ReaderAt. 43 // 44 // In order to read parquet rows, the io.ReaderAt must be converted to a 45 // parquet.File. If r is already a parquet.File it is used directly; otherwise, 46 // the io.ReaderAt value is expected to either have a `Size() int64` method or 47 // implement io.Seeker in order to determine its size. 48 // 49 // The function panics if the reader configuration is invalid. Programs that 50 // cannot guarantee the validity of the options passed to NewReader should 51 // construct the reader configuration independently prior to calling this 52 // function: 53 // 54 // config, err := parquet.NewReaderConfig(options...) 55 // if err != nil { 56 // // handle the configuration error 57 // ... 58 // } else { 59 // // this call to create a reader is guaranteed not to panic 60 // reader := parquet.NewReader(input, config) 61 // ... 62 // } 63 func NewReader(input io.ReaderAt, options ...ReaderOption) *Reader { 64 c, err := NewReaderConfig(options...) 65 if err != nil { 66 panic(err) 67 } 68 69 f, err := openFile(input) 70 if err != nil { 71 panic(err) 72 } 73 74 r := &Reader{ 75 file: reader{ 76 schema: f.schema, 77 rowGroup: fileRowGroupOf(f), 78 }, 79 } 80 81 if c.Schema != nil { 82 r.file.schema = c.Schema 83 r.file.rowGroup = convertRowGroupTo(r.file.rowGroup, c.Schema) 84 } 85 86 r.read.init(r.file.schema, r.file.rowGroup) 87 return r 88 } 89 90 func openFile(input io.ReaderAt) (*File, error) { 91 f, _ := input.(*File) 92 if f != nil { 93 return f, nil 94 } 95 n, err := sizeOf(input) 96 if err != nil { 97 return nil, err 98 } 99 return OpenFile(input, n) 100 } 101 102 func fileRowGroupOf(f *File) RowGroup { 103 switch rowGroups := f.RowGroups(); len(rowGroups) { 104 case 0: 105 return newEmptyRowGroup(f.Schema()) 106 case 1: 107 return rowGroups[0] 108 default: 109 // TODO: should we attempt to merge the row groups via MergeRowGroups 110 // to preserve the global order of sorting columns within the file? 111 return newMultiRowGroup(f.config.ReadMode, rowGroups...) 112 } 113 } 114 115 // NewRowGroupReader constructs a new Reader which reads rows from the RowGroup 116 // passed as argument. 117 func NewRowGroupReader(rowGroup RowGroup, options ...ReaderOption) *Reader { 118 c, err := NewReaderConfig(options...) 119 if err != nil { 120 panic(err) 121 } 122 123 if c.Schema != nil { 124 rowGroup = convertRowGroupTo(rowGroup, c.Schema) 125 } 126 127 r := &Reader{ 128 file: reader{ 129 schema: rowGroup.Schema(), 130 rowGroup: rowGroup, 131 }, 132 } 133 134 r.read.init(r.file.schema, r.file.rowGroup) 135 return r 136 } 137 138 func convertRowGroupTo(rowGroup RowGroup, schema *Schema) RowGroup { 139 if rowGroupSchema := rowGroup.Schema(); !nodesAreEqual(schema, rowGroupSchema) { 140 conv, err := Convert(schema, rowGroupSchema) 141 if err != nil { 142 // TODO: this looks like something we should not be panicking on, 143 // but the current NewReader API does not offer a mechanism to 144 // report errors. 145 panic(err) 146 } 147 rowGroup = ConvertRowGroup(rowGroup, conv) 148 } 149 return rowGroup 150 } 151 152 func sizeOf(r io.ReaderAt) (int64, error) { 153 switch f := r.(type) { 154 case interface{ Size() int64 }: 155 return f.Size(), nil 156 case io.Seeker: 157 off, err := f.Seek(0, io.SeekCurrent) 158 if err != nil { 159 return 0, err 160 } 161 end, err := f.Seek(0, io.SeekEnd) 162 if err != nil { 163 return 0, err 164 } 165 _, err = f.Seek(off, io.SeekStart) 166 return end, err 167 default: 168 return 0, fmt.Errorf("cannot determine length of %T", r) 169 } 170 } 171 172 // Reset repositions the reader at the beginning of the underlying parquet file. 173 func (r *Reader) Reset() { 174 r.file.Reset() 175 r.read.Reset() 176 r.rowIndex = 0 177 clearRows(r.rowbuf) 178 } 179 180 // Read reads the next row from r. The type of the row must match the schema 181 // of the underlying parquet file or an error will be returned. 182 // 183 // The method returns io.EOF when no more rows can be read from r. 184 func (r *Reader) Read(row interface{}) error { 185 if rowType := dereference(reflect.TypeOf(row)); rowType.Kind() == reflect.Struct { 186 if r.seen != rowType { 187 if err := r.updateReadSchema(rowType); err != nil { 188 return fmt.Errorf("cannot read parquet row into go value of type %T: %w", row, err) 189 } 190 } 191 } 192 193 if err := r.read.SeekToRow(r.rowIndex); err != nil { 194 if errors.Is(err, io.ErrClosedPipe) { 195 return io.EOF 196 } 197 return fmt.Errorf("seeking reader to row %d: %w", r.rowIndex, err) 198 } 199 200 if cap(r.rowbuf) == 0 { 201 r.rowbuf = make([]Row, 1) 202 } else { 203 r.rowbuf = r.rowbuf[:1] 204 } 205 206 n, err := r.read.ReadRows(r.rowbuf[:]) 207 if n == 0 { 208 return err 209 } 210 211 r.rowIndex++ 212 return r.read.schema.Reconstruct(row, r.rowbuf[0]) 213 } 214 215 func (r *Reader) updateReadSchema(rowType reflect.Type) error { 216 schema := schemaOf(rowType) 217 218 if nodesAreEqual(schema, r.file.schema) { 219 r.read.init(schema, r.file.rowGroup) 220 } else { 221 conv, err := Convert(schema, r.file.schema) 222 if err != nil { 223 return err 224 } 225 r.read.init(schema, ConvertRowGroup(r.file.rowGroup, conv)) 226 } 227 228 r.seen = rowType 229 return nil 230 } 231 232 // ReadRows reads the next rows from r into the given Row buffer. 233 // 234 // The returned values are laid out in the order expected by the 235 // parquet.(*Schema).Reconstruct method. 236 // 237 // The method returns io.EOF when no more rows can be read from r. 238 func (r *Reader) ReadRows(rows []Row) (int, error) { 239 if err := r.file.SeekToRow(r.rowIndex); err != nil { 240 return 0, err 241 } 242 n, err := r.file.ReadRows(rows) 243 r.rowIndex += int64(n) 244 return n, err 245 } 246 247 // Schema returns the schema of rows read by r. 248 func (r *Reader) Schema() *Schema { return r.file.schema } 249 250 // NumRows returns the number of rows that can be read from r. 251 func (r *Reader) NumRows() int64 { return r.file.rowGroup.NumRows() } 252 253 // SeekToRow positions r at the given row index. 254 func (r *Reader) SeekToRow(rowIndex int64) error { 255 if err := r.file.SeekToRow(rowIndex); err != nil { 256 return err 257 } 258 r.rowIndex = rowIndex 259 return nil 260 } 261 262 // Close closes the reader, preventing more rows from being read. 263 func (r *Reader) Close() error { 264 if err := r.read.Close(); err != nil { 265 return err 266 } 267 if err := r.file.Close(); err != nil { 268 return err 269 } 270 return nil 271 } 272 273 // reader is a subtype used in the implementation of Reader to support the two 274 // use cases of either reading rows calling the ReadRow method (where full rows 275 // are read from the underlying parquet file), or calling the Read method to 276 // read rows into Go values, potentially doing partial reads on a subset of the 277 // columns due to using a converted row group view. 278 type reader struct { 279 schema *Schema 280 rowGroup RowGroup 281 rows Rows 282 rowIndex int64 283 } 284 285 func (r *reader) init(schema *Schema, rowGroup RowGroup) { 286 r.schema = schema 287 r.rowGroup = rowGroup 288 r.Reset() 289 } 290 291 func (r *reader) Reset() { 292 r.rowIndex = 0 293 294 if rows, ok := r.rows.(interface{ Reset() }); ok { 295 // This optimization works for the common case where the underlying type 296 // of the Rows instance is rowGroupRows, which should be true in most 297 // cases since even external implementations of the RowGroup interface 298 // can construct values of this type via the NewRowGroupRowReader 299 // function. 300 // 301 // Foreign implementations of the Rows interface may also define a Reset 302 // method in order to participate in this optimization. 303 rows.Reset() 304 return 305 } 306 307 if r.rows != nil { 308 r.rows.Close() 309 r.rows = nil 310 } 311 } 312 313 func (r *reader) ReadRows(rows []Row) (int, error) { 314 if r.rowGroup == nil { 315 return 0, io.EOF 316 } 317 if r.rows == nil { 318 r.rows = r.rowGroup.Rows() 319 if r.rowIndex > 0 { 320 if err := r.rows.SeekToRow(r.rowIndex); err != nil { 321 return 0, err 322 } 323 } 324 } 325 n, err := r.rows.ReadRows(rows) 326 r.rowIndex += int64(n) 327 return n, err 328 } 329 330 func (r *reader) SeekToRow(rowIndex int64) error { 331 if r.rowGroup == nil { 332 return io.ErrClosedPipe 333 } 334 if rowIndex != r.rowIndex { 335 if r.rows != nil { 336 if err := r.rows.SeekToRow(rowIndex); err != nil { 337 return err 338 } 339 } 340 r.rowIndex = rowIndex 341 } 342 return nil 343 } 344 345 func (r *reader) Close() (err error) { 346 r.rowGroup = nil 347 if r.rows != nil { 348 err = r.rows.Close() 349 } 350 return err 351 } 352 353 var ( 354 _ Rows = (*Reader)(nil) 355 _ RowReaderWithSchema = (*Reader)(nil) 356 357 _ RowReader = (*reader)(nil) 358 _ RowSeeker = (*reader)(nil) 359 )