github.com/fraugster/parquet-go@v0.12.0/file_reader.go (about) 1 package goparquet 2 3 import ( 4 "context" 5 "fmt" 6 "io" 7 "runtime" 8 9 "github.com/fraugster/parquet-go/parquet" 10 "github.com/fraugster/parquet-go/parquetschema" 11 ) 12 13 // FileReader is used to read data from a parquet file. Always use NewFileReader or a related 14 // function to create such an object. 15 type FileReader struct { 16 meta *parquet.FileMetaData 17 schemaReader *schema 18 reader io.ReadSeeker 19 20 rowGroupPosition int 21 currentRecord int64 22 skipRowGroup bool 23 24 ctx context.Context 25 26 allocTracker *allocTracker 27 } 28 29 // NewFileReaderWithOptions creates a new FileReader. You can provide a list of FileReaderOptions to configure 30 // aspects of its behaviour, such as limiting the columns to read, the file metadata to use, or the 31 // context to use. For a full list of options, please see the type FileReaderOption. 32 func NewFileReaderWithOptions(r io.ReadSeeker, readerOptions ...FileReaderOption) (*FileReader, error) { 33 opts := newFileReaderOptions() 34 if err := opts.apply(readerOptions); err != nil { 35 return nil, err 36 } 37 38 var err error 39 if opts.metaData == nil { 40 opts.metaData, err = ReadFileMetaData(r, true) 41 if err != nil { 42 return nil, fmt.Errorf("reading file meta data failed: %w", err) 43 } 44 } 45 46 schema, err := makeSchema(opts.metaData, opts.validateCRC, opts.allocTracker) 47 if err != nil { 48 return nil, fmt.Errorf("creating schema failed: %w", err) 49 } 50 51 schema.SetSelectedColumns(opts.columns...) 52 // Reset the reader to the beginning of the file 53 if _, err := r.Seek(4, io.SeekStart); err != nil { 54 return nil, err 55 } 56 return &FileReader{ 57 meta: opts.metaData, 58 schemaReader: schema, 59 reader: r, 60 ctx: opts.ctx, 61 allocTracker: opts.allocTracker, 62 }, nil 63 } 64 65 // FileReaderOption is an option that can be passed on to NewFileReaderWithOptions when 66 // creating a new parquet file reader. 67 type FileReaderOption func(*fileReaderOptions) error 68 type fileReaderOptions struct { 69 metaData *parquet.FileMetaData 70 ctx context.Context 71 columns []ColumnPath 72 validateCRC bool 73 allocTracker *allocTracker 74 } 75 76 func newFileReaderOptions() *fileReaderOptions { 77 return &fileReaderOptions{ctx: context.Background()} 78 } 79 80 func (o *fileReaderOptions) apply(opts []FileReaderOption) error { 81 for _, f := range opts { 82 if err := f(o); err != nil { 83 return err 84 } 85 } 86 return nil 87 } 88 89 // WithReaderContext configures a custom context for the file reader. If none 90 // is provided, context.Background() is used as a default. 91 func WithReaderContext(ctx context.Context) FileReaderOption { 92 return func(opts *fileReaderOptions) error { 93 opts.ctx = ctx 94 return nil 95 } 96 } 97 98 // WithFileMetaData allows you to provide your own file metadata. If none 99 // is set with this option, the file reader will read it from the parquet 100 // file. 101 func WithFileMetaData(metaData *parquet.FileMetaData) FileReaderOption { 102 return func(opts *fileReaderOptions) error { 103 opts.metaData = metaData 104 return nil 105 } 106 } 107 108 // WithColumns limits the columns which are read. If none are set, then 109 // all columns will be read by the parquet file reader. 110 // 111 // Deprecated: use WithColumnPaths instead. 112 func WithColumns(columns ...string) FileReaderOption { 113 return func(opts *fileReaderOptions) error { 114 parsedCols := []ColumnPath{} 115 for _, c := range columns { 116 parsedCols = append(parsedCols, parseColumnPath(c)) 117 } 118 opts.columns = parsedCols 119 return nil 120 } 121 } 122 123 // WithColumnPaths limits the columns which are read. If none are set, then 124 // all columns will be read by the parquet file reader. 125 func WithColumnPaths(columns ...ColumnPath) FileReaderOption { 126 return func(opts *fileReaderOptions) error { 127 opts.columns = columns 128 return nil 129 } 130 } 131 132 // WithCRC32Validation allows you to configure whether CRC32 page checksums will 133 // be validated when they're read. By default, checksum validation is disabled. 134 func WithCRC32Validation(enable bool) FileReaderOption { 135 return func(opts *fileReaderOptions) error { 136 opts.validateCRC = enable 137 return nil 138 } 139 } 140 141 // WithMaximumMemorySize allows you to configure a maximum limit in terms of memory 142 // that shall be allocated when reading this file. If the amount of memory gets over 143 // this limit, further function calls will fail. 144 func WithMaximumMemorySize(maxSizeBytes uint64) FileReaderOption { 145 return func(opts *fileReaderOptions) error { 146 opts.allocTracker = newAllocTracker(maxSizeBytes) 147 return nil 148 } 149 } 150 151 // NewFileReader creates a new FileReader. You can limit the columns that are read by providing 152 // the names of the specific columns to read using dotted notation. If no columns are provided, 153 // then all columns are read. 154 func NewFileReader(r io.ReadSeeker, columns ...string) (*FileReader, error) { 155 return NewFileReaderWithOptions(r, WithColumns(columns...)) 156 } 157 158 // NewFileReaderWithContext creates a new FileReader. You can limit the columns that are read by providing 159 // the names of the specific columns to read using dotted notation. If no columns are provided, 160 // then all columns are read. The provided context.Context overrides the default context (which is a context.Background()) 161 // for use in other methods of the *FileReader type. 162 // 163 // Deprecated: use the function NewFileReaderWithOptions and the option WithContext instead. 164 func NewFileReaderWithContext(ctx context.Context, r io.ReadSeeker, columns ...string) (*FileReader, error) { 165 return NewFileReaderWithOptions(r, WithReaderContext(ctx), WithColumns(columns...)) 166 } 167 168 // NewFileReaderWithMetaData creates a new FileReader with custom file meta data. You can limit the columns that 169 // are read by providing the names of the specific columns to read using dotted notation. If no columns are provided, 170 // then all columns are read. 171 // 172 // Deprecated: use the function NewFileReaderWithOptions and the option WithFileMetaData instead. 173 func NewFileReaderWithMetaData(r io.ReadSeeker, meta *parquet.FileMetaData, columns ...string) (*FileReader, error) { 174 return NewFileReaderWithOptions(r, WithFileMetaData(meta), WithColumns(columns...)) 175 } 176 177 func (*FileReader) recover(errp *error) { 178 if e := recover(); e != nil { 179 if _, ok := e.(runtime.Error); ok { 180 panic(e) 181 } 182 *errp = e.(error) 183 } 184 } 185 186 // SeekToRowGroup seeks to a particular row group, identified by its index. 187 func (f *FileReader) SeekToRowGroup(rowGroupPosition int) (err error) { 188 defer f.recover(&err) 189 return f.SeekToRowGroupWithContext(f.ctx, rowGroupPosition) 190 } 191 192 // SeekToRowGroupWithContext seeks to a particular row group, identified by its index. 193 func (f *FileReader) SeekToRowGroupWithContext(ctx context.Context, rowGroupPosition int) (err error) { 194 defer f.recover(&err) 195 f.rowGroupPosition = rowGroupPosition - 1 196 f.currentRecord = 0 197 return f.readRowGroup(ctx) 198 } 199 200 // readRowGroup read the next row group into memory 201 func (f *FileReader) readRowGroup(ctx context.Context) error { 202 if len(f.meta.RowGroups) <= f.rowGroupPosition { 203 return io.EOF 204 } 205 f.rowGroupPosition++ 206 return f.readRowGroupData(ctx) //, f.reader, f.schemaReader, f.meta.RowGroups[f.rowGroupPosition-1]) 207 } 208 209 // CurrentRowGroup returns information about the current row group. 210 func (f *FileReader) CurrentRowGroup() *parquet.RowGroup { 211 if f == nil || f.meta == nil || f.meta.RowGroups == nil || f.rowGroupPosition-1 >= len(f.meta.RowGroups) { 212 return nil 213 } 214 return f.meta.RowGroups[f.rowGroupPosition-1] 215 } 216 217 // RowGroupCount returns the number of row groups in the parquet file. 218 func (f *FileReader) RowGroupCount() int { 219 return len(f.meta.RowGroups) 220 } 221 222 // NumRows returns the number of rows in the parquet file. This information is directly taken from 223 // the file's meta data. 224 func (f *FileReader) NumRows() int64 { 225 return f.meta.NumRows 226 } 227 228 func (f *FileReader) advanceIfNeeded(ctx context.Context) error { 229 if f.rowGroupPosition == 0 || f.currentRecord >= f.schemaReader.rowGroupNumRecords() || f.skipRowGroup { 230 if err := f.readRowGroup(ctx); err != nil { 231 f.skipRowGroup = true 232 return err 233 } 234 f.currentRecord = 0 235 f.skipRowGroup = false 236 } 237 238 return nil 239 } 240 241 // RowGroupNumRows returns the number of rows in the current RowGroup. 242 func (f *FileReader) RowGroupNumRows() (int64, error) { 243 return f.RowGroupNumRowsWithContext(f.ctx) 244 } 245 246 // RowGroupNumRowsWithContext returns the number of rows in the current RowGroup. 247 func (f *FileReader) RowGroupNumRowsWithContext(ctx context.Context) (numRecords int64, err error) { 248 defer f.recover(&err) 249 250 if err := f.advanceIfNeeded(ctx); err != nil { 251 return 0, err 252 } 253 254 return f.schemaReader.rowGroupNumRecords(), nil 255 } 256 257 // NextRow reads the next row from the parquet file. If required, it will load the next row group. 258 func (f *FileReader) NextRow() (map[string]interface{}, error) { 259 return f.NextRowWithContext(f.ctx) 260 } 261 262 // NextRowWithContext reads the next row from the parquet file. If required, it will load the next row group. 263 func (f *FileReader) NextRowWithContext(ctx context.Context) (row map[string]interface{}, err error) { 264 defer f.recover(&err) 265 266 if err := f.advanceIfNeeded(ctx); err != nil { 267 return nil, err 268 } 269 270 f.currentRecord++ 271 return f.schemaReader.getData() 272 } 273 274 // SkipRowGroup skips the currently loaded row group and advances to the next row group. 275 func (f *FileReader) SkipRowGroup() { 276 f.skipRowGroup = true 277 } 278 279 // PreLoad is used to load the row group if required. It does nothing if the row group is already loaded. 280 func (f *FileReader) PreLoad() error { 281 return f.PreLoadWithContext(f.ctx) 282 } 283 284 // PreLoadWithContext is used to load the row group if required. It does nothing if the row group is already loaded. 285 func (f *FileReader) PreLoadWithContext(ctx context.Context) (err error) { 286 defer f.recover(&err) 287 return f.advanceIfNeeded(ctx) 288 } 289 290 // MetaData returns a map of metadata key-value pairs stored in the parquet file. 291 func (f *FileReader) MetaData() map[string]string { 292 return keyValueMetaDataToMap(f.meta.KeyValueMetadata) 293 } 294 295 // ColumnMetaData returns a map of metadata key-value pairs for the provided column in the current 296 // row group. The column name has to be provided in its dotted notation. 297 // 298 // Deprecated: use ColumnMetaDataPath instead. 299 func (f *FileReader) ColumnMetaData(colName string) (map[string]string, error) { 300 return f.ColumnMetaDataByPath(parseColumnPath(colName)) 301 } 302 303 // ColumnMetaData returns a map of metadata key-value pairs for the provided column in the current 304 // row group. The column is provided as ColumnPath. 305 func (f *FileReader) ColumnMetaDataByPath(path ColumnPath) (metaData map[string]string, err error) { 306 defer f.recover(&err) 307 for _, col := range f.CurrentRowGroup().Columns { 308 if path.Equal(ColumnPath(col.MetaData.PathInSchema)) { 309 return keyValueMetaDataToMap(col.MetaData.KeyValueMetadata), nil 310 } 311 } 312 return nil, fmt.Errorf("column %q not found", path.flatName()) 313 } 314 315 // SetSelectedColumns sets the columns which are read. By default, all columns 316 // will be read. 317 // 318 // Deprecated: use SetSelectedColumnsByPath instead. 319 func (f *FileReader) SetSelectedColumns(cols ...string) { 320 parsedCols := []ColumnPath{} 321 for _, c := range cols { 322 parsedCols = append(parsedCols, parseColumnPath(c)) 323 } 324 f.schemaReader.SetSelectedColumns(parsedCols...) 325 } 326 327 func (f *FileReader) SetSelectedColumnsByPath(cols ...ColumnPath) { 328 f.schemaReader.SetSelectedColumns(cols...) 329 } 330 331 // Columns returns the list of columns. 332 func (f *FileReader) Columns() []*Column { 333 return f.schemaReader.Columns() 334 } 335 336 // GetColumnByName returns a column identified by name. If the column doesn't exist, 337 // the method returns nil. 338 func (f *FileReader) GetColumnByName(name string) *Column { 339 return f.schemaReader.GetColumnByName(name) 340 } 341 342 // GetColumnByPath returns a column identified by its path. If the column doesn't exist, 343 // nil is returned. 344 func (f *FileReader) GetColumnByPath(path ColumnPath) *Column { 345 return f.schemaReader.GetColumnByPath(path) 346 } 347 348 // GetSchemaDefinition returns the current schema definition. 349 func (f *FileReader) GetSchemaDefinition() *parquetschema.SchemaDefinition { 350 return f.schemaReader.GetSchemaDefinition() 351 } 352 353 func keyValueMetaDataToMap(kvMetaData []*parquet.KeyValue) map[string]string { 354 data := make(map[string]string) 355 for _, kv := range kvMetaData { 356 if kv.Value != nil { 357 data[kv.Key] = *kv.Value 358 } 359 } 360 return data 361 }