github.com/apache/arrow/go/v7@v7.0.1/parquet/file/column_reader.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package file 18 19 import ( 20 "github.com/apache/arrow/go/v7/arrow/memory" 21 "github.com/apache/arrow/go/v7/parquet" 22 "github.com/apache/arrow/go/v7/parquet/internal/encoding" 23 "github.com/apache/arrow/go/v7/parquet/internal/encryption" 24 format "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet" 25 "github.com/apache/arrow/go/v7/parquet/internal/utils" 26 "github.com/apache/arrow/go/v7/parquet/schema" 27 "golang.org/x/xerrors" 28 ) 29 30 const ( 31 // 4 MB is the default maximum page header size 32 defaultMaxPageHeaderSize = 4 * 1024 * 1024 33 // 16 KB is the default expected page header size 34 defaultPageHeaderSize = 16 * 1024 35 ) 36 37 //go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata column_reader_types.gen.go.tmpl 38 39 func isDictIndexEncoding(e format.Encoding) bool { 40 return e == format.Encoding_RLE_DICTIONARY || e == format.Encoding_PLAIN_DICTIONARY 41 } 42 43 // CryptoContext is a context for keeping track of the current methods for decrypting. 44 // It keeps track of the row group and column numbers along with references to the 45 // decryptor objects. 46 type CryptoContext struct { 47 StartDecryptWithDictionaryPage bool 48 RowGroupOrdinal int16 49 ColumnOrdinal int16 50 MetaDecryptor encryption.Decryptor 51 DataDecryptor encryption.Decryptor 52 } 53 54 // ColumnChunkReader is the basic interface for all column readers. It will use 55 // a page reader to read all the pages in a column chunk from a row group. 56 // 57 // To actually Read out the column data, you need to convert to the properly 58 // typed ColumnChunkReader type such as *BooleanColumnReader etc. 59 // 60 // Some things to clarify when working with column readers: 61 // 62 // "Values" refers to the physical data values in a data page. 63 // 64 // This is separate from the number of "rows" in a column and the total number 65 // of "elements" in a column because null values aren't stored physically in the 66 // data page but are represented via definition levels, so the number of values 67 // in a column can be less than the number of rows. 68 // 69 // The total number of "elements" in a column also differs because of potential 70 // repeated fields, where you can have multiple values in the page which 71 // together make up a single element (such as a list) or depending on the repetition 72 // level and definition level, could represent an entire null list or just a null 73 // element inside of a list. 74 type ColumnChunkReader interface { 75 // HasNext returns whether there is more data to be read in this column 76 // and row group. 77 HasNext() bool 78 // Type returns the underlying physical type of the column 79 Type() parquet.Type 80 // Descriptor returns the column schema container 81 Descriptor() *schema.Column 82 // if HasNext returns false because of an error, this will return the error 83 // it encountered. Otherwise this will be nil if it's just the end of the 84 // column 85 Err() error 86 // Skip buffered values 87 consumeBufferedValues(int64) 88 // number of available buffered values that have not been decoded yet 89 // when this returns 0, you're at the end of a page. 90 numAvailValues() int64 91 // read the definition levels and return the number of definitions, 92 // and the number of values to be read (number of def levels == maxdef level) 93 // it also populates the passed in slice which should be sized appropriately. 94 readDefinitionLevels(levels []int16) (int, int64) 95 // read the repetition levels and return the number of repetition levels read 96 // also populates the passed in slice, which should be sized appropriately. 97 readRepetitionLevels(levels []int16) int 98 // a column is made up of potentially multiple pages across potentially multiple 99 // row groups. A PageReader allows looping through the pages in a single row group. 100 // When moving to another row group for reading, use setPageReader to re-use the 101 // column reader for reading the pages of the new row group. 102 pager() PageReader 103 // set a page reader into the columnreader so it can be reused. 104 // 105 // This will clear any current error in the reader but does not 106 // automatically read the first page of the page reader passed in until 107 // HasNext which will read in the next page. 108 setPageReader(PageReader) 109 } 110 111 type columnChunkReader struct { 112 descr *schema.Column 113 rdr PageReader 114 repetitionDecoder encoding.LevelDecoder 115 definitionDecoder encoding.LevelDecoder 116 117 curPage Page 118 curEncoding format.Encoding 119 curDecoder encoding.TypedDecoder 120 121 // number of currently buffered values in the current page 122 numBuffered int64 123 // the number of values we've decoded so far 124 numDecoded int64 125 mem memory.Allocator 126 127 decoders map[format.Encoding]encoding.TypedDecoder 128 decoderTraits encoding.DecoderTraits 129 130 // is set when an error is encountered 131 err error 132 defLvlBuffer []int16 133 } 134 135 // NewColumnReader returns a column reader for the provided column initialized with the given pagereader that will 136 // provide the pages of data for this column. The type is determined from the column passed in. 137 func NewColumnReader(descr *schema.Column, pageReader PageReader, mem memory.Allocator) ColumnChunkReader { 138 base := columnChunkReader{descr: descr, rdr: pageReader, mem: mem, decoders: make(map[format.Encoding]encoding.TypedDecoder)} 139 switch descr.PhysicalType() { 140 case parquet.Types.FixedLenByteArray: 141 base.decoderTraits = &encoding.FixedLenByteArrayDecoderTraits 142 return &FixedLenByteArrayColumnChunkReader{base} 143 case parquet.Types.Float: 144 base.decoderTraits = &encoding.Float32DecoderTraits 145 return &Float32ColumnChunkReader{base} 146 case parquet.Types.Double: 147 base.decoderTraits = &encoding.Float64DecoderTraits 148 return &Float64ColumnChunkReader{base} 149 case parquet.Types.ByteArray: 150 base.decoderTraits = &encoding.ByteArrayDecoderTraits 151 return &ByteArrayColumnChunkReader{base} 152 case parquet.Types.Int32: 153 base.decoderTraits = &encoding.Int32DecoderTraits 154 return &Int32ColumnChunkReader{base} 155 case parquet.Types.Int64: 156 base.decoderTraits = &encoding.Int64DecoderTraits 157 return &Int64ColumnChunkReader{base} 158 case parquet.Types.Int96: 159 base.decoderTraits = &encoding.Int96DecoderTraits 160 return &Int96ColumnChunkReader{base} 161 case parquet.Types.Boolean: 162 base.decoderTraits = &encoding.BooleanDecoderTraits 163 return &BooleanColumnChunkReader{base} 164 } 165 return nil 166 } 167 168 func (c *columnChunkReader) Err() error { return c.err } 169 func (c *columnChunkReader) Type() parquet.Type { return c.descr.PhysicalType() } 170 func (c *columnChunkReader) Descriptor() *schema.Column { return c.descr } 171 func (c *columnChunkReader) consumeBufferedValues(n int64) { c.numDecoded += n } 172 func (c *columnChunkReader) numAvailValues() int64 { return c.numBuffered - c.numDecoded } 173 func (c *columnChunkReader) pager() PageReader { return c.rdr } 174 func (c *columnChunkReader) setPageReader(rdr PageReader) { 175 c.rdr, c.err = rdr, nil 176 c.decoders = make(map[format.Encoding]encoding.TypedDecoder) 177 c.numBuffered, c.numDecoded = 0, 0 178 } 179 180 func (c *columnChunkReader) getDefLvlBuffer(sz int64) []int16 { 181 if int64(len(c.defLvlBuffer)) < sz { 182 c.defLvlBuffer = make([]int16, sz) 183 return c.defLvlBuffer 184 } 185 186 return c.defLvlBuffer[:sz] 187 } 188 189 // HasNext returns whether there is more data to be read in this column 190 // and row group. 191 func (c *columnChunkReader) HasNext() bool { 192 if c.numBuffered == 0 || c.numDecoded == c.numBuffered { 193 return c.readNewPage() && c.numBuffered != 0 194 } 195 return true 196 } 197 198 func (c *columnChunkReader) configureDict(page *DictionaryPage) error { 199 enc := page.encoding 200 if enc == format.Encoding_PLAIN_DICTIONARY || enc == format.Encoding_PLAIN { 201 enc = format.Encoding_RLE_DICTIONARY 202 } 203 204 if _, ok := c.decoders[enc]; ok { 205 return xerrors.New("parquet: column chunk cannot have more than one dictionary.") 206 } 207 208 switch page.Encoding() { 209 case format.Encoding_PLAIN, format.Encoding_PLAIN_DICTIONARY: 210 dict := c.decoderTraits.Decoder(parquet.Encodings.Plain, c.descr, false, c.mem) 211 dict.SetData(int(page.NumValues()), page.Data()) 212 213 decoder := c.decoderTraits.Decoder(parquet.Encodings.Plain, c.descr, true, c.mem).(encoding.DictDecoder) 214 decoder.SetDict(dict) 215 c.decoders[enc] = decoder 216 default: 217 return xerrors.New("parquet: dictionary index must be plain encoding") 218 } 219 220 c.curDecoder = c.decoders[enc] 221 return nil 222 } 223 224 // read a new page from the page reader 225 func (c *columnChunkReader) readNewPage() bool { 226 for c.rdr.Next() { // keep going until we get a data page 227 c.curPage = c.rdr.Page() 228 if c.curPage == nil { 229 break 230 } 231 232 var lvlByteLen int64 233 switch p := c.curPage.(type) { 234 case *DictionaryPage: 235 if err := c.configureDict(p); err != nil { 236 c.err = err 237 return false 238 } 239 continue 240 case *DataPageV1: 241 lvlByteLen, c.err = c.initLevelDecodersV1(p, p.repLvlEncoding, p.defLvlEncoding) 242 if c.err != nil { 243 return false 244 } 245 case *DataPageV2: 246 lvlByteLen, c.err = c.initLevelDecodersV2(p) 247 if c.err != nil { 248 return false 249 } 250 default: 251 // we can skip non-data pages 252 continue 253 } 254 255 c.err = c.initDataDecoder(c.curPage, lvlByteLen) 256 return c.err == nil 257 } 258 c.err = c.rdr.Err() 259 return false 260 } 261 262 func (c *columnChunkReader) initLevelDecodersV2(page *DataPageV2) (int64, error) { 263 c.numBuffered = int64(page.nvals) 264 c.numDecoded = 0 265 buf := page.Data() 266 totalLvlLen := int64(page.repLvlByteLen) + int64(page.defLvlByteLen) 267 268 if totalLvlLen > int64(len(buf)) { 269 return totalLvlLen, xerrors.New("parquet: data page too small for levels (corrupt header?)") 270 } 271 272 if c.descr.MaxRepetitionLevel() > 0 { 273 c.repetitionDecoder.SetDataV2(page.repLvlByteLen, c.descr.MaxRepetitionLevel(), int(c.numBuffered), buf) 274 buf = buf[page.repLvlByteLen:] 275 } 276 277 if c.descr.MaxDefinitionLevel() > 0 { 278 c.definitionDecoder.SetDataV2(page.defLvlByteLen, c.descr.MaxDefinitionLevel(), int(c.numBuffered), buf) 279 } 280 281 return totalLvlLen, nil 282 } 283 284 func (c *columnChunkReader) initLevelDecodersV1(page *DataPageV1, repLvlEncoding, defLvlEncoding format.Encoding) (int64, error) { 285 c.numBuffered = int64(page.nvals) 286 c.numDecoded = 0 287 288 buf := page.Data() 289 maxSize := len(buf) 290 levelsByteLen := int64(0) 291 292 // Data page layout: Repetition Levels - Definition Levels - encoded values. 293 // Levels are encoded as rle or bit-packed 294 if c.descr.MaxRepetitionLevel() > 0 { 295 repBytes, err := c.repetitionDecoder.SetData(parquet.Encoding(repLvlEncoding), c.descr.MaxRepetitionLevel(), int(c.numBuffered), buf) 296 if err != nil { 297 return levelsByteLen, err 298 } 299 buf = buf[repBytes:] 300 maxSize -= repBytes 301 levelsByteLen += int64(repBytes) 302 } 303 304 if c.descr.MaxDefinitionLevel() > 0 { 305 defBytes, err := c.definitionDecoder.SetData(parquet.Encoding(defLvlEncoding), c.descr.MaxDefinitionLevel(), int(c.numBuffered), buf) 306 if err != nil { 307 return levelsByteLen, err 308 } 309 levelsByteLen += int64(defBytes) 310 maxSize -= defBytes 311 } 312 313 return levelsByteLen, nil 314 } 315 316 func (c *columnChunkReader) initDataDecoder(page Page, lvlByteLen int64) error { 317 buf := page.Data() 318 if int64(len(buf)) < lvlByteLen { 319 return xerrors.New("parquet: page smaller than size of encoded levels") 320 } 321 322 buf = buf[lvlByteLen:] 323 encoding := page.Encoding() 324 325 if isDictIndexEncoding(encoding) { 326 encoding = format.Encoding_RLE_DICTIONARY 327 } 328 329 if decoder, ok := c.decoders[encoding]; ok { 330 c.curDecoder = decoder 331 } else { 332 switch encoding { 333 case format.Encoding_PLAIN, 334 format.Encoding_DELTA_BYTE_ARRAY, 335 format.Encoding_DELTA_LENGTH_BYTE_ARRAY, 336 format.Encoding_DELTA_BINARY_PACKED: 337 c.curDecoder = c.decoderTraits.Decoder(parquet.Encoding(encoding), c.descr, false, c.mem) 338 c.decoders[encoding] = c.curDecoder 339 case format.Encoding_RLE_DICTIONARY: 340 return xerrors.New("parquet: dictionary page must be before data page") 341 case format.Encoding_BYTE_STREAM_SPLIT: 342 return xerrors.Errorf("parquet: unsupported data encoding %s", encoding) 343 default: 344 return xerrors.Errorf("parquet: unknown encoding type %s", encoding) 345 } 346 } 347 348 c.curEncoding = encoding 349 c.curDecoder.SetData(int(c.numBuffered), buf) 350 return nil 351 } 352 353 // readDefinitionLevels decodes the definition levels from the page and returns 354 // it returns the total number of levels that were decoded (and thus populated 355 // in the passed in slice) and the number of physical values that exist to read 356 // (the number of levels that are equal to the max definition level). 357 // 358 // If the max definition level is 0, the assumption is that there no nulls in the 359 // column and therefore no definition levels to read, so it will always return 0, 0 360 func (c *columnChunkReader) readDefinitionLevels(levels []int16) (totalDecoded int, valuesToRead int64) { 361 if c.descr.MaxDefinitionLevel() == 0 { 362 return 0, 0 363 } 364 365 return c.definitionDecoder.Decode(levels) 366 } 367 368 // readRepetitionLevels decodes the repetition levels from the page and returns 369 // the total number of values decoded (and thus populated in the passed in levels 370 // slice). 371 // 372 // If max repetition level is 0, it is assumed there are no repetition levels, 373 // and thus will always return 0. 374 func (c *columnChunkReader) readRepetitionLevels(levels []int16) int { 375 if c.descr.MaxRepetitionLevel() == 0 { 376 return 0 377 } 378 379 nlevels, _ := c.repetitionDecoder.Decode(levels) 380 return nlevels 381 } 382 383 // determineNumToRead reads the definition levels (and optionally populates the repetition levels) 384 // in order to determine how many values need to be read to fulfill this batch read. 385 // 386 // batchLen is the number of values it is desired to read. defLvls must be either nil (in which case 387 // a buffer will be used) or must be at least batchLen in length to be safe. repLvls should be either nil 388 // (in which case it is ignored) or should be at least batchLen in length to be safe. 389 // 390 // In the return values: ndef is the number of definition levels that were actually read in which will 391 // typically be the minimum of batchLen and numAvailValues. 392 // toRead is the number of physical values that should be read in based on the definition levels (the number 393 // of definition levels that were equal to maxDefinitionLevel). and err being either nil or any error encountered 394 func (c *columnChunkReader) determineNumToRead(batchLen int64, defLvls, repLvls []int16) (ndefs int, toRead int64, err error) { 395 if !c.HasNext() { 396 return 0, 0, c.err 397 } 398 399 size := utils.Min(batchLen, c.numBuffered-c.numDecoded) 400 401 if c.descr.MaxDefinitionLevel() > 0 { 402 if defLvls == nil { 403 defLvls = c.getDefLvlBuffer(size) 404 } 405 ndefs, toRead = c.readDefinitionLevels(defLvls[:size]) 406 } else { 407 toRead = size 408 } 409 410 if c.descr.MaxRepetitionLevel() > 0 && repLvls != nil { 411 nreps := c.readRepetitionLevels(repLvls[:size]) 412 if defLvls != nil && ndefs != nreps { 413 err = xerrors.New("parquet: number of decoded rep/def levels did not match") 414 } 415 } 416 return 417 } 418 419 // skipValues some number of rows using readFn as the function to read the data and throw it away. 420 // If we can skipValues a whole page based on its metadata, then we do so, otherwise we read the 421 // page until we have skipped the number of rows desired. 422 func (c *columnChunkReader) skipValues(nvalues int64, readFn func(batch int64, buf []byte) (int64, error)) (int64, error) { 423 var err error 424 toskip := nvalues 425 for c.HasNext() && toskip > 0 { 426 // if number to skip is more than the number of undecoded values, skip the page 427 if toskip > (c.numBuffered - c.numDecoded) { 428 toskip -= c.numBuffered - c.numDecoded 429 c.numDecoded = c.numBuffered 430 } else { 431 var ( 432 batchSize int64 = 1024 433 valsRead int64 = 0 434 ) 435 436 scratch := memory.NewResizableBuffer(c.mem) 437 scratch.Reserve(c.decoderTraits.BytesRequired(int(batchSize))) 438 defer scratch.Release() 439 440 for { 441 batchSize = utils.Min(batchSize, toskip) 442 valsRead, err = readFn(batchSize, scratch.Buf()) 443 toskip -= valsRead 444 if valsRead <= 0 || toskip <= 0 || err != nil { 445 break 446 } 447 } 448 } 449 } 450 if c.err != nil { 451 err = c.err 452 } 453 return nvalues - toskip, err 454 } 455 456 type readerFunc func(int64, int64) (int, error) 457 458 // base function for reading a batch of values, this will read until it either reads in batchSize values or 459 // it hits the end of the column chunk, including reading multiple pages. 460 // 461 // totalValues is the total number of values which were read in, and thus would be the total number 462 // of definition levels and repetition levels which were populated (if they were non-nil). totalRead 463 // is the number of physical values that were read in (ie: the number of non-null values) 464 func (c *columnChunkReader) readBatch(batchSize int64, defLvls, repLvls []int16, readFn readerFunc) (totalLvls int64, totalRead int, err error) { 465 var ( 466 read int 467 defs []int16 468 reps []int16 469 ndefs int 470 toRead int64 471 ) 472 473 for c.HasNext() && totalLvls < batchSize && err == nil { 474 if defLvls != nil { 475 defs = defLvls[totalLvls:] 476 } 477 if repLvls != nil { 478 reps = repLvls[totalLvls:] 479 } 480 ndefs, toRead, err = c.determineNumToRead(batchSize-totalLvls, defs, reps) 481 if err != nil { 482 return totalLvls, totalRead, err 483 } 484 485 read, err = readFn(int64(totalRead), toRead) 486 // the total number of values processed here is the maximum of 487 // the number of definition levels or the number of physical values read. 488 // if this is a required field, ndefs will be 0 since there is no definition 489 // levels stored with it and `read` will be the number of values, otherwise 490 // we use ndefs since it will be equal to or greater than read. 491 totalVals := int64(utils.MaxInt(ndefs, read)) 492 c.consumeBufferedValues(totalVals) 493 494 totalLvls += totalVals 495 totalRead += read 496 } 497 return totalLvls, totalRead, err 498 }