github.com/apache/arrow/go/v14@v14.0.1/parquet/file/column_reader.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file

import (
	"fmt"
	"sync"

	"github.com/apache/arrow/go/v14/arrow/memory"
	"github.com/apache/arrow/go/v14/internal/utils"
	"github.com/apache/arrow/go/v14/parquet"
	"github.com/apache/arrow/go/v14/parquet/internal/encoding"
	"github.com/apache/arrow/go/v14/parquet/internal/encryption"
	format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v14/parquet/schema"
	"golang.org/x/xerrors"
)

const (
	// 4 MB is the default maximum page header size
	defaultMaxPageHeaderSize = 4 * 1024 * 1024
	// 16 KB is the default expected page header size
	defaultPageHeaderSize = 16 * 1024
)

//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata column_reader_types.gen.go.tmpl

func isDictIndexEncoding(e format.Encoding) bool {
	return e == format.Encoding_RLE_DICTIONARY || e == format.Encoding_PLAIN_DICTIONARY
}

// CryptoContext is a context for keeping track of the current methods for decrypting.
// It keeps track of the row group and column numbers along with references to the
// decryptor objects.
type CryptoContext struct {
	StartDecryptWithDictionaryPage bool
	RowGroupOrdinal                int16
	ColumnOrdinal                  int16
	MetaDecryptor                  encryption.Decryptor
	DataDecryptor                  encryption.Decryptor
}

// ColumnChunkReader is the basic interface for all column readers. It will use
// a page reader to read all the pages in a column chunk from a row group.
//
// To actually read out the column data, you need to convert to the properly
// typed ColumnChunkReader type such as *BooleanColumnReader etc.
//
// Some things to clarify when working with column readers:
//
// "Values" refers to the physical data values in a data page.
//
// This is separate from the number of "rows" in a column and the total number
// of "elements" in a column because null values aren't stored physically in the
// data page but are represented via definition levels, so the number of values
// in a column can be less than the number of rows.
//
// The total number of "elements" in a column also differs because of potential
// repeated fields, where you can have multiple values in the page which
// together make up a single element (such as a list) or, depending on the repetition
// level and definition level, could represent an entire null list or just a null
// element inside of a list.
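//
// For example (illustrative only, not taken from the original source): an
// optional INT32 column with four rows [1, null, 2, null] physically stores
// only the two non-null values in the data page, alongside definition levels
// [1, 0, 1, 0]. A repeated field would additionally store repetition levels
// indicating which values begin a new row versus continue the current list.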
type ColumnChunkReader interface {
	// HasNext returns whether there is more data to be read in this column
	// and row group.
	HasNext() bool
	// Type returns the underlying physical type of the column
	Type() parquet.Type
	// Descriptor returns the column schema container
	Descriptor() *schema.Column
	// If HasNext returns false because of an error, this will return the error
	// it encountered. Otherwise this will be nil if it's just the end of the
	// column.
	Err() error
	// Skip buffered values
	consumeBufferedValues(int64)
	// number of available buffered values that have not been decoded yet.
	// When this returns 0, you're at the end of a page.
	numAvailValues() int64
	// read the definition levels and return the number of definitions read,
	// and the number of values to be read (the number of def levels equal to
	// the max def level). It also populates the passed in slice, which should
	// be sized appropriately.
	readDefinitionLevels(levels []int16) (int, int64)
	// read the repetition levels and return the number of repetition levels read.
	// Also populates the passed in slice, which should be sized appropriately.
	readRepetitionLevels(levels []int16) int
	// a column is made up of potentially multiple pages across potentially multiple
	// row groups. A PageReader allows looping through the pages in a single row group.
	// When moving to another row group for reading, use setPageReader to re-use the
	// column reader for reading the pages of the new row group.
	pager() PageReader
	// set a page reader into the column reader so it can be reused.
	//
	// This will clear any current error in the reader but does not
	// automatically read the first page of the page reader passed in until
	// HasNext is called, which will read in the next page.
	setPageReader(PageReader)
}

type columnChunkReader struct {
	descr             *schema.Column
	rdr               PageReader
	repetitionDecoder encoding.LevelDecoder
	definitionDecoder encoding.LevelDecoder

	curPage     Page
	curEncoding format.Encoding
	curDecoder  encoding.TypedDecoder

	// number of currently buffered values in the current page
	numBuffered int64
	// the number of values we've decoded so far
	numDecoded int64
	mem        memory.Allocator
	bufferPool *sync.Pool

	decoders      map[format.Encoding]encoding.TypedDecoder
	decoderTraits encoding.DecoderTraits

	// is set when an error is encountered
	err          error
	defLvlBuffer []int16

	newDictionary bool
}

// NewColumnReader returns a column reader for the provided column initialized with the given pagereader that will
// provide the pages of data for this column. The type is determined from the column passed in.
//
// In addition to the page reader and allocator, a pointer to a shared sync.Pool is expected to provide buffers for temporary
// usage to minimize allocations. The bufferPool should provide *memory.Buffer objects that can be resized as necessary; buffers
// should have `ResizeNoShrink(0)` called on them before being put back into the pool.
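//
// A minimal sketch of constructing a compatible pool (assumed usage, not part
// of this file; mem is whatever memory.Allocator the caller already has):
//
//	pool := &sync.Pool{
//		New: func() interface{} { return memory.NewResizableBuffer(mem) },
//	}
//	rdr := file.NewColumnReader(descr, pageRdr, mem, pool)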
func NewColumnReader(descr *schema.Column, pageReader PageReader, mem memory.Allocator, bufferPool *sync.Pool) ColumnChunkReader {
	base := columnChunkReader{descr: descr, rdr: pageReader, mem: mem, decoders: make(map[format.Encoding]encoding.TypedDecoder), bufferPool: bufferPool}
	switch descr.PhysicalType() {
	case parquet.Types.FixedLenByteArray:
		base.decoderTraits = &encoding.FixedLenByteArrayDecoderTraits
		return &FixedLenByteArrayColumnChunkReader{base}
	case parquet.Types.Float:
		base.decoderTraits = &encoding.Float32DecoderTraits
		return &Float32ColumnChunkReader{base}
	case parquet.Types.Double:
		base.decoderTraits = &encoding.Float64DecoderTraits
		return &Float64ColumnChunkReader{base}
	case parquet.Types.ByteArray:
		base.decoderTraits = &encoding.ByteArrayDecoderTraits
		return &ByteArrayColumnChunkReader{base}
	case parquet.Types.Int32:
		base.decoderTraits = &encoding.Int32DecoderTraits
		return &Int32ColumnChunkReader{base}
	case parquet.Types.Int64:
		base.decoderTraits = &encoding.Int64DecoderTraits
		return &Int64ColumnChunkReader{base}
	case parquet.Types.Int96:
		base.decoderTraits = &encoding.Int96DecoderTraits
		return &Int96ColumnChunkReader{base}
	case parquet.Types.Boolean:
		base.decoderTraits = &encoding.BooleanDecoderTraits
		return &BooleanColumnChunkReader{base}
	}
	return nil
}

func (c *columnChunkReader) Err() error                    { return c.err }
func (c *columnChunkReader) Type() parquet.Type            { return c.descr.PhysicalType() }
func (c *columnChunkReader) Descriptor() *schema.Column    { return c.descr }
func (c *columnChunkReader) consumeBufferedValues(n int64) { c.numDecoded += n }
func (c *columnChunkReader) numAvailValues() int64         { return c.numBuffered - c.numDecoded }
func (c *columnChunkReader) pager() PageReader             { return c.rdr }
func (c *columnChunkReader) setPageReader(rdr PageReader) {
	c.rdr, c.err = rdr, nil
	c.decoders = make(map[format.Encoding]encoding.TypedDecoder)
	c.numBuffered, c.numDecoded = 0, 0
}

func (c *columnChunkReader) getDefLvlBuffer(sz int64) []int16 {
	if int64(len(c.defLvlBuffer)) < sz {
		c.defLvlBuffer = make([]int16, sz)
		return c.defLvlBuffer
	}

	return c.defLvlBuffer[:sz]
}

// HasNext returns whether there is more data to be read in this column
// and row group.
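//
// A typical consumption loop over a typed reader might look like the following
// sketch (assumes an *Int32ColumnChunkReader and the ReadBatch method generated
// by the template referenced above; names are illustrative):
//
//	values := make([]int32, 1024)
//	defLvls := make([]int16, 1024)
//	for i32Rdr.HasNext() {
//		total, read, err := i32Rdr.ReadBatch(1024, values, defLvls, nil)
//		if err != nil {
//			break // or inspect i32Rdr.Err()
//		}
//		_ = total              // levels processed this call
//		process(values[:read]) // hypothetical consumer of the non-null values
//	}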
func (c *columnChunkReader) HasNext() bool {
	if c.numBuffered == 0 || c.numDecoded == c.numBuffered {
		return c.readNewPage() && c.numBuffered != 0
	}
	return true
}

func (c *columnChunkReader) configureDict(page *DictionaryPage) error {
	enc := page.encoding
	if enc == format.Encoding_PLAIN_DICTIONARY || enc == format.Encoding_PLAIN {
		enc = format.Encoding_RLE_DICTIONARY
	}

	if _, ok := c.decoders[enc]; ok {
		return xerrors.New("parquet: column chunk cannot have more than one dictionary.")
	}

	switch page.Encoding() {
	case format.Encoding_PLAIN, format.Encoding_PLAIN_DICTIONARY:
		dict := c.decoderTraits.Decoder(parquet.Encodings.Plain, c.descr, false, c.mem)
		dict.SetData(int(page.NumValues()), page.Data())

		decoder := c.decoderTraits.Decoder(parquet.Encodings.Plain, c.descr, true, c.mem).(encoding.DictDecoder)
		decoder.SetDict(dict)
		c.decoders[enc] = decoder
	default:
		return xerrors.New("parquet: dictionary index must be plain encoding")
	}

	c.newDictionary = true
	c.curDecoder = c.decoders[enc]
	return nil
}

// read a new page from the page reader
func (c *columnChunkReader) readNewPage() bool {
	for c.rdr.Next() { // keep going until we get a data page
		c.curPage = c.rdr.Page()
		if c.curPage == nil {
			break
		}

		var lvlByteLen int64
		switch p := c.curPage.(type) {
		case *DictionaryPage:
			if err := c.configureDict(p); err != nil {
				c.err = err
				return false
			}
			continue
		case *DataPageV1:
			lvlByteLen, c.err = c.initLevelDecodersV1(p, p.repLvlEncoding, p.defLvlEncoding)
			if c.err != nil {
				return false
			}
		case *DataPageV2:
			lvlByteLen, c.err = c.initLevelDecodersV2(p)
			if c.err != nil {
				return false
			}
		default:
			// we can skip non-data pages
			continue
		}

		c.err = c.initDataDecoder(c.curPage, lvlByteLen)
		return c.err == nil
	}
	c.err = c.rdr.Err()
	return false
}

func (c *columnChunkReader) initLevelDecodersV2(page *DataPageV2) (int64, error) {
	c.numBuffered = int64(page.nvals)
	c.numDecoded = 0
	buf := page.Data()
	totalLvlLen := int64(page.repLvlByteLen) + int64(page.defLvlByteLen)

	if totalLvlLen > int64(len(buf)) {
		return totalLvlLen, xerrors.New("parquet: data page too small for levels (corrupt header?)")
	}

	if c.descr.MaxRepetitionLevel() > 0 {
		c.repetitionDecoder.SetDataV2(page.repLvlByteLen, c.descr.MaxRepetitionLevel(), int(c.numBuffered), buf)
	}
	// ARROW-17453: Some writers will write repetition levels even when
	// the max repetition level is 0, so we should respect the value
	// in the page header regardless of whether MaxRepetitionLevel is 0
	// or not.
	buf = buf[page.repLvlByteLen:]

	if c.descr.MaxDefinitionLevel() > 0 {
		c.definitionDecoder.SetDataV2(page.defLvlByteLen, c.descr.MaxDefinitionLevel(), int(c.numBuffered), buf)
	}

	return totalLvlLen, nil
}

func (c *columnChunkReader) initLevelDecodersV1(page *DataPageV1, repLvlEncoding, defLvlEncoding format.Encoding) (int64, error) {
	c.numBuffered = int64(page.nvals)
	c.numDecoded = 0

	buf := page.Data()
	maxSize := len(buf)
	levelsByteLen := int64(0)

	// Data page layout: Repetition Levels - Definition Levels - encoded values.
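	// Illustrative sketch of a V1 page for an optional, repeated column
	// (assumed shapes; RLE-encoded levels in V1 pages carry a 4-byte
	// little-endian length prefix, which is why SetData below can report
	// how many bytes each level run consumed):
	//
	//	[len][RLE rep levels][len][RLE def levels][encoded values ...]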
	// Levels are encoded as RLE or bit-packed.
	if c.descr.MaxRepetitionLevel() > 0 {
		repBytes, err := c.repetitionDecoder.SetData(parquet.Encoding(repLvlEncoding), c.descr.MaxRepetitionLevel(), int(c.numBuffered), buf)
		if err != nil {
			return levelsByteLen, err
		}
		buf = buf[repBytes:]
		maxSize -= repBytes
		levelsByteLen += int64(repBytes)
	}

	if c.descr.MaxDefinitionLevel() > 0 {
		defBytes, err := c.definitionDecoder.SetData(parquet.Encoding(defLvlEncoding), c.descr.MaxDefinitionLevel(), int(c.numBuffered), buf)
		if err != nil {
			return levelsByteLen, err
		}
		levelsByteLen += int64(defBytes)
		maxSize -= defBytes
	}

	return levelsByteLen, nil
}

func (c *columnChunkReader) initDataDecoder(page Page, lvlByteLen int64) error {
	buf := page.Data()
	if int64(len(buf)) < lvlByteLen {
		return xerrors.New("parquet: page smaller than size of encoded levels")
	}

	buf = buf[lvlByteLen:]
	encoding := page.Encoding()

	if isDictIndexEncoding(encoding) {
		encoding = format.Encoding_RLE_DICTIONARY
	}

	if decoder, ok := c.decoders[encoding]; ok {
		c.curDecoder = decoder
	} else {
		switch encoding {
		case format.Encoding_PLAIN,
			format.Encoding_DELTA_BYTE_ARRAY,
			format.Encoding_DELTA_LENGTH_BYTE_ARRAY,
			format.Encoding_DELTA_BINARY_PACKED:
			c.curDecoder = c.decoderTraits.Decoder(parquet.Encoding(encoding), c.descr, false, c.mem)
			c.decoders[encoding] = c.curDecoder
		case format.Encoding_RLE_DICTIONARY:
			return xerrors.New("parquet: dictionary page must be before data page")
		case format.Encoding_BYTE_STREAM_SPLIT:
			return fmt.Errorf("parquet: unsupported data encoding %s", encoding)
		default:
			return fmt.Errorf("parquet: unknown encoding type %s", encoding)
		}
	}

	c.curEncoding = encoding
	c.curDecoder.SetData(int(c.numBuffered), buf)
	return nil
}

// readDefinitionLevels decodes the definition levels from the page. It returns
// the total number of levels that were decoded (and thus populated in the
// passed in slice) and the number of physical values that exist to read (the
// number of levels that are equal to the max definition level).
//
// If the max definition level is 0, the assumption is that there are no nulls in the
// column and therefore no definition levels to read, so it will always return 0, 0.
func (c *columnChunkReader) readDefinitionLevels(levels []int16) (totalDecoded int, valuesToRead int64) {
	if c.descr.MaxDefinitionLevel() == 0 {
		return 0, 0
	}

	return c.definitionDecoder.Decode(levels)
}

// readRepetitionLevels decodes the repetition levels from the page and returns
// the total number of values decoded (and thus populated in the passed in levels
// slice).
//
// If max repetition level is 0, it is assumed there are no repetition levels,
// and thus will always return 0.
func (c *columnChunkReader) readRepetitionLevels(levels []int16) int {
	if c.descr.MaxRepetitionLevel() == 0 {
		return 0
	}

	nlevels, _ := c.repetitionDecoder.Decode(levels)
	return nlevels
}
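
// Worked example (illustrative): for a column with MaxDefinitionLevel() == 1 whose
// current page holds definition levels [1, 0, 1, 1], readDefinitionLevels returns
// (4, 3): four levels were decoded into the slice, and three physical values (the
// levels equal to the max definition level) remain to be read from the page.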

// determineNumToRead reads the definition levels (and optionally populates the repetition levels)
// in order to determine how many values need to be read to fulfill this batch read.
//
// batchLen is the number of values desired to be read. defLvls must be either nil (in which case
// an internal buffer will be used) or at least batchLen in length to be safe. repLvls should be
// either nil (in which case it is ignored) or at least batchLen in length to be safe.
//
// In the return values: ndefs is the number of definition levels that were actually read in,
// which will typically be the minimum of batchLen and numAvailValues; toRead is the number of
// physical values that should be read in based on the definition levels (the number of definition
// levels that were equal to maxDefinitionLevel); and err is either nil or any error encountered.
func (c *columnChunkReader) determineNumToRead(batchLen int64, defLvls, repLvls []int16) (ndefs int, toRead int64, err error) {
	if !c.HasNext() {
		return 0, 0, c.err
	}

	size := utils.Min(batchLen, c.numBuffered-c.numDecoded)

	if c.descr.MaxDefinitionLevel() > 0 {
		if defLvls == nil {
			defLvls = c.getDefLvlBuffer(size)
		}
		ndefs, toRead = c.readDefinitionLevels(defLvls[:size])
	} else {
		toRead = size
	}

	if c.descr.MaxRepetitionLevel() > 0 && repLvls != nil {
		nreps := c.readRepetitionLevels(repLvls[:size])
		if defLvls != nil && ndefs != nreps {
			err = xerrors.New("parquet: number of decoded rep/def levels did not match")
		}
	}
	return
}

// skipValues skips nvalues values using readFn as the function to read the data and throw it away.
// If we can skip a whole page based on its metadata, then we do so; otherwise we read from the
// page until we have skipped the desired number of values.
func (c *columnChunkReader) skipValues(nvalues int64, readFn func(batch int64, buf []byte) (int64, error)) (int64, error) {
	var err error
	toskip := nvalues
	for c.HasNext() && toskip > 0 {
		// if the number to skip is more than the number of undecoded values, skip the page
		if toskip > (c.numBuffered - c.numDecoded) {
			toskip -= c.numBuffered - c.numDecoded
			c.numDecoded = c.numBuffered
		} else {
			var (
				batchSize int64 = 1024
				valsRead  int64 = 0
			)

			scratch := c.bufferPool.Get().(*memory.Buffer)
			defer func() {
				scratch.ResizeNoShrink(0)
				c.bufferPool.Put(scratch)
			}()
			bufMult := 1
			if c.descr.PhysicalType() == parquet.Types.Boolean {
				// for bools, BytesRequired returns 1 byte per 8 bools, but casting []byte to []bool requires 1 byte per bool
				bufMult = 8
			}
			scratch.Reserve(c.decoderTraits.BytesRequired(int(batchSize) * bufMult))

			for {
				batchSize = utils.Min(batchSize, toskip)
				valsRead, err = readFn(batchSize, scratch.Buf())
				toskip -= valsRead
				if valsRead <= 0 || toskip <= 0 || err != nil {
					break
				}
			}
		}
	}
	if c.err != nil {
		err = c.err
	}
	return nvalues - toskip, err
}

type readerFunc func(int64, int64) (int, error)
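
// A readerFunc receives the current write offset into the caller's output and the
// number of physical values to decode at that offset. An illustrative (hypothetical)
// closure over an int32 destination slice might look like:
//
//	readFn := func(start, numValues int64) (int, error) {
//		return int32Decoder.Decode(out[start : start+numValues])
//	}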

// base function for reading a batch of values. This will read until it either reads in batchSize
// values or it hits the end of the column chunk, including reading multiple pages.
//
// totalLvls is the total number of levels which were read in, and thus would be the total number
// of definition levels and repetition levels which were populated (if they were non-nil). totalRead
// is the number of physical values that were read in (i.e., the number of non-null values).
func (c *columnChunkReader) readBatch(batchSize int64, defLvls, repLvls []int16, readFn readerFunc) (totalLvls int64, totalRead int, err error) {
	var (
		read   int
		defs   []int16
		reps   []int16
		ndefs  int
		toRead int64
	)

	for c.HasNext() && totalLvls < batchSize && err == nil {
		if defLvls != nil {
			defs = defLvls[totalLvls:]
		}
		if repLvls != nil {
			reps = repLvls[totalLvls:]
		}
		ndefs, toRead, err = c.determineNumToRead(batchSize-totalLvls, defs, reps)
		if err != nil {
			return totalLvls, totalRead, err
		}

		read, err = readFn(int64(totalRead), toRead)
		// the total number of values processed here is the maximum of
		// the number of definition levels or the number of physical values read.
		// if this is a required field, ndefs will be 0 since there are no definition
		// levels stored with it and `read` will be the number of values, otherwise
		// we use ndefs since it will be equal to or greater than read.
		totalVals := int64(utils.MaxInt(ndefs, read))
		c.consumeBufferedValues(totalVals)

		totalLvls += totalVals
		totalRead += read
	}
	return totalLvls, totalRead, err
}
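
// End-to-end sketch of obtaining and draining a column reader from a file
// (assumed API shapes from this package; illustrative only, error handling elided):
//
//	f, _ := os.Open("data.parquet")
//	rdr, _ := file.NewParquetReader(f)
//	defer rdr.Close()
//	col, _ := rdr.RowGroup(0).Column(0)
//	if i32, ok := col.(*file.Int32ColumnChunkReader); ok {
//		vals := make([]int32, 128)
//		for i32.HasNext() {
//			_, n, _ := i32.ReadBatch(int64(len(vals)), vals, nil, nil)
//			_ = vals[:n] // non-null values decoded this iteration
//		}
//	}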