github.com/apache/arrow/go/v16@v16.1.0/parquet/file/column_reader.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file

import (
	"errors"
	"fmt"
	"sync"

	"github.com/apache/arrow/go/v16/arrow/memory"
	"github.com/apache/arrow/go/v16/internal/utils"
	"github.com/apache/arrow/go/v16/parquet"
	"github.com/apache/arrow/go/v16/parquet/internal/encoding"
	"github.com/apache/arrow/go/v16/parquet/internal/encryption"
	format "github.com/apache/arrow/go/v16/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v16/parquet/schema"
	"golang.org/x/xerrors"
)

const (
	// 4 MB is the default maximum page header size
	defaultMaxPageHeaderSize = 4 * 1024 * 1024
	// 16 KB is the default expected page header size
	defaultPageHeaderSize = 16 * 1024
)

//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata column_reader_types.gen.go.tmpl

func isDictIndexEncoding(e format.Encoding) bool {
	return e == format.Encoding_RLE_DICTIONARY || e == format.Encoding_PLAIN_DICTIONARY
}

// CryptoContext is a context for keeping track of the current methods for decrypting.
// It keeps track of the row group and column numbers along with references to the
// decryptor objects.
type CryptoContext struct {
	StartDecryptWithDictionaryPage bool
	RowGroupOrdinal                int16
	ColumnOrdinal                  int16
	MetaDecryptor                  encryption.Decryptor
	DataDecryptor                  encryption.Decryptor
}

// ColumnChunkReader is the basic interface for all column readers. It will use
// a page reader to read all the pages in a column chunk from a row group.
//
// To actually Read out the column data, you need to convert to the properly
// typed ColumnChunkReader type such as *BooleanColumnChunkReader etc.
//
// Some things to clarify when working with column readers:
//
// "Values" refers to the physical data values in a data page.
//
// This is separate from the number of "rows" in a column and the total number
// of "elements" in a column because null values aren't stored physically in the
// data page but are represented via definition levels, so the number of values
// in a column can be less than the number of rows.
//
// The total number of "elements" in a column also differs because of potential
// repeated fields, where you can have multiple values in the page which
// together make up a single element (such as a list), or which, depending on the
// repetition and definition levels, could represent an entire null list or just
// a null element inside of a list.
type ColumnChunkReader interface {
	// HasNext returns whether there is more data to be read in this column
	// and row group.
	HasNext() bool
	// Type returns the underlying physical type of the column
	Type() parquet.Type
	// Descriptor returns the column schema container
	Descriptor() *schema.Column
	// Err returns the error encountered if HasNext returned false because of
	// an error; otherwise it returns nil if it was simply the end of the column.
	Err() error
	// Skip buffered values
	consumeBufferedValues(int64)
	// number of available buffered values that have not been decoded yet.
	// when this returns 0, you're at the end of a page.
	numAvailValues() int64
	// read the definition levels and return the number of definitions read,
	// along with the number of values to be read (the number of def levels
	// equal to the max definition level). It also populates the passed in
	// slice, which should be sized appropriately.
	readDefinitionLevels(levels []int16) (int, int64)
	// read the repetition levels and return the number of repetition levels read.
	// also populates the passed in slice, which should be sized appropriately.
	readRepetitionLevels(levels []int16) int
	// a column is made up of potentially multiple pages across potentially multiple
	// row groups. A PageReader allows looping through the pages in a single row group.
	// When moving to another row group for reading, use setPageReader to re-use the
	// column reader for reading the pages of the new row group.
	pager() PageReader
	// set a page reader into the columnreader so it can be reused.
	//
	// This will clear any current error in the reader, but it does not
	// automatically read the first page of the new page reader; the next
	// call to HasNext will read it in.
	setPageReader(PageReader)
}
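// To make the values/rows/elements distinction above concrete (an illustrative
// sketch, not data from any particular file): an optional list<int32> column
// holding the three rows
//
//	[1, 2], NULL, [3, NULL]
//
// has 3 rows and 5 leaf-level slots, but only 3 physical values (1, 2 and 3)
// are stored in the data pages; the NULL list and the null element inside the
// last list exist only as definition levels.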
type columnChunkReader struct {
	descr             *schema.Column
	rdr               PageReader
	repetitionDecoder encoding.LevelDecoder
	definitionDecoder encoding.LevelDecoder

	curPage     Page
	curEncoding format.Encoding
	curDecoder  encoding.TypedDecoder

	// number of currently buffered values in the current page
	numBuffered int64
	// the number of values we've decoded so far
	numDecoded int64
	mem        memory.Allocator
	bufferPool *sync.Pool

	decoders      map[format.Encoding]encoding.TypedDecoder
	decoderTraits encoding.DecoderTraits

	// is set when an error is encountered
	err          error
	defLvlBuffer []int16

	newDictionary bool
}

// NewColumnReader returns a column reader for the provided column, initialized
// with the given page reader that will provide the pages of data for this column.
// The type is determined from the column passed in.
//
// In addition to the page reader and allocator, a pointer to a shared sync.Pool
// is expected to provide buffers for temporary usage to minimize allocations.
// The bufferPool should provide *memory.Buffer objects that can be resized as
// necessary; buffers should have `ResizeNoShrink(0)` called on them before being
// put back into the pool.
func NewColumnReader(descr *schema.Column, pageReader PageReader, mem memory.Allocator, bufferPool *sync.Pool) ColumnChunkReader {
	base := columnChunkReader{descr: descr, rdr: pageReader, mem: mem, decoders: make(map[format.Encoding]encoding.TypedDecoder), bufferPool: bufferPool}
	switch descr.PhysicalType() {
	case parquet.Types.FixedLenByteArray:
		base.decoderTraits = &encoding.FixedLenByteArrayDecoderTraits
		return &FixedLenByteArrayColumnChunkReader{base}
	case parquet.Types.Float:
		base.decoderTraits = &encoding.Float32DecoderTraits
		return &Float32ColumnChunkReader{base}
	case parquet.Types.Double:
		base.decoderTraits = &encoding.Float64DecoderTraits
		return &Float64ColumnChunkReader{base}
	case parquet.Types.ByteArray:
		base.decoderTraits = &encoding.ByteArrayDecoderTraits
		return &ByteArrayColumnChunkReader{base}
	case parquet.Types.Int32:
		base.decoderTraits = &encoding.Int32DecoderTraits
		return &Int32ColumnChunkReader{base}
	case parquet.Types.Int64:
		base.decoderTraits = &encoding.Int64DecoderTraits
		return &Int64ColumnChunkReader{base}
	case parquet.Types.Int96:
		base.decoderTraits = &encoding.Int96DecoderTraits
		return &Int96ColumnChunkReader{base}
	case parquet.Types.Boolean:
		base.decoderTraits = &encoding.BooleanDecoderTraits
		return &BooleanColumnChunkReader{base}
	}
	return nil
}
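// An illustrative sketch of constructing the shared buffer pool that
// NewColumnReader expects; descr and pageReader here are placeholders assumed
// to come from an open file's row group metadata:
//
//	pool := &sync.Pool{
//		New: func() interface{} {
//			return memory.NewResizableBuffer(memory.DefaultAllocator)
//		},
//	}
//	rdr := NewColumnReader(descr, pageReader, memory.DefaultAllocator, pool)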
func (c *columnChunkReader) Err() error                    { return c.err }
func (c *columnChunkReader) Type() parquet.Type            { return c.descr.PhysicalType() }
func (c *columnChunkReader) Descriptor() *schema.Column    { return c.descr }
func (c *columnChunkReader) consumeBufferedValues(n int64) { c.numDecoded += n }
func (c *columnChunkReader) numAvailValues() int64         { return c.numBuffered - c.numDecoded }
func (c *columnChunkReader) pager() PageReader             { return c.rdr }
func (c *columnChunkReader) setPageReader(rdr PageReader) {
	c.rdr, c.err = rdr, nil
	c.decoders = make(map[format.Encoding]encoding.TypedDecoder)
	c.numBuffered, c.numDecoded = 0, 0
}

func (c *columnChunkReader) getDefLvlBuffer(sz int64) []int16 {
	if int64(len(c.defLvlBuffer)) < sz {
		c.defLvlBuffer = make([]int16, sz)
		return c.defLvlBuffer
	}

	return c.defLvlBuffer[:sz]
}

// HasNext returns whether there is more data to be read in this column
// and row group.
func (c *columnChunkReader) HasNext() bool {
	if c.numBuffered == 0 || c.numDecoded == c.numBuffered {
		return c.readNewPage() && c.numBuffered != 0
	}
	return true
}
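// A sketch of the typical consumption pattern built on HasNext, assuming the
// reader was created for an INT32 column (the variable names are placeholders,
// not identifiers from this package):
//
//	int32Rdr := rdr.(*Int32ColumnChunkReader)
//	vals := make([]int32, 1024)
//	defLvls := make([]int16, 1024)
//	for int32Rdr.HasNext() {
//		total, valuesRead, err := int32Rdr.ReadBatch(1024, vals, defLvls, nil)
//		if err != nil {
//			break
//		}
//		// total counts levels (including null slots); valuesRead counts the
//		// physical values decoded into vals.
//		_, _ = total, valuesRead
//	}
//	if err := int32Rdr.Err(); err != nil {
//		// handle the error
//	}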
func (c *columnChunkReader) configureDict(page *DictionaryPage) error {
	enc := page.encoding
	if enc == format.Encoding_PLAIN_DICTIONARY || enc == format.Encoding_PLAIN {
		enc = format.Encoding_RLE_DICTIONARY
	}

	if _, ok := c.decoders[enc]; ok {
		return xerrors.New("parquet: column chunk cannot have more than one dictionary.")
	}

	switch page.Encoding() {
	case format.Encoding_PLAIN, format.Encoding_PLAIN_DICTIONARY:
		dict := c.decoderTraits.Decoder(parquet.Encodings.Plain, c.descr, false, c.mem)
		dict.SetData(int(page.NumValues()), page.Data())

		decoder := c.decoderTraits.Decoder(parquet.Encodings.Plain, c.descr, true, c.mem).(encoding.DictDecoder)
		decoder.SetDict(dict)
		c.decoders[enc] = decoder
	default:
		return xerrors.New("parquet: dictionary index must be plain encoding")
	}

	c.newDictionary = true
	c.curDecoder = c.decoders[enc]
	return nil
}

// read a new page from the page reader
func (c *columnChunkReader) readNewPage() bool {
	for c.rdr.Next() { // keep going until we get a data page
		c.curPage = c.rdr.Page()
		if c.curPage == nil {
			break
		}

		var lvlByteLen int64
		switch p := c.curPage.(type) {
		case *DictionaryPage:
			if err := c.configureDict(p); err != nil {
				c.err = err
				return false
			}
			continue
		case *DataPageV1:
			lvlByteLen, c.err = c.initLevelDecodersV1(p, p.repLvlEncoding, p.defLvlEncoding)
			if c.err != nil {
				return false
			}
		case *DataPageV2:
			lvlByteLen, c.err = c.initLevelDecodersV2(p)
			if c.err != nil {
				return false
			}
		default:
			// we can skip non-data pages
			continue
		}

		c.err = c.initDataDecoder(c.curPage, lvlByteLen)
		return c.err == nil
	}
	c.err = c.rdr.Err()
	return false
}

func (c *columnChunkReader) initLevelDecodersV2(page *DataPageV2) (int64, error) {
	c.numBuffered = int64(page.nvals)
	c.numDecoded = 0
	buf := page.Data()
	totalLvlLen := int64(page.repLvlByteLen) + int64(page.defLvlByteLen)

	if totalLvlLen > int64(len(buf)) {
		return totalLvlLen, xerrors.New("parquet: data page too small for levels (corrupt header?)")
	}

	if c.descr.MaxRepetitionLevel() > 0 {
		c.repetitionDecoder.SetDataV2(page.repLvlByteLen, c.descr.MaxRepetitionLevel(), int(c.numBuffered), buf)
	}
	// ARROW-17453: Some writers will write repetition levels even when
	// the max repetition level is 0, so we should respect the value
	// in the page header regardless of whether MaxRepetitionLevel is 0
	// or not.
	buf = buf[page.repLvlByteLen:]

	if c.descr.MaxDefinitionLevel() > 0 {
		c.definitionDecoder.SetDataV2(page.defLvlByteLen, c.descr.MaxDefinitionLevel(), int(c.numBuffered), buf)
	}

	return totalLvlLen, nil
}

func (c *columnChunkReader) initLevelDecodersV1(page *DataPageV1, repLvlEncoding, defLvlEncoding format.Encoding) (int64, error) {
	c.numBuffered = int64(page.nvals)
	c.numDecoded = 0

	buf := page.Data()
	maxSize := len(buf)
	levelsByteLen := int64(0)

	// Data page layout: Repetition Levels - Definition Levels - encoded values.
	// Levels are encoded as RLE or bit-packed.
	if c.descr.MaxRepetitionLevel() > 0 {
		repBytes, err := c.repetitionDecoder.SetData(parquet.Encoding(repLvlEncoding), c.descr.MaxRepetitionLevel(), int(c.numBuffered), buf)
		if err != nil {
			return levelsByteLen, err
		}
		buf = buf[repBytes:]
		maxSize -= repBytes
		levelsByteLen += int64(repBytes)
	}

	if c.descr.MaxDefinitionLevel() > 0 {
		defBytes, err := c.definitionDecoder.SetData(parquet.Encoding(defLvlEncoding), c.descr.MaxDefinitionLevel(), int(c.numBuffered), buf)
		if err != nil {
			return levelsByteLen, err
		}
		levelsByteLen += int64(defBytes)
		maxSize -= defBytes
	}

	return levelsByteLen, nil
}
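// For reference, the byte layouts that the two initLevelDecoders functions
// walk through (a sketch; the Parquet format specification is normative):
//
//	DataPageV1: [ rep levels (length-prefixed RLE) ][ def levels (length-prefixed RLE) ][ encoded values ]
//	DataPageV2: [ rep levels (RLE, no prefix) ][ def levels (RLE, no prefix) ][ encoded values ]
//
// In V2 the level byte lengths come from the page header rather than from
// in-stream prefixes, which is why SetDataV2 takes the byte length explicitly
// while V1's SetData discovers it and returns the number of bytes consumed.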
func (c *columnChunkReader) initDataDecoder(page Page, lvlByteLen int64) error {
	buf := page.Data()
	if int64(len(buf)) < lvlByteLen {
		return xerrors.New("parquet: page smaller than size of encoded levels")
	}

	buf = buf[lvlByteLen:]
	encoding := page.Encoding()

	if isDictIndexEncoding(encoding) {
		encoding = format.Encoding_RLE_DICTIONARY
	}

	if decoder, ok := c.decoders[encoding]; ok {
		c.curDecoder = decoder
	} else {
		switch encoding {
		case format.Encoding_RLE:
			if c.descr.PhysicalType() != parquet.Types.Boolean {
				return fmt.Errorf("parquet: only boolean supports RLE encoding, got %s", c.descr.PhysicalType())
			}
			fallthrough
		case format.Encoding_PLAIN,
			format.Encoding_DELTA_BYTE_ARRAY,
			format.Encoding_DELTA_LENGTH_BYTE_ARRAY,
			format.Encoding_DELTA_BINARY_PACKED:
			c.curDecoder = c.decoderTraits.Decoder(parquet.Encoding(encoding), c.descr, false, c.mem)
			c.decoders[encoding] = c.curDecoder
		case format.Encoding_RLE_DICTIONARY:
			return errors.New("parquet: dictionary page must be before data page")
		case format.Encoding_BYTE_STREAM_SPLIT:
			return fmt.Errorf("parquet: unsupported data encoding %s", encoding)
		default:
			return fmt.Errorf("parquet: unknown encoding type %s", encoding)
		}
	}

	c.curEncoding = encoding
	c.curDecoder.SetData(int(c.numBuffered), buf)
	return nil
}

// readDefinitionLevels decodes the definition levels from the page. It returns
// the total number of levels that were decoded (and thus populated in the
// passed in slice) and the number of physical values that exist to read
// (the number of levels that are equal to the max definition level).
//
// If the max definition level is 0, the assumption is that there are no nulls
// in the column and therefore no definition levels to read, so it will always
// return 0, 0.
func (c *columnChunkReader) readDefinitionLevels(levels []int16) (totalDecoded int, valuesToRead int64) {
	if c.descr.MaxDefinitionLevel() == 0 {
		return 0, 0
	}

	return c.definitionDecoder.Decode(levels)
}

// readRepetitionLevels decodes the repetition levels from the page and returns
// the total number of levels decoded (and thus populated in the passed in
// levels slice).
//
// If the max repetition level is 0, it is assumed there are no repetition
// levels, and thus it will always return 0.
func (c *columnChunkReader) readRepetitionLevels(levels []int16) int {
	if c.descr.MaxRepetitionLevel() == 0 {
		return 0
	}

	nlevels, _ := c.repetitionDecoder.Decode(levels)
	return nlevels
}
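// As an illustration of the level semantics above: for an optional (nullable)
// scalar column the max definition level is 1, so decoding the definition levels
//
//	[1, 0, 1, 1]
//
// reports 4 levels decoded but only 3 physical values to read; the 0 marks a
// null slot, which stores no value in the data page.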
// determineNumToRead reads the definition levels (and optionally populates the
// repetition levels) in order to determine how many values need to be read to
// fulfill this batch read.
//
// batchLen is the desired number of values to read. defLvls must be either nil
// (in which case an internal buffer will be used) or at least batchLen in length
// to be safe. repLvls should be either nil (in which case it is ignored) or at
// least batchLen in length to be safe.
//
// In the return values: ndefs is the number of definition levels that were
// actually read in, which will typically be the minimum of batchLen and
// numAvailValues; toRead is the number of physical values that should be read
// in based on the definition levels (the number of definition levels that were
// equal to the max definition level); and err is either nil or any error
// encountered.
func (c *columnChunkReader) determineNumToRead(batchLen int64, defLvls, repLvls []int16) (ndefs int, toRead int64, err error) {
	if !c.HasNext() {
		return 0, 0, c.err
	}

	size := utils.Min(batchLen, c.numBuffered-c.numDecoded)

	if c.descr.MaxDefinitionLevel() > 0 {
		if defLvls == nil {
			defLvls = c.getDefLvlBuffer(size)
		}
		ndefs, toRead = c.readDefinitionLevels(defLvls[:size])
	} else {
		toRead = size
	}

	if c.descr.MaxRepetitionLevel() > 0 && repLvls != nil {
		nreps := c.readRepetitionLevels(repLvls[:size])
		if defLvls != nil && ndefs != nreps {
			err = xerrors.New("parquet: number of decoded rep/def levels did not match")
		}
	}
	return
}
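// A worked example for determineNumToRead: with batchLen=100, an optional
// column (max definition level 1), 60 undecoded values buffered in the current
// page, and 10 of those slots null, size is min(100, 60) = 60, so ndefs is 60
// and toRead is 50: sixty definition levels are consumed but only fifty
// physical values need to be decoded.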
// skipValues skips nvalues values, using readFn as the function to read data
// and throw it away. If we can skip a whole page based on its buffered value
// counts, then we do so; otherwise we read from the page until we have skipped
// the desired number of values.
func (c *columnChunkReader) skipValues(nvalues int64, readFn func(batch int64, buf []byte) (int64, error)) (int64, error) {
	var err error
	toskip := nvalues
	for c.HasNext() && toskip > 0 {
		// if the number to skip is more than the number of undecoded values, skip the page
		if toskip > (c.numBuffered - c.numDecoded) {
			toskip -= c.numBuffered - c.numDecoded
			c.numDecoded = c.numBuffered
		} else {
			var (
				batchSize int64 = 1024
				valsRead  int64 = 0
			)

			// the scratch buffer is shrunk and returned to the pool when skipValues returns
			scratch := c.bufferPool.Get().(*memory.Buffer)
			defer func() {
				scratch.ResizeNoShrink(0)
				c.bufferPool.Put(scratch)
			}()
			bufMult := 1
			if c.descr.PhysicalType() == parquet.Types.Boolean {
				// for bools, BytesRequired returns 1 byte per 8 bools, but casting
				// []byte to []bool requires 1 byte per 1 bool
				bufMult = 8
			}
			scratch.Reserve(c.decoderTraits.BytesRequired(int(batchSize) * bufMult))

			for {
				batchSize = utils.Min(batchSize, toskip)
				valsRead, err = readFn(batchSize, scratch.Buf())
				toskip -= valsRead
				if valsRead <= 0 || toskip <= 0 || err != nil {
					break
				}
			}
		}
	}
	if c.err != nil {
		err = c.err
	}
	return nvalues - toskip, err
}

type readerFunc func(int64, int64) (int, error)

// readBatch is the base function for reading a batch of values. It will read
// until it either reads in batchSize values or it hits the end of the column
// chunk, which may involve reading multiple pages.
//
// totalLvls is the total number of levels which were read in, and thus the
// total number of definition levels and repetition levels which were populated
// (if they were non-nil). totalRead is the number of physical values that were
// read in (ie: the number of non-null values).
func (c *columnChunkReader) readBatch(batchSize int64, defLvls, repLvls []int16, readFn readerFunc) (totalLvls int64, totalRead int, err error) {
	var (
		read   int
		defs   []int16
		reps   []int16
		ndefs  int
		toRead int64
	)

	for c.HasNext() && totalLvls < batchSize && err == nil {
		if defLvls != nil {
			defs = defLvls[totalLvls:]
		}
		if repLvls != nil {
			reps = repLvls[totalLvls:]
		}
		ndefs, toRead, err = c.determineNumToRead(batchSize-totalLvls, defs, reps)
		if err != nil {
			return totalLvls, totalRead, err
		}

		read, err = readFn(int64(totalRead), toRead)
		// the total number of values processed here is the maximum of
		// the number of definition levels or the number of physical values read.
		// if this is a required field, ndefs will be 0 since there are no definition
		// levels stored with it and `read` will be the number of values, otherwise
		// we use ndefs since it will be equal to or greater than read.
		totalVals := int64(utils.Max(ndefs, read))
		c.consumeBufferedValues(totalVals)

		totalLvls += totalVals
		totalRead += read
	}
	return totalLvls, totalRead, err
}
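// To tie readBatch's accounting together with an example (illustrative only):
// reading batchSize=4 from an optional column whose current page holds the
// logical values 7, NULL, 8, 9 yields totalLvls=4 (four definition levels
// populated) and totalRead=3 (three physical values decoded into the output).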