github.com/apache/arrow/go/v10@v10.0.1/parquet/file/column_reader.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file

import (
	"fmt"
	"sync"

	"github.com/apache/arrow/go/v10/arrow/memory"
	"github.com/apache/arrow/go/v10/internal/utils"
	"github.com/apache/arrow/go/v10/parquet"
	"github.com/apache/arrow/go/v10/parquet/internal/encoding"
	"github.com/apache/arrow/go/v10/parquet/internal/encryption"
	format "github.com/apache/arrow/go/v10/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v10/parquet/schema"
	"golang.org/x/xerrors"
)

const (
	// 4 MB is the default maximum page header size
	defaultMaxPageHeaderSize = 4 * 1024 * 1024
	// 16 KB is the default expected page header size
	defaultPageHeaderSize = 16 * 1024
)

//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata column_reader_types.gen.go.tmpl

func isDictIndexEncoding(e format.Encoding) bool {
	return e == format.Encoding_RLE_DICTIONARY || e == format.Encoding_PLAIN_DICTIONARY
}

// CryptoContext is a context for keeping track of the current methods for decrypting.
// It keeps track of the row group and column numbers along with references to the
// decryptor objects.
type CryptoContext struct {
	StartDecryptWithDictionaryPage bool
	RowGroupOrdinal                int16
	ColumnOrdinal                  int16
	MetaDecryptor                  encryption.Decryptor
	DataDecryptor                  encryption.Decryptor
}
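// A minimal sketch of populating a CryptoContext for the first column of the
// first row group (hypothetical metaDecryptor/dataDecryptor values; how the
// decryptors are obtained depends on the file's encryption setup):
//
//	ctx := file.CryptoContext{
//		StartDecryptWithDictionaryPage: true,
//		RowGroupOrdinal:                0,
//		ColumnOrdinal:                  0,
//		MetaDecryptor:                  metaDecryptor,
//		DataDecryptor:                  dataDecryptor,
//	}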
// ColumnChunkReader is the basic interface for all column readers. It will use
// a page reader to read all the pages in a column chunk from a row group.
//
// To actually read out the column data, you need to convert to the properly
// typed ColumnChunkReader type such as *BooleanColumnChunkReader etc.
//
// Some things to clarify when working with column readers:
//
// "Values" refers to the physical data values in a data page.
//
// This is separate from the number of "rows" in a column and the total number
// of "elements" in a column because null values aren't stored physically in the
// data page but are represented via definition levels, so the number of values
// in a column can be less than the number of rows.
//
// The total number of "elements" in a column also differs because of potential
// repeated fields, where you can have multiple values in the page which
// together make up a single element (such as a list) or, depending on the repetition
// level and definition level, could represent an entire null list or just a null
// element inside of a list.
type ColumnChunkReader interface {
	// HasNext returns whether there is more data to be read in this column
	// and row group.
	HasNext() bool
	// Type returns the underlying physical type of the column
	Type() parquet.Type
	// Descriptor returns the column schema container
	Descriptor() *schema.Column
	// If HasNext returns false because of an error, this will return the error
	// it encountered. Otherwise this will be nil if it's just the end of the
	// column.
	Err() error
	// Skip buffered values
	consumeBufferedValues(int64)
	// number of available buffered values that have not been decoded yet.
	// When this returns 0, you're at the end of a page.
	numAvailValues() int64
	// read the definition levels and return the number of definitions,
	// and the number of values to be read (the number of def levels equal to
	// the max def level). It also populates the passed in slice, which should
	// be sized appropriately.
	readDefinitionLevels(levels []int16) (int, int64)
	// read the repetition levels and return the number of repetition levels read.
	// Also populates the passed in slice, which should be sized appropriately.
	readRepetitionLevels(levels []int16) int
	// a column is made up of potentially multiple pages across potentially multiple
	// row groups. A PageReader allows looping through the pages in a single row group.
	// When moving to another row group for reading, use setPageReader to re-use the
	// column reader for reading the pages of the new row group.
	pager() PageReader
	// set a page reader into the column reader so it can be reused.
	//
	// This will clear any current error in the reader, but it does not
	// automatically read the first page of the new page reader; that happens
	// on the next call to HasNext.
	setPageReader(PageReader)
}

type columnChunkReader struct {
	descr             *schema.Column
	rdr               PageReader
	repetitionDecoder encoding.LevelDecoder
	definitionDecoder encoding.LevelDecoder

	curPage     Page
	curEncoding format.Encoding
	curDecoder  encoding.TypedDecoder

	// number of currently buffered values in the current page
	numBuffered int64
	// the number of values we've decoded so far
	numDecoded int64
	mem        memory.Allocator
	bufferPool *sync.Pool

	decoders      map[format.Encoding]encoding.TypedDecoder
	decoderTraits encoding.DecoderTraits

	// is set when an error is encountered
	err          error
	defLvlBuffer []int16
}
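// A minimal sketch of a sync.Pool suitable for the bufferPool parameter of
// NewColumnReader below (the allocator choice is an assumption; per the
// contract in the doc comment, buffers are shrunk to zero length before
// being put back into the pool):
//
//	pool := &sync.Pool{
//		New: func() interface{} {
//			return memory.NewResizableBuffer(memory.DefaultAllocator)
//		},
//	}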
// NewColumnReader returns a column reader for the provided column, initialized
// with the given page reader that will provide the pages of data for this column.
// The type is determined from the column passed in.
//
// In addition to the page reader and allocator, a pointer to a shared sync.Pool
// is expected, which provides buffers for temporary usage to minimize allocations.
// The bufferPool should provide *memory.Buffer objects that can be resized as
// necessary; buffers should have `ResizeNoShrink(0)` called on them before being
// put back into the pool.
func NewColumnReader(descr *schema.Column, pageReader PageReader, mem memory.Allocator, bufferPool *sync.Pool) ColumnChunkReader {
	base := columnChunkReader{descr: descr, rdr: pageReader, mem: mem, decoders: make(map[format.Encoding]encoding.TypedDecoder), bufferPool: bufferPool}
	switch descr.PhysicalType() {
	case parquet.Types.FixedLenByteArray:
		base.decoderTraits = &encoding.FixedLenByteArrayDecoderTraits
		return &FixedLenByteArrayColumnChunkReader{base}
	case parquet.Types.Float:
		base.decoderTraits = &encoding.Float32DecoderTraits
		return &Float32ColumnChunkReader{base}
	case parquet.Types.Double:
		base.decoderTraits = &encoding.Float64DecoderTraits
		return &Float64ColumnChunkReader{base}
	case parquet.Types.ByteArray:
		base.decoderTraits = &encoding.ByteArrayDecoderTraits
		return &ByteArrayColumnChunkReader{base}
	case parquet.Types.Int32:
		base.decoderTraits = &encoding.Int32DecoderTraits
		return &Int32ColumnChunkReader{base}
	case parquet.Types.Int64:
		base.decoderTraits = &encoding.Int64DecoderTraits
		return &Int64ColumnChunkReader{base}
	case parquet.Types.Int96:
		base.decoderTraits = &encoding.Int96DecoderTraits
		return &Int96ColumnChunkReader{base}
	case parquet.Types.Boolean:
		base.decoderTraits = &encoding.BooleanDecoderTraits
		return &BooleanColumnChunkReader{base}
	}
	return nil
}

func (c *columnChunkReader) Err() error                    { return c.err }
func (c *columnChunkReader) Type() parquet.Type            { return c.descr.PhysicalType() }
func (c *columnChunkReader) Descriptor() *schema.Column    { return c.descr }
func (c *columnChunkReader) consumeBufferedValues(n int64) { c.numDecoded += n }
func (c *columnChunkReader) numAvailValues() int64         { return c.numBuffered - c.numDecoded }
func (c *columnChunkReader) pager() PageReader             { return c.rdr }
func (c *columnChunkReader) setPageReader(rdr PageReader) {
	c.rdr, c.err = rdr, nil
	c.decoders = make(map[format.Encoding]encoding.TypedDecoder)
	c.numBuffered, c.numDecoded = 0, 0
}

func (c *columnChunkReader) getDefLvlBuffer(sz int64) []int16 {
	if int64(len(c.defLvlBuffer)) < sz {
		c.defLvlBuffer = make([]int16, sz)
		return c.defLvlBuffer
	}

	return c.defLvlBuffer[:sz]
}
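// A minimal usage sketch (an assumption about caller code: descr, pgr, and
// pool come from elsewhere, e.g. the file schema and a row group's page
// reader; the concrete type returned depends on the column's physical type):
//
//	rdr := file.NewColumnReader(descr, pgr, memory.DefaultAllocator, pool)
//	if i32, ok := rdr.(*file.Int32ColumnChunkReader); ok {
//		// i32 can now decode the column's values via its typed methods
//	}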
// HasNext returns whether there is more data to be read in this column
// and row group.
func (c *columnChunkReader) HasNext() bool {
	if c.numBuffered == 0 || c.numDecoded == c.numBuffered {
		return c.readNewPage() && c.numBuffered != 0
	}
	return true
}

func (c *columnChunkReader) configureDict(page *DictionaryPage) error {
	enc := page.encoding
	if enc == format.Encoding_PLAIN_DICTIONARY || enc == format.Encoding_PLAIN {
		enc = format.Encoding_RLE_DICTIONARY
	}

	if _, ok := c.decoders[enc]; ok {
		return xerrors.New("parquet: column chunk cannot have more than one dictionary.")
	}

	switch page.Encoding() {
	case format.Encoding_PLAIN, format.Encoding_PLAIN_DICTIONARY:
		dict := c.decoderTraits.Decoder(parquet.Encodings.Plain, c.descr, false, c.mem)
		dict.SetData(int(page.NumValues()), page.Data())

		decoder := c.decoderTraits.Decoder(parquet.Encodings.Plain, c.descr, true, c.mem).(encoding.DictDecoder)
		decoder.SetDict(dict)
		c.decoders[enc] = decoder
	default:
		return xerrors.New("parquet: dictionary index must be plain encoding")
	}

	c.curDecoder = c.decoders[enc]
	return nil
}

// read a new page from the page reader
func (c *columnChunkReader) readNewPage() bool {
	for c.rdr.Next() { // keep going until we get a data page
		c.curPage = c.rdr.Page()
		if c.curPage == nil {
			break
		}

		var lvlByteLen int64
		switch p := c.curPage.(type) {
		case *DictionaryPage:
			if err := c.configureDict(p); err != nil {
				c.err = err
				return false
			}
			continue
		case *DataPageV1:
			lvlByteLen, c.err = c.initLevelDecodersV1(p, p.repLvlEncoding, p.defLvlEncoding)
			if c.err != nil {
				return false
			}
		case *DataPageV2:
			lvlByteLen, c.err = c.initLevelDecodersV2(p)
			if c.err != nil {
				return false
			}
		default:
			// we can skip non-data pages
			continue
		}

		c.err = c.initDataDecoder(c.curPage, lvlByteLen)
		return c.err == nil
	}
	c.err = c.rdr.Err()
	return false
}

func (c *columnChunkReader) initLevelDecodersV2(page *DataPageV2) (int64, error) {
	c.numBuffered = int64(page.nvals)
	c.numDecoded = 0
	buf := page.Data()
	totalLvlLen := int64(page.repLvlByteLen) + int64(page.defLvlByteLen)

	if totalLvlLen > int64(len(buf)) {
		return totalLvlLen, xerrors.New("parquet: data page too small for levels (corrupt header?)")
	}

	if c.descr.MaxRepetitionLevel() > 0 {
		c.repetitionDecoder.SetDataV2(page.repLvlByteLen, c.descr.MaxRepetitionLevel(), int(c.numBuffered), buf)
	}
	// ARROW-17453: Some writers will write repetition levels even when
	// the max repetition level is 0, so we should respect the value
	// in the page header regardless of whether MaxRepetitionLevel is 0
	// or not.
	buf = buf[page.repLvlByteLen:]

	if c.descr.MaxDefinitionLevel() > 0 {
		c.definitionDecoder.SetDataV2(page.defLvlByteLen, c.descr.MaxDefinitionLevel(), int(c.numBuffered), buf)
	}

	return totalLvlLen, nil
}
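// For reference, a DataPageV2 buffer lays the levels out ahead of the encoded
// values, which is why the code above advances buf by repLvlByteLen and
// initDataDecoder later skips the full level length:
//
//	+-----------------------+-----------------------+----------------+
//	| repetition levels     | definition levels     | encoded values |
//	| (repLvlByteLen bytes) | (defLvlByteLen bytes) |                |
//	+-----------------------+-----------------------+----------------+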
func (c *columnChunkReader) initLevelDecodersV1(page *DataPageV1, repLvlEncoding, defLvlEncoding format.Encoding) (int64, error) {
	c.numBuffered = int64(page.nvals)
	c.numDecoded = 0

	buf := page.Data()
	maxSize := len(buf)
	levelsByteLen := int64(0)

	// Data page layout: Repetition Levels - Definition Levels - encoded values.
	// Levels are encoded as RLE or bit-packed.
	if c.descr.MaxRepetitionLevel() > 0 {
		repBytes, err := c.repetitionDecoder.SetData(parquet.Encoding(repLvlEncoding), c.descr.MaxRepetitionLevel(), int(c.numBuffered), buf)
		if err != nil {
			return levelsByteLen, err
		}
		buf = buf[repBytes:]
		maxSize -= repBytes
		levelsByteLen += int64(repBytes)
	}

	if c.descr.MaxDefinitionLevel() > 0 {
		defBytes, err := c.definitionDecoder.SetData(parquet.Encoding(defLvlEncoding), c.descr.MaxDefinitionLevel(), int(c.numBuffered), buf)
		if err != nil {
			return levelsByteLen, err
		}
		levelsByteLen += int64(defBytes)
		maxSize -= defBytes
	}

	return levelsByteLen, nil
}

func (c *columnChunkReader) initDataDecoder(page Page, lvlByteLen int64) error {
	buf := page.Data()
	if int64(len(buf)) < lvlByteLen {
		return xerrors.New("parquet: page smaller than size of encoded levels")
	}

	buf = buf[lvlByteLen:]
	encoding := page.Encoding()

	if isDictIndexEncoding(encoding) {
		encoding = format.Encoding_RLE_DICTIONARY
	}

	if decoder, ok := c.decoders[encoding]; ok {
		c.curDecoder = decoder
	} else {
		switch encoding {
		case format.Encoding_PLAIN,
			format.Encoding_DELTA_BYTE_ARRAY,
			format.Encoding_DELTA_LENGTH_BYTE_ARRAY,
			format.Encoding_DELTA_BINARY_PACKED:
			c.curDecoder = c.decoderTraits.Decoder(parquet.Encoding(encoding), c.descr, false, c.mem)
			c.decoders[encoding] = c.curDecoder
		case format.Encoding_RLE_DICTIONARY:
			return xerrors.New("parquet: dictionary page must be before data page")
		case format.Encoding_BYTE_STREAM_SPLIT:
			return fmt.Errorf("parquet: unsupported data encoding %s", encoding)
		default:
			return fmt.Errorf("parquet: unknown encoding type %s", encoding)
		}
	}

	c.curEncoding = encoding
	c.curDecoder.SetData(int(c.numBuffered), buf)
	return nil
}

// readDefinitionLevels decodes the definition levels from the page. It returns
// the total number of levels that were decoded (and thus populated in the
// passed in slice) and the number of physical values that exist to read (the
// number of levels that are equal to the max definition level).
//
// If the max definition level is 0, the assumption is that there are no nulls
// in the column and therefore no definition levels to read, so it will always
// return 0, 0.
func (c *columnChunkReader) readDefinitionLevels(levels []int16) (totalDecoded int, valuesToRead int64) {
	if c.descr.MaxDefinitionLevel() == 0 {
		return 0, 0
	}

	return c.definitionDecoder.Decode(levels)
}

// readRepetitionLevels decodes the repetition levels from the page and returns
// the total number of levels decoded (and thus populated in the passed in
// levels slice).
//
// If max repetition level is 0, it is assumed there are no repetition levels,
// and thus it will always return 0.
func (c *columnChunkReader) readRepetitionLevels(levels []int16) int {
	if c.descr.MaxRepetitionLevel() == 0 {
		return 0
	}

	nlevels, _ := c.repetitionDecoder.Decode(levels)
	return nlevels
}
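// A worked example of the level semantics (illustrative values, not taken from
// the source): for an optional INT32 column (max definition level 1) whose
// logical contents are [7, null, 9], the data page stores definition levels
// [1, 0, 1] and only the two physical values [7, 9]. readDefinitionLevels
// would then report 3 levels decoded and 2 physical values to read.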
// determineNumToRead reads the definition levels (and optionally populates the repetition levels)
// in order to determine how many values need to be read to fulfill this batch read.
//
// batchLen is the desired number of values to read. defLvls must be either nil (in which case
// an internal buffer will be used) or at least batchLen in length to be safe. repLvls should be
// either nil (in which case it is ignored) or at least batchLen in length to be safe.
//
// In the return values: ndefs is the number of definition levels that were actually read in,
// which will typically be the minimum of batchLen and numAvailValues; toRead is the number of
// physical values that should be read in based on the definition levels (the number of
// definition levels that were equal to maxDefinitionLevel); and err is either nil or any error
// encountered.
func (c *columnChunkReader) determineNumToRead(batchLen int64, defLvls, repLvls []int16) (ndefs int, toRead int64, err error) {
	if !c.HasNext() {
		return 0, 0, c.err
	}

	size := utils.Min(batchLen, c.numBuffered-c.numDecoded)

	if c.descr.MaxDefinitionLevel() > 0 {
		if defLvls == nil {
			defLvls = c.getDefLvlBuffer(size)
		}
		ndefs, toRead = c.readDefinitionLevels(defLvls[:size])
	} else {
		toRead = size
	}

	if c.descr.MaxRepetitionLevel() > 0 && repLvls != nil {
		nreps := c.readRepetitionLevels(repLvls[:size])
		if defLvls != nil && ndefs != nreps {
			err = xerrors.New("parquet: number of decoded rep/def levels did not match")
		}
	}
	return
}

// skipValues skips nvalues values, using readFn as the function to read the data and throw it
// away. If we can skip a whole page based on its metadata, then we do so; otherwise we read
// from the page until we have skipped the desired number of values.
func (c *columnChunkReader) skipValues(nvalues int64, readFn func(batch int64, buf []byte) (int64, error)) (int64, error) {
	var err error
	toskip := nvalues
	for c.HasNext() && toskip > 0 {
		// if the number to skip is more than the number of undecoded values, skip the page
		if toskip > (c.numBuffered - c.numDecoded) {
			toskip -= c.numBuffered - c.numDecoded
			c.numDecoded = c.numBuffered
		} else {
			var (
				batchSize int64 = 1024
				valsRead  int64 = 0
			)

			scratch := c.bufferPool.Get().(*memory.Buffer)
			defer func() {
				scratch.ResizeNoShrink(0)
				c.bufferPool.Put(scratch)
			}()
			bufMult := 1
			if c.descr.PhysicalType() == parquet.Types.Boolean {
				// for bools, BytesRequired returns 1 byte per 8 bools, but casting
				// []byte to []bool requires 1 byte per 1 bool
				bufMult = 8
			}
			scratch.Reserve(c.decoderTraits.BytesRequired(int(batchSize) * bufMult))

			for {
				batchSize = utils.Min(batchSize, toskip)
				valsRead, err = readFn(batchSize, scratch.Buf())
				toskip -= valsRead
				if valsRead <= 0 || toskip <= 0 || err != nil {
					break
				}
			}
		}
	}
	if c.err != nil {
		err = c.err
	}
	return nvalues - toskip, err
}

type readerFunc func(int64, int64) (int, error)
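// A minimal sketch of skipping through a column (an assumption about caller
// code: the generated typed readers expose a Skip method that funnels into
// skipValues above; i32 is a hypothetical *file.Int32ColumnChunkReader):
//
//	skipped, err := i32.Skip(100) // read and discard the next 100 values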
// readBatch is the base function for reading a batch of values. It will read until it either
// reads in batchSize values or hits the end of the column chunk, including reading multiple
// pages.
//
// totalLvls is the total number of levels which were read in, and thus would be the total
// number of definition levels and repetition levels which were populated (if they were
// non-nil). totalRead is the number of physical values that were read in (ie: the number of
// non-null values).
func (c *columnChunkReader) readBatch(batchSize int64, defLvls, repLvls []int16, readFn readerFunc) (totalLvls int64, totalRead int, err error) {
	var (
		read   int
		defs   []int16
		reps   []int16
		ndefs  int
		toRead int64
	)

	for c.HasNext() && totalLvls < batchSize && err == nil {
		if defLvls != nil {
			defs = defLvls[totalLvls:]
		}
		if repLvls != nil {
			reps = repLvls[totalLvls:]
		}
		ndefs, toRead, err = c.determineNumToRead(batchSize-totalLvls, defs, reps)
		if err != nil {
			return totalLvls, totalRead, err
		}

		read, err = readFn(int64(totalRead), toRead)
		// the total number of values processed here is the maximum of
		// the number of definition levels or the number of physical values read.
		// if this is a required field, ndefs will be 0 since there are no definition
		// levels stored with it and `read` will be the number of values, otherwise
		// we use ndefs since it will be equal to or greater than read.
		totalVals := int64(utils.MaxInt(ndefs, read))
		c.consumeBufferedValues(totalVals)

		totalLvls += totalVals
		totalRead += read
	}
	return totalLvls, totalRead, err
}
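// A minimal sketch of interpreting the results of the generated typed wrappers
// built on readBatch (an assumption about caller code; i32 is the hypothetical
// *file.Int32ColumnChunkReader from the sketches above):
//
//	values := make([]int32, 1024)
//	defLvls := make([]int16, 1024)
//	total, read, err := i32.ReadBatch(1024, values, defLvls, nil)
//	if err != nil {
//		// handle the error
//	}
//	// total is how many levels/slots were processed; read is how many non-null
//	// values landed in values[:read]. Slots where defLvls[i] is less than the
//	// column's max definition level are nulls.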