github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/file.go (about) 1 package parquet 2 3 import ( 4 "bufio" 5 "encoding/binary" 6 "fmt" 7 "hash/crc32" 8 "io" 9 "sort" 10 "sync" 11 12 "github.com/segmentio/encoding/thrift" 13 "github.com/vc42/parquet-go/format" 14 ) 15 16 const ( 17 defaultDictBufferSize = 8192 18 defaultReadBufferSize = 4096 19 defaultLevelBufferSize = 1024 20 ) 21 22 // File represents a parquet file. The layout of a Parquet file can be found 23 // here: https://github.com/apache/parquet-format#file-format 24 type File struct { 25 metadata format.FileMetaData 26 protocol thrift.CompactProtocol 27 reader io.ReaderAt 28 size int64 29 schema *Schema 30 root *Column 31 columnIndexes []format.ColumnIndex 32 offsetIndexes []format.OffsetIndex 33 rowGroups []RowGroup 34 } 35 36 // OpenFile opens a parquet file and reads the content between offset 0 and the given 37 // size in r. 38 // 39 // Only the parquet magic bytes and footer are read, column chunks and other 40 // parts of the file are left untouched; this means that successfully opening 41 // a file does not validate that the pages have valid checksums. 42 func OpenFile(r io.ReaderAt, size int64, options ...FileOption) (*File, error) { 43 b := make([]byte, 8) 44 f := &File{reader: r, size: size} 45 c, err := NewFileConfig(options...) 46 if err != nil { 47 return nil, err 48 } 49 50 if _, err := r.ReadAt(b[:4], 0); err != nil { 51 return nil, fmt.Errorf("reading magic header of parquet file: %w", err) 52 } 53 if string(b[:4]) != "PAR1" { 54 return nil, fmt.Errorf("invalid magic header of parquet file: %q", b[:4]) 55 } 56 57 if cast, ok := f.reader.(interface{ SetMagicFooterSection(offset, length int64) }); ok { 58 cast.SetMagicFooterSection(size-8, 8) 59 } 60 if _, err := r.ReadAt(b[:8], size-8); err != nil { 61 return nil, fmt.Errorf("reading magic footer of parquet file: %w", err) 62 } 63 if string(b[4:8]) != "PAR1" { 64 return nil, fmt.Errorf("invalid magic footer of parquet file: %q", b[4:8]) 65 } 66 67 footerSize := int64(binary.LittleEndian.Uint32(b[:4])) 68 footerData := make([]byte, footerSize) 69 70 if cast, ok := f.reader.(interface{ SetFooterSection(offset, length int64) }); ok { 71 cast.SetFooterSection(size-(footerSize+8), footerSize) 72 } 73 if _, err := f.reader.ReadAt(footerData, size-(footerSize+8)); err != nil { 74 return nil, fmt.Errorf("reading footer of parquet file: %w", err) 75 } 76 if err := thrift.Unmarshal(&f.protocol, footerData, &f.metadata); err != nil { 77 return nil, fmt.Errorf("reading parquet file metadata: %w", err) 78 } 79 if len(f.metadata.Schema) == 0 { 80 return nil, ErrMissingRootColumn 81 } 82 83 if !c.SkipPageIndex { 84 if f.columnIndexes, f.offsetIndexes, err = f.ReadPageIndex(); err != nil { 85 return nil, fmt.Errorf("reading page index of parquet file: %w", err) 86 } 87 } 88 89 if f.root, err = openColumns(f); err != nil { 90 return nil, fmt.Errorf("opening columns of parquet file: %w", err) 91 } 92 93 schema := NewSchema(f.root.Name(), f.root) 94 columns := make([]*Column, 0, numLeafColumnsOf(f.root)) 95 f.schema = schema 96 f.root.forEachLeaf(func(c *Column) { columns = append(columns, c) }) 97 98 rowGroups := make([]fileRowGroup, len(f.metadata.RowGroups)) 99 for i := range rowGroups { 100 rowGroups[i].init(f, schema, columns, &f.metadata.RowGroups[i]) 101 } 102 f.rowGroups = make([]RowGroup, len(rowGroups)) 103 for i := range rowGroups { 104 f.rowGroups[i] = &rowGroups[i] 105 } 106 107 if !c.SkipBloomFilters { 108 h := format.BloomFilterHeader{} 109 p := thrift.CompactProtocol{} 110 s := io.NewSectionReader(r, 0, size) 111 d := thrift.NewDecoder(p.NewReader(s)) 112 113 for i := range rowGroups { 114 g := &rowGroups[i] 115 116 for j := range g.columns { 117 c := g.columns[j].(*fileColumnChunk) 118 119 if offset := c.chunk.MetaData.BloomFilterOffset; offset > 0 { 120 s.Seek(offset, io.SeekStart) 121 h = format.BloomFilterHeader{} 122 if err := d.Decode(&h); err != nil { 123 return nil, err 124 } 125 offset, _ = s.Seek(0, io.SeekCurrent) 126 if cast, ok := r.(interface{ SetBloomFilterSection(offset, length int64) }); ok { 127 bloomFilterOffset := c.chunk.MetaData.BloomFilterOffset 128 bloomFilterLength := (offset - bloomFilterOffset) + int64(h.NumBytes) 129 cast.SetBloomFilterSection(bloomFilterOffset, bloomFilterLength) 130 } 131 132 c.bloomFilter = newBloomFilter(r, offset, &h) 133 } 134 } 135 } 136 } 137 138 sortKeyValueMetadata(f.metadata.KeyValueMetadata) 139 return f, nil 140 } 141 142 // ReadPageIndex reads the page index section of the parquet file f. 143 // 144 // If the file did not contain a page index, the method returns two empty slices 145 // and a nil error. 146 // 147 // Only leaf columns have indexes, the returned indexes are arranged using the 148 // following layout: 149 // 150 // + -------------- + 151 // | col 0: chunk 0 | 152 // + -------------- + 153 // | col 1: chunk 0 | 154 // + -------------- + 155 // | ... | 156 // + -------------- + 157 // | col 0: chunk 1 | 158 // + -------------- + 159 // | col 1: chunk 1 | 160 // + -------------- + 161 // | ... | 162 // + -------------- + 163 // 164 // This method is useful in combination with the SkipPageIndex option to delay 165 // reading the page index section until after the file was opened. Note that in 166 // this case the page index is not cached within the file, programs are expected 167 // to make use of independently from the parquet package. 168 func (f *File) ReadPageIndex() ([]format.ColumnIndex, []format.OffsetIndex, error) { 169 columnIndexOffset := f.metadata.RowGroups[0].Columns[0].ColumnIndexOffset 170 offsetIndexOffset := f.metadata.RowGroups[0].Columns[0].OffsetIndexOffset 171 columnIndexLength := int64(0) 172 offsetIndexLength := int64(0) 173 174 if columnIndexOffset == 0 || offsetIndexOffset == 0 { 175 return nil, nil, nil 176 } 177 178 forEachColumnChunk := func(do func(int, int, *format.ColumnChunk) error) error { 179 for i := range f.metadata.RowGroups { 180 for j := range f.metadata.RowGroups[i].Columns { 181 c := &f.metadata.RowGroups[i].Columns[j] 182 if err := do(i, j, c); err != nil { 183 return err 184 } 185 } 186 } 187 return nil 188 } 189 190 forEachColumnChunk(func(_, _ int, c *format.ColumnChunk) error { 191 columnIndexLength += int64(c.ColumnIndexLength) 192 offsetIndexLength += int64(c.OffsetIndexLength) 193 return nil 194 }) 195 196 numRowGroups := len(f.metadata.RowGroups) 197 numColumns := len(f.metadata.RowGroups[0].Columns) 198 numColumnChunks := numRowGroups * numColumns 199 200 columnIndexes := make([]format.ColumnIndex, numColumnChunks) 201 offsetIndexes := make([]format.OffsetIndex, numColumnChunks) 202 indexBuffer := make([]byte, max(int(columnIndexLength), int(offsetIndexLength))) 203 204 if columnIndexOffset > 0 { 205 columnIndexData := indexBuffer[:columnIndexLength] 206 207 if cast, ok := f.reader.(interface{ SetColumnIndexSection(offset, length int64) }); ok { 208 cast.SetColumnIndexSection(columnIndexOffset, columnIndexLength) 209 } 210 if _, err := f.reader.ReadAt(columnIndexData, columnIndexOffset); err != nil { 211 return nil, nil, fmt.Errorf("reading %d bytes column index at offset %d: %w", columnIndexLength, columnIndexOffset, err) 212 } 213 214 err := forEachColumnChunk(func(i, j int, c *format.ColumnChunk) error { 215 offset := c.ColumnIndexOffset - columnIndexOffset 216 length := int64(c.ColumnIndexLength) 217 buffer := columnIndexData[offset : offset+length] 218 if err := thrift.Unmarshal(&f.protocol, buffer, &columnIndexes[(i*numColumns)+j]); err != nil { 219 return fmt.Errorf("decoding column index: rowGroup=%d columnChunk=%d/%d: %w", i, j, numColumns, err) 220 } 221 return nil 222 }) 223 if err != nil { 224 return nil, nil, err 225 } 226 } 227 228 if offsetIndexOffset > 0 { 229 offsetIndexData := indexBuffer[:offsetIndexLength] 230 231 if cast, ok := f.reader.(interface{ SetOffsetIndexSection(offset, length int64) }); ok { 232 cast.SetOffsetIndexSection(offsetIndexOffset, offsetIndexLength) 233 } 234 if _, err := f.reader.ReadAt(offsetIndexData, offsetIndexOffset); err != nil { 235 return nil, nil, fmt.Errorf("reading %d bytes offset index at offset %d: %w", offsetIndexLength, offsetIndexOffset, err) 236 } 237 238 err := forEachColumnChunk(func(i, j int, c *format.ColumnChunk) error { 239 offset := c.OffsetIndexOffset - offsetIndexOffset 240 length := int64(c.OffsetIndexLength) 241 buffer := offsetIndexData[offset : offset+length] 242 if err := thrift.Unmarshal(&f.protocol, buffer, &offsetIndexes[(i*numColumns)+j]); err != nil { 243 return fmt.Errorf("decoding column index: rowGroup=%d columnChunk=%d/%d: %w", i, j, numColumns, err) 244 } 245 return nil 246 }) 247 if err != nil { 248 return nil, nil, err 249 } 250 } 251 252 return columnIndexes, offsetIndexes, nil 253 } 254 255 // NumRows returns the number of rows in the file. 256 func (f *File) NumRows() int64 { return f.metadata.NumRows } 257 258 // RowGroups returns the list of row group in the file. 259 func (f *File) RowGroups() []RowGroup { return f.rowGroups } 260 261 // Root returns the root column of f. 262 func (f *File) Root() *Column { return f.root } 263 264 // Schema returns the schema of f. 265 func (f *File) Schema() *Schema { return f.schema } 266 267 // Metadata returns the metadata of f. 268 func (f *File) Metadata() *format.FileMetaData { return &f.metadata } 269 270 // Size returns the size of f (in bytes). 271 func (f *File) Size() int64 { return f.size } 272 273 // ReadAt reads bytes into b from f at the given offset. 274 // 275 // The method satisfies the io.ReaderAt interface. 276 func (f *File) ReadAt(b []byte, off int64) (int, error) { 277 if off < 0 || off >= f.size { 278 return 0, io.EOF 279 } 280 281 if limit := f.size - off; limit < int64(len(b)) { 282 n, err := f.reader.ReadAt(b[:limit], off) 283 if err == nil { 284 err = io.EOF 285 } 286 return n, err 287 } 288 289 return f.reader.ReadAt(b, off) 290 } 291 292 // ColumnIndexes returns the page index of the parquet file f. 293 // 294 // If the file did not contain a column index, the method returns an empty slice 295 // and nil error. 296 func (f *File) ColumnIndexes() []format.ColumnIndex { return f.columnIndexes } 297 298 // OffsetIndexes returns the page index of the parquet file f. 299 // 300 // If the file did not contain an offset index, the method returns an empty 301 // slice and nil error. 302 func (f *File) OffsetIndexes() []format.OffsetIndex { return f.offsetIndexes } 303 304 // Lookup returns the value associated with the given key in the file key/value 305 // metadata. 306 // 307 // The ok boolean will be true if the key was found, false otherwise. 308 func (f *File) Lookup(key string) (value string, ok bool) { 309 return lookupKeyValueMetadata(f.metadata.KeyValueMetadata, key) 310 } 311 312 func (f *File) hasIndexes() bool { 313 return f.columnIndexes != nil && f.offsetIndexes != nil 314 } 315 316 var _ io.ReaderAt = (*File)(nil) 317 318 func sortKeyValueMetadata(keyValueMetadata []format.KeyValue) { 319 sort.Slice(keyValueMetadata, func(i, j int) bool { 320 switch { 321 case keyValueMetadata[i].Key < keyValueMetadata[j].Key: 322 return true 323 case keyValueMetadata[i].Key > keyValueMetadata[j].Key: 324 return false 325 default: 326 return keyValueMetadata[i].Value < keyValueMetadata[j].Value 327 } 328 }) 329 } 330 331 func lookupKeyValueMetadata(keyValueMetadata []format.KeyValue, key string) (value string, ok bool) { 332 i := sort.Search(len(keyValueMetadata), func(i int) bool { 333 return keyValueMetadata[i].Key >= key 334 }) 335 if i == len(keyValueMetadata) || keyValueMetadata[i].Key != key { 336 return "", false 337 } 338 return keyValueMetadata[i].Value, true 339 } 340 341 type fileRowGroup struct { 342 schema *Schema 343 rowGroup *format.RowGroup 344 columns []ColumnChunk 345 sorting []SortingColumn 346 } 347 348 func (g *fileRowGroup) init(file *File, schema *Schema, columns []*Column, rowGroup *format.RowGroup) { 349 g.schema = schema 350 g.rowGroup = rowGroup 351 g.columns = make([]ColumnChunk, len(rowGroup.Columns)) 352 g.sorting = make([]SortingColumn, len(rowGroup.SortingColumns)) 353 fileColumnChunks := make([]fileColumnChunk, len(rowGroup.Columns)) 354 355 for i := range g.columns { 356 fileColumnChunks[i] = fileColumnChunk{ 357 file: file, 358 column: columns[i], 359 rowGroup: rowGroup, 360 chunk: &rowGroup.Columns[i], 361 } 362 363 if file.hasIndexes() { 364 j := (int(rowGroup.Ordinal) * len(columns)) + i 365 fileColumnChunks[i].columnIndex = &file.columnIndexes[j] 366 fileColumnChunks[i].offsetIndex = &file.offsetIndexes[j] 367 } 368 369 g.columns[i] = &fileColumnChunks[i] 370 } 371 372 for i := range g.sorting { 373 g.sorting[i] = &fileSortingColumn{ 374 column: columns[rowGroup.SortingColumns[i].ColumnIdx], 375 descending: rowGroup.SortingColumns[i].Descending, 376 nullsFirst: rowGroup.SortingColumns[i].NullsFirst, 377 } 378 } 379 } 380 381 func (g *fileRowGroup) Schema() *Schema { return g.schema } 382 func (g *fileRowGroup) NumRows() int64 { return g.rowGroup.NumRows } 383 func (g *fileRowGroup) ColumnChunks() []ColumnChunk { return g.columns } 384 func (g *fileRowGroup) SortingColumns() []SortingColumn { return g.sorting } 385 func (g *fileRowGroup) Rows() Rows { return &rowGroupRows{rowGroup: g} } 386 387 type fileSortingColumn struct { 388 column *Column 389 descending bool 390 nullsFirst bool 391 } 392 393 func (s *fileSortingColumn) Path() []string { return s.column.Path() } 394 func (s *fileSortingColumn) Descending() bool { return s.descending } 395 func (s *fileSortingColumn) NullsFirst() bool { return s.nullsFirst } 396 397 type fileColumnChunk struct { 398 file *File 399 column *Column 400 bloomFilter *bloomFilter 401 rowGroup *format.RowGroup 402 columnIndex *format.ColumnIndex 403 offsetIndex *format.OffsetIndex 404 chunk *format.ColumnChunk 405 } 406 407 func (c *fileColumnChunk) Type() Type { 408 return c.column.Type() 409 } 410 411 func (c *fileColumnChunk) Column() int { 412 return int(c.column.Index()) 413 } 414 415 func (c *fileColumnChunk) Pages() Pages { 416 r := new(filePages) 417 r.init(c) 418 return r 419 } 420 421 func (c *fileColumnChunk) ColumnIndex() ColumnIndex { 422 if c.columnIndex == nil { 423 return nil 424 } 425 return fileColumnIndex{c} 426 } 427 428 func (c *fileColumnChunk) OffsetIndex() OffsetIndex { 429 if c.offsetIndex == nil { 430 return nil 431 } 432 return (*fileOffsetIndex)(c.offsetIndex) 433 } 434 435 func (c *fileColumnChunk) BloomFilter() BloomFilter { 436 if c.bloomFilter == nil { 437 return nil 438 } 439 return c.bloomFilter 440 } 441 442 func (c *fileColumnChunk) NumValues() int64 { 443 return c.chunk.MetaData.NumValues 444 } 445 446 type filePages struct { 447 chunk *fileColumnChunk 448 dictPage *dictPage 449 dataPage *dataPage 450 rbuf *bufio.Reader 451 section io.SectionReader 452 453 protocol thrift.CompactProtocol 454 decoder thrift.Decoder 455 456 baseOffset int64 457 dataOffset int64 458 dictOffset int64 459 index int 460 skip int64 461 } 462 463 func (f *filePages) init(c *fileColumnChunk) { 464 f.dataPage = acquireDataPage() 465 f.chunk = c 466 f.baseOffset = c.chunk.MetaData.DataPageOffset 467 f.dataOffset = f.baseOffset 468 469 if c.chunk.MetaData.DictionaryPageOffset != 0 { 470 f.baseOffset = c.chunk.MetaData.DictionaryPageOffset 471 f.dictOffset = f.baseOffset 472 } 473 474 f.section = *io.NewSectionReader(c.file, f.baseOffset, c.chunk.MetaData.TotalCompressedSize) 475 f.rbuf = acquireReadBuffer(&f.section) 476 f.decoder.Reset(f.protocol.NewReader(f.rbuf)) 477 } 478 479 func (f *filePages) ReadPage() (Page, error) { 480 if f.chunk == nil { 481 return nil, io.EOF 482 } 483 484 for { 485 header := new(format.PageHeader) 486 if err := f.decoder.Decode(header); err != nil { 487 return nil, err 488 } 489 if err := f.readPage(header, f.dataPage, f.rbuf); err != nil { 490 return nil, err 491 } 492 493 var page Page 494 var err error 495 496 switch header.Type { 497 case format.DataPageV2: 498 page, err = f.readDataPageV2(header) 499 case format.DataPage: 500 page, err = f.readDataPageV1(header) 501 case format.DictionaryPage: 502 // Sometimes parquet files do not have the dictionary page offset 503 // recorded in the column metadata. We account for this by lazily 504 // reading dictionary pages when we encounter them. 505 err = f.readDictionaryPage(header, f.dataPage) 506 default: 507 err = fmt.Errorf("cannot read values of type %s from page", header.Type) 508 } 509 510 if err != nil { 511 return nil, fmt.Errorf("decoding page %d of column %q: %w", f.index, f.columnPath(), err) 512 } 513 514 if page != nil { 515 f.index++ 516 if f.skip == 0 { 517 return page, nil 518 } 519 520 // TODO: what about pages that don't embed the number of rows? 521 // (data page v1 with no offset index in the column chunk). 522 numRows := page.NumRows() 523 if numRows > f.skip { 524 seek := f.skip 525 f.skip = 0 526 if seek > 0 { 527 page = page.Buffer().Slice(seek, numRows) 528 } 529 return page, nil 530 } 531 532 f.skip -= numRows 533 } 534 } 535 } 536 537 func (f *filePages) readDictionary() error { 538 chunk := io.NewSectionReader(f.chunk.file, f.baseOffset, f.chunk.chunk.MetaData.TotalCompressedSize) 539 rbuf := acquireReadBuffer(chunk) 540 defer releaseReadBuffer(rbuf) 541 542 decoder := thrift.NewDecoder(f.protocol.NewReader(rbuf)) 543 header := new(format.PageHeader) 544 545 if err := decoder.Decode(header); err != nil { 546 return err 547 } 548 549 page := acquireDataPage() 550 defer releaseDataPage(page) 551 552 if err := f.readPage(header, page, rbuf); err != nil { 553 return err 554 } 555 556 return f.readDictionaryPage(header, page) 557 } 558 559 func (f *filePages) readDictionaryPage(header *format.PageHeader, page *dataPage) (err error) { 560 if header.DictionaryPageHeader == nil { 561 return ErrMissingPageHeader 562 } 563 f.dictPage, _ = dictPagePool.Get().(*dictPage) 564 if f.dictPage == nil { 565 f.dictPage = new(dictPage) 566 } 567 f.dataPage.dictionary, err = f.chunk.column.decodeDictionary( 568 DictionaryPageHeader{header.DictionaryPageHeader}, 569 page, 570 f.dictPage, 571 ) 572 return err 573 } 574 575 func (f *filePages) readDataPageV1(header *format.PageHeader) (Page, error) { 576 if header.DataPageHeader == nil { 577 return nil, ErrMissingPageHeader 578 } 579 if isDictionaryFormat(header.DataPageHeader.Encoding) && f.dataPage.dictionary == nil { 580 if err := f.readDictionary(); err != nil { 581 return nil, err 582 } 583 } 584 return f.chunk.column.decodeDataPageV1(DataPageHeaderV1{header.DataPageHeader}, f.dataPage) 585 } 586 587 func (f *filePages) readDataPageV2(header *format.PageHeader) (Page, error) { 588 if header.DataPageHeaderV2 == nil { 589 return nil, ErrMissingPageHeader 590 } 591 if isDictionaryFormat(header.DataPageHeaderV2.Encoding) && f.dataPage.dictionary == nil { 592 // If the program seeked to a row passed the first page, the dictionary 593 // page may not have been seen, in which case we have to lazily load it 594 // from the beginning of column chunk. 595 if err := f.readDictionary(); err != nil { 596 return nil, err 597 } 598 } 599 return f.chunk.column.decodeDataPageV2(DataPageHeaderV2{header.DataPageHeaderV2}, f.dataPage) 600 } 601 602 func (f *filePages) readPage(header *format.PageHeader, page *dataPage, reader *bufio.Reader) error { 603 compressedPageSize, uncompressedPageSize := int(header.CompressedPageSize), int(header.UncompressedPageSize) 604 605 if cap(page.data) < compressedPageSize { 606 page.data = make([]byte, compressedPageSize) 607 } else { 608 page.data = page.data[:compressedPageSize] 609 } 610 if cap(page.values) < uncompressedPageSize { 611 page.values = make([]byte, 0, uncompressedPageSize) 612 } 613 614 if _, err := io.ReadFull(reader, page.data); err != nil { 615 return err 616 } 617 618 if header.CRC != 0 { 619 headerChecksum := uint32(header.CRC) 620 bufferChecksum := crc32.ChecksumIEEE(page.data) 621 622 if headerChecksum != bufferChecksum { 623 // The parquet specs indicate that corruption errors could be 624 // handled gracefully by skipping pages, tho this may not always 625 // be practical. Depending on how the pages are consumed, 626 // missing rows may cause unpredictable behaviors in algorithms. 627 // 628 // For now, we assume these errors to be fatal, but we may 629 // revisit later and improve error handling to be more resilient 630 // to data corruption. 631 return fmt.Errorf("crc32 checksum mismatch in page of column %q: want=0x%08X got=0x%08X: %w", 632 f.columnPath(), 633 headerChecksum, 634 bufferChecksum, 635 ErrCorrupted, 636 ) 637 } 638 } 639 640 return nil 641 } 642 643 func (f *filePages) SeekToRow(rowIndex int64) (err error) { 644 if f.chunk == nil { 645 return io.ErrClosedPipe 646 } 647 if f.chunk.offsetIndex == nil { 648 _, err = f.section.Seek(f.dataOffset-f.baseOffset, io.SeekStart) 649 f.skip = rowIndex 650 f.index = 0 651 if f.dictOffset > 0 { 652 f.index = 1 653 } 654 } else { 655 pages := f.chunk.offsetIndex.PageLocations 656 index := sort.Search(len(pages), func(i int) bool { 657 return pages[i].FirstRowIndex > rowIndex 658 }) - 1 659 if index < 0 { 660 return ErrSeekOutOfRange 661 } 662 _, err = f.section.Seek(pages[index].Offset-f.baseOffset, io.SeekStart) 663 f.skip = rowIndex - pages[index].FirstRowIndex 664 f.index = index 665 } 666 f.rbuf.Reset(&f.section) 667 return err 668 } 669 670 func (f *filePages) Close() error { 671 releaseDictPage(f.dictPage) 672 releaseDataPage(f.dataPage) 673 releaseReadBuffer(f.rbuf) 674 f.chunk = nil 675 f.dictPage = nil 676 f.dataPage = nil 677 f.section = io.SectionReader{} 678 f.rbuf = nil 679 f.baseOffset = 0 680 f.dataOffset = 0 681 f.dictOffset = 0 682 f.index = 0 683 f.skip = 0 684 return nil 685 } 686 687 func (f *filePages) columnPath() columnPath { 688 return columnPath(f.chunk.column.Path()) 689 } 690 691 var ( 692 dictPagePool sync.Pool // *dictPage 693 dataPagePool sync.Pool // *dataPage 694 readBufferPool sync.Pool // *bufio.Reader 695 ) 696 697 func acquireDictPage() *dictPage { 698 p, _ := dictPagePool.Get().(*dictPage) 699 if p == nil { 700 p = new(dictPage) 701 } 702 return p 703 } 704 705 func releaseDictPage(p *dictPage) { 706 if p != nil { 707 p.reset() 708 dictPagePool.Put(p) 709 } 710 } 711 712 func acquireDataPage() *dataPage { 713 p, _ := dataPagePool.Get().(*dataPage) 714 if p == nil { 715 p = new(dataPage) 716 } 717 return p 718 } 719 720 func releaseDataPage(p *dataPage) { 721 if p != nil { 722 p.reset() 723 dataPagePool.Put(p) 724 } 725 } 726 727 func acquireReadBuffer(r io.Reader) *bufio.Reader { 728 b, _ := readBufferPool.Get().(*bufio.Reader) 729 if b == nil { 730 b = bufio.NewReaderSize(r, defaultReadBufferSize) 731 } else { 732 b.Reset(r) 733 } 734 return b 735 } 736 737 func releaseReadBuffer(b *bufio.Reader) { 738 if b != nil { 739 b.Reset(nil) 740 readBufferPool.Put(b) 741 } 742 }