/*
 * Minio Cloud Storage, (C) 2018 Minio, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package parquet

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"math"
	"strings"

	"git.apache.org/thrift.git/lib/go/thrift"

	"storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
)

// getBitWidth - returns the number of bits required to represent num, e.g.
//
//	num  | width
//	-----|------
//	  0  |  0
//	  1  |  1
//	  2  |  2
//	  3  |  2
//	  4  |  3
//	  5  |  3
//	 ... | ...
func getBitWidth(num uint64) (width uint64) {
	for ; num != 0; num >>= 1 {
		width++
	}

	return width
}
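// For reference, getBitWidth behaves like bits.Len64 from the standard
// library's math/bits package: both map 5 (0b101) to 3 and 0 to 0.
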
// getMaxDefLevel - get maximum definition level.
func getMaxDefLevel(nameIndexMap map[string]int, schemaElements []*parquet.SchemaElement, path []string) (v int) {
	for i := 1; i <= len(path); i++ {
		name := strings.Join(path[:i], ".")
		if index, ok := nameIndexMap[name]; ok {
			if schemaElements[index].GetRepetitionType() != parquet.FieldRepetitionType_REQUIRED {
				v++
			}
		}
	}

	return v
}

// getMaxRepLevel - get maximum repetition level.
func getMaxRepLevel(nameIndexMap map[string]int, schemaElements []*parquet.SchemaElement, path []string) (v int) {
	for i := 1; i <= len(path); i++ {
		name := strings.Join(path[:i], ".")
		if index, ok := nameIndexMap[name]; ok {
			if schemaElements[index].GetRepetitionType() == parquet.FieldRepetitionType_REPEATED {
				v++
			}
		}
	}

	return v
}

// readPageHeader - reads a thrift compact-encoded page header from reader.
func readPageHeader(reader *thrift.TBufferedTransport) (*parquet.PageHeader, error) {
	pageHeader := parquet.NewPageHeader()
	if err := pageHeader.Read(thrift.NewTCompactProtocol(reader)); err != nil {
		return nil, err
	}

	return pageHeader, nil
}

// readPage - reads the next page of a column chunk from thriftReader. It
// returns the page along with the number of values (definition levels) and
// rows it contains; dictionary pages report zero for both counts.
func readPage(
	thriftReader *thrift.TBufferedTransport,
	metadata *parquet.ColumnMetaData,
	columnNameIndexMap map[string]int,
	schemaElements []*parquet.SchemaElement,
) (page *page, definitionLevels, numRows int64, err error) {

	pageHeader, err := readPageHeader(thriftReader)
	if err != nil {
		return nil, 0, 0, err
	}

	read := func() (data []byte, err error) {
		var repLevelsLen, defLevelsLen int32
		var repLevelsBuf, defLevelsBuf []byte

		if pageHeader.GetType() == parquet.PageType_DATA_PAGE_V2 {
			if pageHeader.DataPageHeaderV2 == nil {
				return nil, errors.New("parquet: Header not set")
			}
			repLevelsLen = pageHeader.DataPageHeaderV2.GetRepetitionLevelsByteLength()
			repLevelsBuf = make([]byte, repLevelsLen)

			n, err := io.ReadFull(thriftReader, repLevelsBuf)
			if err != nil {
				return nil, err
			}
			if n != int(repLevelsLen) {
				return nil, fmt.Errorf("expected parquet header repetition levels %d, got %d", repLevelsLen, n)
			}

			defLevelsLen = pageHeader.DataPageHeaderV2.GetDefinitionLevelsByteLength()
			defLevelsBuf = make([]byte, defLevelsLen)

			n, err = io.ReadFull(thriftReader, defLevelsBuf)
			if err != nil {
				return nil, err
			}
			if n != int(defLevelsLen) {
				return nil, fmt.Errorf("expected parquet header definition levels %d, got %d", defLevelsLen, n)
			}
		}

		dbLen := pageHeader.GetCompressedPageSize() - repLevelsLen - defLevelsLen
		if dbLen < 0 {
			return nil, errors.New("parquet: negative data length")
		}

		dataBuf := make([]byte, dbLen)
		n, err := io.ReadFull(thriftReader, dataBuf)
		if err != nil {
			return nil, err
		}
		if n != int(dbLen) {
			return nil, fmt.Errorf("expected parquet data buffer %d, got %d", dbLen, n)
		}

		if dataBuf, err = compressionCodec(metadata.GetCodec()).uncompress(dataBuf); err != nil {
			return nil, err
		}

		if repLevelsLen == 0 && defLevelsLen == 0 {
			return dataBuf, nil
		}

		if repLevelsLen > 0 {
			data = append(data, uint32ToBytes(uint32(repLevelsLen))...)
			data = append(data, repLevelsBuf...)
		}

		if defLevelsLen > 0 {
			data = append(data, uint32ToBytes(uint32(defLevelsLen))...)
			data = append(data, defLevelsBuf...)
		}

		data = append(data, dataBuf...)

		return data, nil
	}

	// metadata is dereferenced inside read, so validate it before calling.
	if metadata == nil {
		return nil, 0, 0, errors.New("parquet: metadata not set")
	}

	buf, err := read()
	if err != nil {
		return nil, 0, 0, err
	}

	path := append([]string{}, metadata.GetPathInSchema()...)
	bytesReader := bytes.NewReader(buf)
	pageType := pageHeader.GetType()
	switch pageType {
	case parquet.PageType_INDEX_PAGE:
		return nil, 0, 0, fmt.Errorf("page type %v is not supported", parquet.PageType_INDEX_PAGE)

	case parquet.PageType_DICTIONARY_PAGE:
		page = newDictPage()
		page.Header = pageHeader
		table := new(table)
		table.Path = path
		if pageHeader.DictionaryPageHeader == nil {
			return nil, 0, 0, errors.New("parquet: dictionary not set")
		}
		values, err := readValues(bytesReader, metadata.GetType(),
			uint64(pageHeader.DictionaryPageHeader.GetNumValues()), 0)
		if err != nil {
			return nil, 0, 0, err
		}
		table.Values = getTableValues(values, metadata.GetType())
		page.DataTable = table

		return page, 0, 0, nil

	case parquet.PageType_DATA_PAGE, parquet.PageType_DATA_PAGE_V2:
		name := strings.Join(path, ".")

		page = newDataPage()
		page.Header = pageHeader

		maxDefinitionLevel := getMaxDefLevel(columnNameIndexMap, schemaElements, path)
		maxRepetitionLevel := getMaxRepLevel(columnNameIndexMap, schemaElements, path)

		var numValues uint64
		var encodingType parquet.Encoding

		if pageHeader.GetType() == parquet.PageType_DATA_PAGE {
			if pageHeader.DataPageHeader == nil {
				return nil, 0, 0, errors.New("parquet: Header not set")
			}
			numValues = uint64(pageHeader.DataPageHeader.GetNumValues())
			encodingType = pageHeader.DataPageHeader.GetEncoding()
		} else {
			if pageHeader.DataPageHeaderV2 == nil {
				return nil, 0, 0, errors.New("parquet: Header not set")
			}
			numValues = uint64(pageHeader.DataPageHeaderV2.GetNumValues())
			encodingType = pageHeader.DataPageHeaderV2.GetEncoding()
		}

		var repetitionLevels []int64
		if maxRepetitionLevel > 0 {
			values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
				-1, numValues, getBitWidth(uint64(maxRepetitionLevel)))
			if err != nil {
				return nil, 0, 0, err
			}

			if repetitionLevels = values.([]int64); len(repetitionLevels) > int(numValues) && int(numValues) >= 0 {
				repetitionLevels = repetitionLevels[:numValues]
			}
		} else {
			if numValues > math.MaxInt64/8 {
				return nil, 0, 0, errors.New("parquet: numvalues too large")
			}
			repetitionLevels = make([]int64, numValues)
		}

		var definitionLevels []int64
		if maxDefinitionLevel > 0 {
			values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
				-1, numValues, getBitWidth(uint64(maxDefinitionLevel)))
			if err != nil {
				return nil, 0, 0, err
			}
			if numValues > math.MaxInt64/8 {
				return nil, 0, 0, errors.New("parquet: numvalues too large")
			}
			if definitionLevels = values.([]int64); len(definitionLevels) > int(numValues) {
				definitionLevels = definitionLevels[:numValues]
			}
		} else {
			if numValues > math.MaxInt64/8 {
				return nil, 0, 0, errors.New("parquet: numvalues too large")
			}
			definitionLevels = make([]int64, numValues)
		}

		var numNulls uint64
		for i := 0; i < len(definitionLevels); i++ {
			if definitionLevels[i] != int64(maxDefinitionLevel) {
				numNulls++
			}
		}

		var convertedType parquet.ConvertedType = -1
		if schemaElements[columnNameIndexMap[name]].IsSetConvertedType() {
			convertedType = schemaElements[columnNameIndexMap[name]].GetConvertedType()
		}
		values, valueType, err := readDataPageValues(bytesReader, encodingType, metadata.GetType(),
			convertedType, uint64(len(definitionLevels))-numNulls,
			uint64(schemaElements[columnNameIndexMap[name]].GetTypeLength()))
		if err != nil {
			return nil, 0, 0, err
		}
		tableValues := getTableValues(values, valueType)

		table := new(table)
		table.Path = path
		table.RepetitionType = schemaElements[columnNameIndexMap[name]].GetRepetitionType()
		table.MaxRepetitionLevel = int32(maxRepetitionLevel)
		table.MaxDefinitionLevel = int32(maxDefinitionLevel)
		table.Values = make([]interface{}, len(definitionLevels))
		table.RepetitionLevels = make([]int32, len(definitionLevels))
		table.DefinitionLevels = make([]int32, len(definitionLevels))

		j := 0
		numRows := int64(0)
		for i := 0; i < len(definitionLevels); i++ {
			table.RepetitionLevels[i] = int32(repetitionLevels[i])
			table.DefinitionLevels[i] = int32(definitionLevels[i])
			if int(table.DefinitionLevels[i]) == maxDefinitionLevel {
				table.Values[i] = tableValues[j]
				j++
			}
			if table.RepetitionLevels[i] == 0 {
				numRows++
			}
		}
		page.DataTable = table

		return page, int64(len(definitionLevels)), numRows, nil
	}

	return nil, 0, 0, fmt.Errorf("unknown page type %v", pageType)
}
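// Illustrative only: a caller typically reads pages in a loop, keeping the
// most recent dictionary page around so that dictionary-encoded data pages
// can be resolved with decode. The surrounding reader state and counters
// below are hypothetical, not part of this file:
//
//	var dictPage *page
//	for valuesRemaining > 0 {
//		p, numValues, _, err := readPage(thriftReader, metadata, nameIndexMap, schemaElements)
//		if err != nil {
//			return err
//		}
//		if p.Header.GetType() == parquet.PageType_DICTIONARY_PAGE {
//			dictPage = p
//			continue
//		}
//		p.decode(dictPage) // replaces dictionary indices with dictionary values
//		valuesRemaining -= numValues
//	}
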
// page - represents a parquet page of a column chunk, either a dictionary
// page or a data page.
type page struct {
	Header       *parquet.PageHeader      // Header of a page
	DataTable    *table                   // Table to store values
	RawData      []byte                   // Compressed data of the page, as written in the parquet file
	CompressType parquet.CompressionCodec // Compress type: gzip/snappy/none
	DataType     parquet.Type             // Parquet type of the values in the page
	Path         []string                 // Path in schema (includes the root)
	MaxVal       interface{}              // Maximum of the values
	MinVal       interface{}              // Minimum of the values
	PageSize     int32
}

// newPage - creates an empty page with the default page size.
func newPage() *page {
	return &page{
		Header:   parquet.NewPageHeader(),
		PageSize: defaultPageSize,
	}
}

// newDictPage - creates a page pre-initialized with a dictionary page header.
func newDictPage() *page {
	page := newPage()
	page.Header.DictionaryPageHeader = parquet.NewDictionaryPageHeader()
	return page
}

// newDataPage - creates a page pre-initialized with a (v1) data page header.
func newDataPage() *page {
	page := newPage()
	page.Header.DataPageHeader = parquet.NewDataPageHeader()
	return page
}

// decode - replaces dictionary indices in the data page with the
// corresponding values from dictPage. Pages that are not dictionary-encoded
// are left untouched.
func (page *page) decode(dictPage *page) {
	if dictPage == nil || page == nil || page.Header.DataPageHeader == nil ||
		(page.Header.DataPageHeader.Encoding != parquet.Encoding_RLE_DICTIONARY &&
			page.Header.DataPageHeader.Encoding != parquet.Encoding_PLAIN_DICTIONARY) {
		return
	}

	for i := 0; i < len(page.DataTable.Values); i++ {
		if page.DataTable.Values[i] != nil {
			index, ok := page.DataTable.Values[i].(int64)
			if !ok || int(index) >= len(dictPage.DataTable.Values) {
				return
			}
			page.DataTable.Values[i] = dictPage.DataTable.Values[index]
		}
	}
}

// getRLDLFromRawData - decodes repetition and definition levels from RawData
// into DataTable; the remaining (still encoded) value bytes are kept in
// RawData. It returns the number of values and rows in the page.
func (page *page) getRLDLFromRawData(columnNameIndexMap map[string]int, schemaElements []*parquet.SchemaElement) (numValues int64, numRows int64, err error) {
	bytesReader := bytes.NewReader(page.RawData)

	pageType := page.Header.GetType()

	var buf []byte
	if pageType == parquet.PageType_DATA_PAGE_V2 {
		var repLevelsLen, defLevelsLen int32
		var repLevelsBuf, defLevelsBuf []byte
		if page.Header.DataPageHeaderV2 == nil {
			return 0, 0, errors.New("parquet: Header not set")
		}
		repLevelsLen = page.Header.DataPageHeaderV2.GetRepetitionLevelsByteLength()
		repLevelsBuf = make([]byte, repLevelsLen)
		if _, err = bytesReader.Read(repLevelsBuf); err != nil {
			return 0, 0, err
		}

		defLevelsLen = page.Header.DataPageHeaderV2.GetDefinitionLevelsByteLength()
		defLevelsBuf = make([]byte, defLevelsLen)
		if _, err = bytesReader.Read(defLevelsBuf); err != nil {
			return 0, 0, err
		}

		dataBuf := make([]byte, len(page.RawData)-int(repLevelsLen)-int(defLevelsLen))
		if _, err = bytesReader.Read(dataBuf); err != nil {
			return 0, 0, err
		}

		if repLevelsLen == 0 && defLevelsLen == 0 {
			buf = dataBuf
		} else {
			if repLevelsLen > 0 {
				buf = append(buf, uint32ToBytes(uint32(repLevelsLen))...)
				buf = append(buf, repLevelsBuf...)
			}

			if defLevelsLen > 0 {
				buf = append(buf, uint32ToBytes(uint32(defLevelsLen))...)
				buf = append(buf, defLevelsBuf...)
			}

			buf = append(buf, dataBuf...)
		}
	} else {
		if buf, err = compressionCodec(page.CompressType).uncompress(page.RawData); err != nil {
			return 0, 0, err
		}
	}

	bytesReader = bytes.NewReader(buf)

	switch pageType {
	case parquet.PageType_DICTIONARY_PAGE:
		table := new(table)
		table.Path = page.Path
		page.DataTable = table
		return 0, 0, nil

	case parquet.PageType_DATA_PAGE, parquet.PageType_DATA_PAGE_V2:
		var numValues uint64
		if pageType == parquet.PageType_DATA_PAGE {
			if page.Header.DataPageHeader == nil {
				return 0, 0, errors.New("parquet: Header not set")
			}
			numValues = uint64(page.Header.DataPageHeader.GetNumValues())
		} else {
			if page.Header.DataPageHeaderV2 == nil {
				return 0, 0, errors.New("parquet: Header not set")
			}
			numValues = uint64(page.Header.DataPageHeaderV2.GetNumValues())
		}

		maxDefinitionLevel := getMaxDefLevel(columnNameIndexMap, schemaElements, page.Path)
		maxRepetitionLevel := getMaxRepLevel(columnNameIndexMap, schemaElements, page.Path)

		var repetitionLevels []int64
		if maxRepetitionLevel > 0 {
			values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
				-1, numValues, getBitWidth(uint64(maxRepetitionLevel)))
			if err != nil {
				return 0, 0, err
			}

			if repetitionLevels = values.([]int64); uint64(len(repetitionLevels)) > numValues {
				repetitionLevels = repetitionLevels[:numValues]
			}
		} else {
			repetitionLevels = make([]int64, numValues)
		}

		var definitionLevels []int64
		if maxDefinitionLevel > 0 {
			values, _, err := readDataPageValues(bytesReader, parquet.Encoding_RLE, parquet.Type_INT64,
				-1, numValues, getBitWidth(uint64(maxDefinitionLevel)))
			if err != nil {
				return 0, 0, err
			}
			if definitionLevels = values.([]int64); uint64(len(definitionLevels)) > numValues {
				definitionLevels = definitionLevels[:numValues]
			}
		} else {
			definitionLevels = make([]int64, numValues)
		}

		table := new(table)
		table.Path = page.Path
		name := strings.Join(page.Path, ".")
		table.RepetitionType = schemaElements[columnNameIndexMap[name]].GetRepetitionType()
		table.MaxRepetitionLevel = int32(maxRepetitionLevel)
		table.MaxDefinitionLevel = int32(maxDefinitionLevel)
		table.Values = make([]interface{}, len(definitionLevels))
		table.RepetitionLevels = make([]int32, len(definitionLevels))
		table.DefinitionLevels = make([]int32, len(definitionLevels))

		numRows := int64(0)
		for i := 0; i < len(definitionLevels); i++ {
			table.RepetitionLevels[i] = int32(repetitionLevels[i])
			table.DefinitionLevels[i] = int32(definitionLevels[i])
			if table.RepetitionLevels[i] == 0 {
				numRows++
			}
		}
		page.DataTable = table
		page.RawData = buf[len(buf)-bytesReader.Len():]

		return int64(numValues), numRows, nil
	}

	return 0, 0, fmt.Errorf("unsupported page type %v", pageType)
}

// getValueFromRawData - decodes the values remaining in RawData (as left by
// getRLDLFromRawData) into DataTable.Values and clears RawData.
func (page *page) getValueFromRawData(columnNameIndexMap map[string]int, schemaElements []*parquet.SchemaElement) (err error) {
	pageType := page.Header.GetType()
	switch pageType {
	case parquet.PageType_DICTIONARY_PAGE:
		bytesReader := bytes.NewReader(page.RawData)
		var values interface{}
		if page.Header.DictionaryPageHeader == nil {
			return errors.New("parquet: dictionary not set")
		}
		values, err = readValues(bytesReader, page.DataType,
			uint64(page.Header.DictionaryPageHeader.GetNumValues()), 0)
		if err != nil {
			return err
		}

		page.DataTable.Values = getTableValues(values, page.DataType)
		return nil

	case parquet.PageType_DATA_PAGE_V2:
		if page.RawData, err = compressionCodec(page.CompressType).uncompress(page.RawData); err != nil {
			return err
		}
		fallthrough
	case parquet.PageType_DATA_PAGE:
		encodingType := page.Header.DataPageHeader.GetEncoding()
		bytesReader := bytes.NewReader(page.RawData)

		var numNulls uint64
		for i := 0; i < len(page.DataTable.DefinitionLevels); i++ {
			if page.DataTable.DefinitionLevels[i] != page.DataTable.MaxDefinitionLevel {
				numNulls++
			}
		}

		name := strings.Join(page.DataTable.Path, ".")
		var convertedType parquet.ConvertedType = -1

		if schemaElements[columnNameIndexMap[name]].IsSetConvertedType() {
			convertedType = schemaElements[columnNameIndexMap[name]].GetConvertedType()
		}

		values, _, err := readDataPageValues(bytesReader, encodingType, page.DataType,
			convertedType, uint64(len(page.DataTable.DefinitionLevels))-numNulls,
			uint64(schemaElements[columnNameIndexMap[name]].GetTypeLength()))
		if err != nil {
			return err
		}

		tableValues := getTableValues(values, page.DataType)

		j := 0
		for i := 0; i < len(page.DataTable.DefinitionLevels); i++ {
			if page.DataTable.DefinitionLevels[i] == page.DataTable.MaxDefinitionLevel {
				page.DataTable.Values[i] = tableValues[j]
				j++
			}
		}

		page.RawData = []byte{}
		return nil
	}

	return fmt.Errorf("unsupported page type %v", pageType)
}
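// Illustrative only: the to*Page methods below implement the write path. A
// writer typically fills a page's DataTable, sets MinVal/MaxVal, and then
// serializes the page, roughly as sketched here (the field values are made
// up for the example):
//
//	p := newDataPage()
//	p.DataType = parquet.Type_INT64
//	p.DataTable = &table{ /* values, levels, encoding, ... */ }
//	raw := p.toDataPage(parquet.CompressionCodec_SNAPPY)
//	// raw now holds the thrift page header followed by the compressed body,
//	// ready to be appended to the column chunk.
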
// toDataPage - serializes the page as a v1 data page: a thrift page header
// followed by the compressed repetition levels, definition levels and values.
// The result is stored in RawData and returned.
func (page *page) toDataPage(compressType parquet.CompressionCodec) []byte {
	values := []interface{}{}
	for i := range page.DataTable.DefinitionLevels {
		if page.DataTable.DefinitionLevels[i] == page.DataTable.MaxDefinitionLevel {
			values = append(values, page.DataTable.Values[i])
		}
	}
	valuesBytes := encodeValues(interfacesToValues(values, page.DataTable.Type), page.DataType, page.DataTable.Encoding, page.DataTable.BitWidth)

	var defLevelBytes []byte
	if page.DataTable.MaxDefinitionLevel > 0 {
		defLevels := make([]int64, len(page.DataTable.DefinitionLevels))
		for i := range page.DataTable.DefinitionLevels {
			defLevels[i] = int64(page.DataTable.DefinitionLevels[i])
		}
		defLevelBytes = valuesToRLEBitPackedHybridBytes(
			defLevels,
			int32(getBitWidth(uint64(page.DataTable.MaxDefinitionLevel))),
			parquet.Type_INT64,
		)
	}

	var repLevelBytes []byte
	if page.DataTable.MaxRepetitionLevel > 0 {
		repLevels := make([]int64, len(page.DataTable.DefinitionLevels))
		for i := range page.DataTable.DefinitionLevels {
			repLevels[i] = int64(page.DataTable.RepetitionLevels[i])
		}
		repLevelBytes = valuesToRLEBitPackedHybridBytes(
			repLevels,
			int32(getBitWidth(uint64(page.DataTable.MaxRepetitionLevel))),
			parquet.Type_INT64,
		)
	}

	data := repLevelBytes
	data = append(data, defLevelBytes...)
	data = append(data, valuesBytes...)

	compressedData, err := compressionCodec(compressType).compress(data)
	if err != nil {
		panic(err)
	}

	page.Header = parquet.NewPageHeader()
	page.Header.Type = parquet.PageType_DATA_PAGE
	page.Header.CompressedPageSize = int32(len(compressedData))
	page.Header.UncompressedPageSize = int32(len(data))
	page.Header.DataPageHeader = parquet.NewDataPageHeader()
	page.Header.DataPageHeader.NumValues = int32(len(page.DataTable.DefinitionLevels))
	page.Header.DataPageHeader.DefinitionLevelEncoding = parquet.Encoding_RLE
	page.Header.DataPageHeader.RepetitionLevelEncoding = parquet.Encoding_RLE
	page.Header.DataPageHeader.Encoding = page.DataTable.Encoding
	page.Header.DataPageHeader.Statistics = parquet.NewStatistics()
	if page.MaxVal != nil {
		tmpBuf := valueToBytes(page.MaxVal, page.DataType)
		if page.DataType == parquet.Type_BYTE_ARRAY {
			switch page.DataTable.ConvertedType {
			case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
				tmpBuf = tmpBuf[4:]
			}
		}
		page.Header.DataPageHeader.Statistics.Max = tmpBuf
	}
	if page.MinVal != nil {
		tmpBuf := valueToBytes(page.MinVal, page.DataType)
		if page.DataType == parquet.Type_BYTE_ARRAY {
			switch page.DataTable.ConvertedType {
			case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
				tmpBuf = tmpBuf[4:]
			}
		}
		page.Header.DataPageHeader.Statistics.Min = tmpBuf
	}

	ts := thrift.NewTSerializer()
	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
	pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
	if err != nil {
		panic(err)
	}

	page.RawData = append(pageHeaderBytes, compressedData...)
	return page.RawData
}
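// Unlike toDataPage above, the v2 layout keeps repetition and definition
// levels outside the compressed region: only the encoded values are passed to
// the compressor, and the levels are appended to RawData uncompressed, with
// their byte lengths recorded in the DataPageHeaderV2.
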
// toDataPageV2 - serializes the page as a v2 data page: a thrift page header,
// uncompressed repetition and definition levels, then the compressed values.
// The result is stored in RawData and returned.
func (page *page) toDataPageV2(compressType parquet.CompressionCodec) []byte {
	values := []interface{}{}
	for i := range page.DataTable.DefinitionLevels {
		if page.DataTable.DefinitionLevels[i] == page.DataTable.MaxDefinitionLevel {
			values = append(values, page.DataTable.Values[i])
		}
	}
	valuesBytes := encodeValues(values, page.DataType, page.DataTable.Encoding, page.DataTable.BitWidth)

	var defLevelBytes []byte
	if page.DataTable.MaxDefinitionLevel > 0 {
		defLevels := make([]int64, len(page.DataTable.DefinitionLevels))
		for i := range page.DataTable.DefinitionLevels {
			defLevels[i] = int64(page.DataTable.DefinitionLevels[i])
		}
		defLevelBytes = valuesToRLEBytes(
			defLevels,
			int32(getBitWidth(uint64(page.DataTable.MaxDefinitionLevel))),
			parquet.Type_INT64,
		)
	}

	var repLevelBytes []byte
	numRows := int32(0)
	if page.DataTable.MaxRepetitionLevel > 0 {
		repLevels := make([]int64, len(page.DataTable.DefinitionLevels))
		for i := range page.DataTable.DefinitionLevels {
			repLevels[i] = int64(page.DataTable.RepetitionLevels[i])
			if page.DataTable.RepetitionLevels[i] == 0 {
				numRows++
			}
		}
		repLevelBytes = valuesToRLEBytes(
			repLevels,
			int32(getBitWidth(uint64(page.DataTable.MaxRepetitionLevel))),
			parquet.Type_INT64,
		)
	}

	compressedData, err := compressionCodec(compressType).compress(valuesBytes)
	if err != nil {
		panic(err)
	}

	page.Header = parquet.NewPageHeader()
	page.Header.Type = parquet.PageType_DATA_PAGE_V2
	page.Header.CompressedPageSize = int32(len(compressedData) + len(defLevelBytes) + len(repLevelBytes))
	page.Header.UncompressedPageSize = int32(len(valuesBytes) + len(defLevelBytes) + len(repLevelBytes))
	page.Header.DataPageHeaderV2 = parquet.NewDataPageHeaderV2()
	page.Header.DataPageHeaderV2.NumValues = int32(len(page.DataTable.Values))
	page.Header.DataPageHeaderV2.NumNulls = page.Header.DataPageHeaderV2.NumValues - int32(len(values))
	page.Header.DataPageHeaderV2.NumRows = numRows
	page.Header.DataPageHeaderV2.Encoding = page.DataTable.Encoding
	page.Header.DataPageHeaderV2.DefinitionLevelsByteLength = int32(len(defLevelBytes))
	page.Header.DataPageHeaderV2.RepetitionLevelsByteLength = int32(len(repLevelBytes))
	page.Header.DataPageHeaderV2.IsCompressed = true

	page.Header.DataPageHeaderV2.Statistics = parquet.NewStatistics()
	if page.MaxVal != nil {
		tmpBuf := valueToBytes(page.MaxVal, page.DataType)
		if page.DataType == parquet.Type_BYTE_ARRAY {
			switch page.DataTable.ConvertedType {
			case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
				tmpBuf = tmpBuf[4:]
			}
		}
		page.Header.DataPageHeaderV2.Statistics.Max = tmpBuf
	}
	if page.MinVal != nil {
		tmpBuf := valueToBytes(page.MinVal, page.DataType)
		if page.DataType == parquet.Type_BYTE_ARRAY {
			switch page.DataTable.ConvertedType {
			case parquet.ConvertedType_UTF8, parquet.ConvertedType_DECIMAL:
				tmpBuf = tmpBuf[4:]
			}
		}
		page.Header.DataPageHeaderV2.Statistics.Min = tmpBuf
	}

	ts := thrift.NewTSerializer()
	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
	pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
	if err != nil {
		panic(err)
	}

	page.RawData = append(pageHeaderBytes, repLevelBytes...)
	page.RawData = append(page.RawData, defLevelBytes...)
	page.RawData = append(page.RawData, compressedData...)

	return page.RawData
}

// toDictPage - serializes the page as a dictionary page holding plain-encoded
// values. The result is stored in RawData and returned.
func (page *page) toDictPage(compressType parquet.CompressionCodec, dataType parquet.Type) []byte {
	valuesBytes := valuesToBytes(page.DataTable.Values, dataType)
	compressedData, err := compressionCodec(compressType).compress(valuesBytes)
	if err != nil {
		panic(err)
	}

	page.Header = parquet.NewPageHeader()
	page.Header.Type = parquet.PageType_DICTIONARY_PAGE
	page.Header.CompressedPageSize = int32(len(compressedData))
	page.Header.UncompressedPageSize = int32(len(valuesBytes))
	page.Header.DictionaryPageHeader = parquet.NewDictionaryPageHeader()
	page.Header.DictionaryPageHeader.NumValues = int32(len(page.DataTable.Values))
	page.Header.DictionaryPageHeader.Encoding = parquet.Encoding_PLAIN

	ts := thrift.NewTSerializer()
	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
	pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
	if err != nil {
		panic(err)
	}

	page.RawData = append(pageHeaderBytes, compressedData...)
	return page.RawData
}

// toDictDataPage - serializes the page as a dictionary-encoded
// (PLAIN_DICTIONARY) v1 data page whose values are RLE-encoded dictionary
// indices prefixed with the bit width. The result is stored in RawData and
// returned.
func (page *page) toDictDataPage(compressType parquet.CompressionCodec, bitWidth int32) []byte {
	valuesBytes := append([]byte{byte(bitWidth)}, valuesToRLEBytes(page.DataTable.Values, bitWidth, parquet.Type_INT32)...)

	var defLevelBytes []byte
	if page.DataTable.MaxDefinitionLevel > 0 {
		defLevels := make([]int64, len(page.DataTable.DefinitionLevels))
		for i := range page.DataTable.DefinitionLevels {
			defLevels[i] = int64(page.DataTable.DefinitionLevels[i])
		}
		defLevelBytes = valuesToRLEBitPackedHybridBytes(
			defLevels,
			int32(getBitWidth(uint64(page.DataTable.MaxDefinitionLevel))),
			parquet.Type_INT64,
		)
	}

	var repLevelBytes []byte
	if page.DataTable.MaxRepetitionLevel > 0 {
		repLevels := make([]int64, len(page.DataTable.DefinitionLevels))
		for i := range page.DataTable.DefinitionLevels {
			repLevels[i] = int64(page.DataTable.RepetitionLevels[i])
		}
		repLevelBytes = valuesToRLEBitPackedHybridBytes(
			repLevels,
			int32(getBitWidth(uint64(page.DataTable.MaxRepetitionLevel))),
			parquet.Type_INT64,
		)
	}

	data := append(repLevelBytes, defLevelBytes...)
	data = append(data, valuesBytes...)

	compressedData, err := compressionCodec(compressType).compress(data)
	if err != nil {
		panic(err)
	}

	page.Header = parquet.NewPageHeader()
	page.Header.Type = parquet.PageType_DATA_PAGE
	page.Header.CompressedPageSize = int32(len(compressedData))
	page.Header.UncompressedPageSize = int32(len(data))
	page.Header.DataPageHeader = parquet.NewDataPageHeader()
	page.Header.DataPageHeader.NumValues = int32(len(page.DataTable.DefinitionLevels))
	page.Header.DataPageHeader.DefinitionLevelEncoding = parquet.Encoding_RLE
	page.Header.DataPageHeader.RepetitionLevelEncoding = parquet.Encoding_RLE
	page.Header.DataPageHeader.Encoding = parquet.Encoding_PLAIN_DICTIONARY

	ts := thrift.NewTSerializer()
	ts.Protocol = thrift.NewTCompactProtocolFactory().GetProtocol(ts.Transport)
	pageHeaderBytes, err := ts.Write(context.TODO(), page.Header)
	if err != nil {
		panic(err)
	}

	page.RawData = append(pageHeaderBytes, compressedData...)
	return page.RawData
}