storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/encoding/rledict-encode.go (about)

     1  /*
     2   * Minio Cloud Storage, (C) 2019 Minio, Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package encoding
    18  
    19  import (
    20  	"storj.io/minio/pkg/s3select/internal/parquet-go/common"
    21  	"storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
    22  )
    23  
    24  // RLEDictEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#dictionary-encoding-plain_dictionary--2-and-rle_dictionary--8 and returns dictionary page data and data page data.
    25  //
    26  // Dictionary page data contains PLAIN encodeed slice of uniquely fully defined non-nil values.
    27  // Data page data contains RLE/Bit-Packed Hybrid encoded indices of fully defined non-nil values.
    28  //
    29  // Supported Types: BOOLEAN, INT32, INT64, FLOAT, DOUBLE, BYTE_ARRAY
    30  func RLEDictEncode(values []interface{}, parquetType parquet.Type, bitWidth int32) (dictPageData, dataPageData []byte, dictValueCount int32, indexBitWidth uint8) {
    31  	var definedValues []interface{}
    32  	var indices []int32
    33  
    34  	valueIndexMap := make(map[interface{}]int32)
    35  	j := 0
    36  	for i := 0; i < len(values); i = j {
    37  		for j = i; j < len(values); j++ {
    38  			value := values[j]
    39  			if value == nil {
    40  				continue
    41  			}
    42  
    43  			index, found := valueIndexMap[value]
    44  			if !found {
    45  				index = int32(len(definedValues))
    46  				definedValues = append(definedValues, value)
    47  				valueIndexMap[value] = index
    48  			}
    49  
    50  			indices = append(indices, index)
    51  		}
    52  	}
    53  
    54  	indexBitWidth = uint8(common.BitWidth(uint64(indices[len(indices)-1])))
    55  
    56  	dictPageData = PlainEncode(common.ToSliceValue(definedValues, parquetType), parquetType)
    57  	dataPageData = RLEBitPackedHybridEncode(indices, int32(indexBitWidth), parquet.Type_INT32)
    58  
    59  	return dictPageData, dataPageData, int32(len(definedValues)), indexBitWidth
    60  }