storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/encoding/delta-encode.go (about)

     1  /*
     2   * Minio Cloud Storage, (C) 2019 Minio, Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package encoding
    18  
    19  import (
    20  	"fmt"
    21  
    22  	"storj.io/minio/pkg/s3select/internal/parquet-go/common"
    23  	"storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
    24  )
    25  
    26  const (
    27  	blockSize      = 128
    28  	miniBlockSize  = 32
    29  	miniBlockCount = blockSize / miniBlockSize
    30  )
    31  
    32  var deltaEncodeHeaderBytes []byte
    33  
    34  func init() {
    35  	deltaEncodeHeaderBytes = varIntEncode(blockSize)
    36  	deltaEncodeHeaderBytes = append(deltaEncodeHeaderBytes, varIntEncode(miniBlockCount)...)
    37  }
    38  
    39  // Supported Types: BOOLEAN, INT32, INT64
    40  func bitPackedEncode(values interface{}, bitWidth uint64, withHeader bool, parquetType parquet.Type) []byte {
    41  	var i64s []int64
    42  	switch parquetType {
    43  	case parquet.Type_BOOLEAN:
    44  		bs, ok := values.([]bool)
    45  		if !ok {
    46  			panic(fmt.Errorf("expected slice of bool"))
    47  		}
    48  
    49  		i64s = make([]int64, len(bs))
    50  		for i := range bs {
    51  			if bs[i] {
    52  				i64s[i] = 1
    53  			}
    54  		}
    55  	case parquet.Type_INT32:
    56  		i32s, ok := values.([]int32)
    57  		if !ok {
    58  			panic(fmt.Errorf("expected slice of int32"))
    59  		}
    60  
    61  		for i := range i32s {
    62  			i64s[i] = int64(i32s[i])
    63  		}
    64  	case parquet.Type_INT64:
    65  		var ok bool
    66  		i64s, ok = values.([]int64)
    67  		if !ok {
    68  			panic(fmt.Errorf("expected slice of int64"))
    69  		}
    70  	default:
    71  		panic(fmt.Errorf("%v parquet type unsupported", parquetType))
    72  	}
    73  
    74  	if len(i64s) == 0 {
    75  		return nil
    76  	}
    77  
    78  	var valueByte byte
    79  	bitsSet := uint64(0)
    80  	bitsNeeded := uint64(8)
    81  	bitsToSet := bitWidth
    82  	value := i64s[0]
    83  
    84  	valueBytes := []byte{}
    85  	for i := 0; i < len(i64s); {
    86  		if bitsToSet >= bitsNeeded {
    87  			valueByte |= byte(((value >> bitsSet) & ((1 << bitsNeeded) - 1)) << (8 - bitsNeeded))
    88  			valueBytes = append(valueBytes, valueByte)
    89  			bitsToSet -= bitsNeeded
    90  			bitsSet += bitsNeeded
    91  
    92  			bitsNeeded = 8
    93  			valueByte = 0
    94  
    95  			if bitsToSet <= 0 && (i+1) < len(i64s) {
    96  				i++
    97  				value = i64s[i]
    98  				bitsToSet = bitWidth
    99  				bitsSet = 0
   100  			}
   101  		} else {
   102  			valueByte |= byte((value >> bitsSet) << (8 - bitsNeeded))
   103  			i++
   104  
   105  			if i < len(i64s) {
   106  				value = i64s[i]
   107  			}
   108  
   109  			bitsNeeded -= bitsToSet
   110  			bitsToSet = bitWidth
   111  			bitsSet = 0
   112  		}
   113  	}
   114  
   115  	if withHeader {
   116  		header := uint64(((len(i64s) / 8) << 1) | 1)
   117  		headerBytes := varIntEncode(header)
   118  		return append(headerBytes, valueBytes...)
   119  	}
   120  
   121  	return valueBytes
   122  }
   123  
   124  func deltaEncodeInt32s(i32s []int32) (data []byte) {
   125  	getValue := func(i32 int32) uint64 {
   126  		return uint64((i32 >> 31) ^ (i32 << 1))
   127  	}
   128  
   129  	data = append(data, deltaEncodeHeaderBytes...)
   130  	data = append(data, varIntEncode(uint64(len(i32s)))...)
   131  	data = append(data, varIntEncode(getValue(i32s[0]))...)
   132  
   133  	for i := 1; i < len(i32s); {
   134  		block := []int32{}
   135  		minDelta := int32(0x7FFFFFFF)
   136  
   137  		for ; i < len(i32s) && len(block) < blockSize; i++ {
   138  			delta := i32s[i] - i32s[i-1]
   139  			block = append(block, delta)
   140  			if delta < minDelta {
   141  				minDelta = delta
   142  			}
   143  		}
   144  
   145  		for len(block) < blockSize {
   146  			block = append(block, minDelta)
   147  		}
   148  
   149  		bitWidths := make([]byte, miniBlockCount)
   150  		for j := 0; j < miniBlockCount; j++ {
   151  			maxValue := int32(0)
   152  			for k := j * miniBlockSize; k < (j+1)*miniBlockSize; k++ {
   153  				block[k] -= minDelta
   154  				if block[k] > maxValue {
   155  					maxValue = block[k]
   156  				}
   157  			}
   158  
   159  			bitWidths[j] = byte(common.BitWidth(uint64(maxValue)))
   160  		}
   161  
   162  		minDeltaZigZag := getValue(minDelta)
   163  		data = append(data, varIntEncode(minDeltaZigZag)...)
   164  		data = append(data, bitWidths...)
   165  
   166  		for j := 0; j < miniBlockCount; j++ {
   167  			bitPacked := bitPackedEncode(
   168  				block[j*miniBlockSize:(j+1)*miniBlockSize],
   169  				uint64(bitWidths[j]),
   170  				false,
   171  				parquet.Type_INT32,
   172  			)
   173  			data = append(data, bitPacked...)
   174  		}
   175  	}
   176  
   177  	return data
   178  }
   179  
   180  func deltaEncodeInt64s(i64s []int64) (data []byte) {
   181  	getValue := func(i64 int64) uint64 {
   182  		return uint64((i64 >> 63) ^ (i64 << 1))
   183  	}
   184  
   185  	data = append(data, deltaEncodeHeaderBytes...)
   186  	data = append(data, varIntEncode(uint64(len(i64s)))...)
   187  	data = append(data, varIntEncode(getValue(i64s[0]))...)
   188  
   189  	for i := 1; i < len(i64s); {
   190  		block := []int64{}
   191  		minDelta := int64(0x7FFFFFFFFFFFFFFF)
   192  
   193  		for ; i < len(i64s) && len(block) < blockSize; i++ {
   194  			delta := i64s[i] - i64s[i-1]
   195  			block = append(block, delta)
   196  			if delta < minDelta {
   197  				minDelta = delta
   198  			}
   199  		}
   200  
   201  		for len(block) < blockSize {
   202  			block = append(block, minDelta)
   203  		}
   204  
   205  		bitWidths := make([]byte, miniBlockCount)
   206  		for j := 0; j < miniBlockCount; j++ {
   207  			maxValue := int64(0)
   208  			for k := j * miniBlockSize; k < (j+1)*miniBlockSize; k++ {
   209  				block[k] -= minDelta
   210  				if block[k] > maxValue {
   211  					maxValue = block[k]
   212  				}
   213  			}
   214  
   215  			bitWidths[j] = byte(common.BitWidth(uint64(maxValue)))
   216  		}
   217  
   218  		minDeltaZigZag := getValue(minDelta)
   219  		data = append(data, varIntEncode(minDeltaZigZag)...)
   220  		data = append(data, bitWidths...)
   221  
   222  		for j := 0; j < miniBlockCount; j++ {
   223  			bitPacked := bitPackedEncode(
   224  				block[j*miniBlockSize:(j+1)*miniBlockSize],
   225  				uint64(bitWidths[j]),
   226  				false,
   227  				parquet.Type_INT64,
   228  			)
   229  			data = append(data, bitPacked...)
   230  		}
   231  	}
   232  
   233  	return data
   234  }
   235  
   236  // DeltaEncode encodes values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-encoding-delta_binary_packed--5
   237  //
   238  // Supported Types: INT32, INT64.
   239  func DeltaEncode(values interface{}, parquetType parquet.Type) []byte {
   240  	switch parquetType {
   241  	case parquet.Type_INT32:
   242  		i32s, ok := values.([]int32)
   243  		if !ok {
   244  			panic(fmt.Errorf("expected slice of int32"))
   245  		}
   246  		return deltaEncodeInt32s(i32s)
   247  	case parquet.Type_INT64:
   248  		i64s, ok := values.([]int64)
   249  		if !ok {
   250  			panic(fmt.Errorf("expected slice of int64"))
   251  		}
   252  		return deltaEncodeInt64s(i64s)
   253  	}
   254  
   255  	panic(fmt.Errorf("%v parquet type unsupported", parquetType))
   256  }
   257  
   258  // DeltaLengthByteArrayEncode encodes bytes slices specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-length-byte-array-delta_length_byte_array--6
   259  //
   260  // Supported Types: BYTE_ARRAY
   261  func DeltaLengthByteArrayEncode(bytesSlices [][]byte) (data []byte) {
   262  	lengths := make([]int32, len(bytesSlices))
   263  	for i, bytes := range bytesSlices {
   264  		lengths[i] = int32(len(bytes))
   265  	}
   266  
   267  	data = deltaEncodeInt32s(lengths)
   268  	for _, bytes := range bytesSlices {
   269  		data = append(data, []byte(bytes)...)
   270  	}
   271  
   272  	return data
   273  }
   274  
   275  // DeltaByteArrayEncode encodes sequence of strings values specified in https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-strings-delta_byte_array--7
   276  //
   277  // Supported Types: BYTE_ARRAY
   278  func DeltaByteArrayEncode(bytesSlices [][]byte) (data []byte) {
   279  	prefixLengths := make([]int32, len(bytesSlices))
   280  	suffixes := make([][]byte, len(bytesSlices))
   281  
   282  	var i, j int
   283  	for i = 1; i < len(bytesSlices); i++ {
   284  		for j = 0; j < len(bytesSlices[i-1]) && j < len(bytesSlices[i]); j++ {
   285  			if bytesSlices[i-1][j] != bytesSlices[i][j] {
   286  				break
   287  			}
   288  		}
   289  
   290  		prefixLengths[i] = int32(j)
   291  		suffixes[i] = bytesSlices[i][j:]
   292  	}
   293  
   294  	data = deltaEncodeInt32s(prefixLengths)
   295  	return append(data, DeltaLengthByteArrayEncode(suffixes)...)
   296  }