storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/encode.go (about)

     1  /*
     2   * Minio Cloud Storage, (C) 2019 Minio, Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package parquet
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/binary"
    22  	"errors"
    23  	"fmt"
    24  	"math"
    25  
    26  	"storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
    27  )
    28  
    29  func boolsToBytes(bs []bool) []byte {
    30  	size := (len(bs) + 7) / 8
    31  	result := make([]byte, size)
    32  	for i := range bs {
    33  		if bs[i] {
    34  			result[i/8] |= 1 << uint32(i%8)
    35  		}
    36  	}
    37  
    38  	return result
    39  }
    40  
    41  func int32sToBytes(i32s []int32) []byte {
    42  	buf := make([]byte, 4*len(i32s))
    43  	for i, i32 := range i32s {
    44  		binary.LittleEndian.PutUint32(buf[i*4:], uint32(i32))
    45  	}
    46  	return buf
    47  }
    48  
    49  func int64sToBytes(i64s []int64) []byte {
    50  	buf := make([]byte, 8*len(i64s))
    51  	for i, i64 := range i64s {
    52  		binary.LittleEndian.PutUint64(buf[i*8:], uint64(i64))
    53  	}
    54  	return buf
    55  }
    56  
    57  func float32sToBytes(f32s []float32) []byte {
    58  	buf := make([]byte, 4*len(f32s))
    59  	for i, f32 := range f32s {
    60  		binary.LittleEndian.PutUint32(buf[i*4:], math.Float32bits(f32))
    61  	}
    62  	return buf
    63  }
    64  
    65  func float64sToBytes(f64s []float64) []byte {
    66  	buf := make([]byte, 8*len(f64s))
    67  	for i, f64 := range f64s {
    68  		binary.LittleEndian.PutUint64(buf[i*8:], math.Float64bits(f64))
    69  	}
    70  	return buf
    71  }
    72  
    73  func byteSlicesToBytes(byteSlices [][]byte) []byte {
    74  	buf := new(bytes.Buffer)
    75  	for _, s := range byteSlices {
    76  		if err := binary.Write(buf, binary.LittleEndian, uint32(len(s))); err != nil {
    77  			panic(err)
    78  		}
    79  
    80  		if _, err := buf.Write(s); err != nil {
    81  			panic(err)
    82  		}
    83  	}
    84  
    85  	return buf.Bytes()
    86  }
    87  
    88  func byteArraysToBytes(arrayList [][]byte) []byte {
    89  	buf := new(bytes.Buffer)
    90  	arrayLen := -1
    91  	for _, array := range arrayList {
    92  		if arrayLen != -1 && len(array) != arrayLen {
    93  			panic(errors.New("array list does not have same length"))
    94  		}
    95  
    96  		arrayLen = len(array)
    97  		if _, err := buf.Write(array); err != nil {
    98  			panic(err)
    99  		}
   100  	}
   101  
   102  	return buf.Bytes()
   103  }
   104  
   105  func int96sToBytes(i96s [][]byte) []byte {
   106  	return byteArraysToBytes(i96s)
   107  }
   108  
   109  func valuesToBytes(values interface{}, dataType parquet.Type) []byte {
   110  	switch dataType {
   111  	case parquet.Type_BOOLEAN:
   112  		return boolsToBytes(values.([]bool))
   113  	case parquet.Type_INT32:
   114  		return int32sToBytes(values.([]int32))
   115  	case parquet.Type_INT64:
   116  		return int64sToBytes(values.([]int64))
   117  	case parquet.Type_INT96:
   118  		return int96sToBytes(values.([][]byte))
   119  	case parquet.Type_FLOAT:
   120  		return float32sToBytes(values.([]float32))
   121  	case parquet.Type_DOUBLE:
   122  		return float64sToBytes(values.([]float64))
   123  	case parquet.Type_BYTE_ARRAY:
   124  		return byteSlicesToBytes(values.([][]byte))
   125  	case parquet.Type_FIXED_LEN_BYTE_ARRAY:
   126  		return byteArraysToBytes(values.([][]byte))
   127  	}
   128  
   129  	return []byte{}
   130  }
   131  
   132  func valueToBytes(value interface{}, dataType parquet.Type) []byte {
   133  	var values interface{}
   134  	switch dataType {
   135  	case parquet.Type_BOOLEAN:
   136  		values = []bool{value.(bool)}
   137  	case parquet.Type_INT32:
   138  		values = []int32{value.(int32)}
   139  	case parquet.Type_INT64:
   140  		values = []int64{value.(int64)}
   141  	case parquet.Type_INT96:
   142  		values = [][]byte{value.([]byte)}
   143  	case parquet.Type_FLOAT:
   144  		values = []float32{value.(float32)}
   145  	case parquet.Type_DOUBLE:
   146  		values = []float64{value.(float64)}
   147  	case parquet.Type_BYTE_ARRAY, parquet.Type_FIXED_LEN_BYTE_ARRAY:
   148  		values = [][]byte{value.([]byte)}
   149  	}
   150  
   151  	return valuesToBytes(values, dataType)
   152  }
   153  
   154  func unsignedVarIntToBytes(ui64 uint64) []byte {
   155  	size := (getBitWidth(ui64) + 6) / 7
   156  	if size == 0 {
   157  		return []byte{0}
   158  	}
   159  
   160  	buf := make([]byte, size)
   161  	for i := uint64(0); i < size; i++ {
   162  		buf[i] = byte(ui64&0x7F) | 0x80
   163  		ui64 >>= 7
   164  	}
   165  	buf[size-1] &= 0x7F
   166  
   167  	return buf
   168  }
   169  
   170  func valuesToRLEBytes(values interface{}, bitWidth int32, valueType parquet.Type) []byte {
   171  	vals := valuesToInterfaces(values, valueType)
   172  	result := []byte{}
   173  	j := 0
   174  	for i := 0; i < len(vals); i = j {
   175  		for j = i + 1; j < len(vals) && vals[i] == vals[j]; j++ {
   176  		}
   177  		headerBytes := unsignedVarIntToBytes(uint64((j - i) << 1))
   178  		result = append(result, headerBytes...)
   179  
   180  		valBytes := valueToBytes(vals[i], valueType)
   181  		byteCount := (bitWidth + 7) / 8
   182  		result = append(result, valBytes[:byteCount]...)
   183  	}
   184  
   185  	return result
   186  }
   187  
   188  func valuesToRLEBitPackedHybridBytes(values interface{}, bitWidth int32, dataType parquet.Type) []byte {
   189  	rleBytes := valuesToRLEBytes(values, bitWidth, dataType)
   190  	lenBytes := valueToBytes(int32(len(rleBytes)), parquet.Type_INT32)
   191  	return append(lenBytes, rleBytes...)
   192  }
   193  
// valuesToBitPackedBytes bit-packs values, emitting the low bitWidth bits of
// each value back to back, least-significant bit first, into consecutive
// bytes.
//
// values must be a []bool, []int32 or []int64 matching dataType (BOOLEAN,
// INT32 or INT64); any other dataType panics. Booleans are packed as 0/1.
// Empty input returns nil.
//
// When withHeader is true the packed bytes are preceded by a varint header
// equal to ((len(values)/8) << 1) | 1.
//
// NOTE(review): a byte is appended to the output only once all eight of its
// bits have been filled, so when len(values)*bitWidth is not a multiple of 8
// the trailing bits are not emitted, and the header count truncates
// len(values)/8. Callers presumably always pass a multiple of 8 values —
// confirm before using this with other lengths.
func valuesToBitPackedBytes(values interface{}, bitWidth int64, withHeader bool, dataType parquet.Type) []byte {
	// Normalize every supported input type to []int64.
	var i64s []int64
	switch dataType {
	case parquet.Type_BOOLEAN:
		bs := values.([]bool)
		i64s = make([]int64, len(bs))
		for i := range bs {
			if bs[i] {
				i64s[i] = 1
			}
		}
	case parquet.Type_INT32:
		i32s := values.([]int32)
		i64s = make([]int64, len(i32s))
		for i := range i32s {
			i64s[i] = int64(i32s[i])
		}
	case parquet.Type_INT64:
		i64s = values.([]int64)
	default:
		panic(fmt.Errorf("data type %v is not supported for bit packing", dataType))
	}

	if len(i64s) == 0 {
		return nil
	}

	// Packing state:
	//   valueByte  - output byte currently being assembled
	//   bitsSet    - bits of the current value already emitted
	//   bitsNeeded - free bits remaining in valueByte
	//   bitsToSet  - bits of the current value still to be emitted
	var valueByte byte
	bitsSet := uint64(0)
	bitsNeeded := uint64(8)
	bitsToSet := uint64(bitWidth)
	value := i64s[0]

	valueBytes := []byte{}
	for i := 0; i < len(i64s); {
		if bitsToSet >= bitsNeeded {
			// The current value has enough bits left to fill valueByte:
			// take exactly bitsNeeded of them, flush the byte and start a
			// fresh one.
			valueByte |= byte(((value >> bitsSet) & ((1 << bitsNeeded) - 1)) << (8 - bitsNeeded))
			valueBytes = append(valueBytes, valueByte)
			bitsToSet -= bitsNeeded
			bitsSet += bitsNeeded

			bitsNeeded = 8
			valueByte = 0

			// bitsToSet is unsigned, so "<= 0" means the value is fully
			// emitted. For the last value i is left alone here; the loop
			// then ends through the else branch on the next iteration.
			if bitsToSet <= 0 && (i+1) < len(i64s) {
				i++
				value = i64s[i]
				bitsToSet = uint64(bitWidth)
				bitsSet = 0
			}
		} else {
			// The rest of the current value fits in valueByte with room to
			// spare: place it (unmasked) and move on to the next value.
			valueByte |= byte((value >> bitsSet) << (8 - bitsNeeded))
			i++

			if i < len(i64s) {
				value = i64s[i]
			}

			bitsNeeded -= bitsToSet
			bitsToSet = uint64(bitWidth)
			bitsSet = 0
		}
	}

	if withHeader {
		// Number of 8-value groups shifted left by one, with the low bit set.
		header := uint64(((len(i64s) / 8) << 1) | 1)
		headerBytes := unsignedVarIntToBytes(header)
		return append(headerBytes, valueBytes...)
	}

	return valueBytes
}
   266  
// Layout parameters used by the delta encoders in this file.
const (
	// blockSize is the number of deltas written per block; short blocks are
	// padded up to this size.
	blockSize     = 128
	// subBlockSize is the number of deltas bit-packed together with one
	// shared bit width.
	subBlockSize  = 32
	// subBlockCount is the number of sub-blocks per block.
	subBlockCount = blockSize / subBlockSize
)

// Varint encodings of the layout parameters, computed once and copied into
// the header of every delta-encoded output.
var (
	blockSizeBytes     = unsignedVarIntToBytes(blockSize)
	subBlockCountBytes = unsignedVarIntToBytes(subBlockCount)
)
   277  
   278  func int32ToDeltaBytes(i32s []int32) []byte {
   279  	getValue := func(i32 int32) uint64 {
   280  		return uint64((i32 >> 31) ^ (i32 << 1))
   281  	}
   282  
   283  	result := append([]byte{}, blockSizeBytes...)
   284  	result = append(result, subBlockCountBytes...)
   285  	result = append(result, unsignedVarIntToBytes(uint64(len(i32s)))...)
   286  	result = append(result, unsignedVarIntToBytes(getValue(i32s[0]))...)
   287  
   288  	for i := 1; i < len(i32s); {
   289  		block := []int32{}
   290  		minDelta := int32(0x7FFFFFFF)
   291  
   292  		for ; i < len(i32s) && len(block) < blockSize; i++ {
   293  			delta := i32s[i] - i32s[i-1]
   294  			block = append(block, delta)
   295  			if delta < minDelta {
   296  				minDelta = delta
   297  			}
   298  		}
   299  
   300  		for len(block) < blockSize {
   301  			block = append(block, minDelta)
   302  		}
   303  
   304  		bitWidths := make([]byte, subBlockCount)
   305  		for j := 0; j < subBlockCount; j++ {
   306  			maxValue := int32(0)
   307  			for k := j * subBlockSize; k < (j+1)*subBlockSize; k++ {
   308  				block[k] -= minDelta
   309  				if block[k] > maxValue {
   310  					maxValue = block[k]
   311  				}
   312  			}
   313  
   314  			bitWidths[j] = byte(getBitWidth(uint64(maxValue)))
   315  		}
   316  
   317  		minDeltaZigZag := getValue(minDelta)
   318  		result = append(result, unsignedVarIntToBytes(minDeltaZigZag)...)
   319  		result = append(result, bitWidths...)
   320  
   321  		for j := 0; j < subBlockCount; j++ {
   322  			bitPacked := valuesToBitPackedBytes(
   323  				block[j*subBlockSize:(j+1)*subBlockSize],
   324  				int64(bitWidths[j]),
   325  				false,
   326  				parquet.Type_INT32,
   327  			)
   328  			result = append(result, bitPacked...)
   329  		}
   330  	}
   331  
   332  	return result
   333  }
   334  
   335  func int64ToDeltaBytes(i64s []int64) []byte {
   336  	getValue := func(i64 int64) uint64 {
   337  		return uint64((i64 >> 63) ^ (i64 << 1))
   338  	}
   339  
   340  	result := append([]byte{}, blockSizeBytes...)
   341  	result = append(result, subBlockCountBytes...)
   342  	result = append(result, unsignedVarIntToBytes(uint64(len(i64s)))...)
   343  	result = append(result, unsignedVarIntToBytes(getValue(i64s[0]))...)
   344  
   345  	for i := 1; i < len(i64s); {
   346  		block := []int64{}
   347  		minDelta := int64(0x7FFFFFFFFFFFFFFF)
   348  
   349  		for ; i < len(i64s) && len(block) < blockSize; i++ {
   350  			delta := i64s[i] - i64s[i-1]
   351  			block = append(block, delta)
   352  			if delta < minDelta {
   353  				minDelta = delta
   354  			}
   355  		}
   356  
   357  		for len(block) < blockSize {
   358  			block = append(block, minDelta)
   359  		}
   360  
   361  		bitWidths := make([]byte, subBlockCount)
   362  		for j := 0; j < subBlockCount; j++ {
   363  			maxValue := int64(0)
   364  			for k := j * subBlockSize; k < (j+1)*subBlockSize; k++ {
   365  				block[k] -= minDelta
   366  				if block[k] > maxValue {
   367  					maxValue = block[k]
   368  				}
   369  			}
   370  
   371  			bitWidths[j] = byte(getBitWidth(uint64(maxValue)))
   372  		}
   373  
   374  		minDeltaZigZag := getValue(minDelta)
   375  		result = append(result, unsignedVarIntToBytes(minDeltaZigZag)...)
   376  		result = append(result, bitWidths...)
   377  
   378  		for j := 0; j < subBlockCount; j++ {
   379  			bitPacked := valuesToBitPackedBytes(
   380  				block[j*subBlockSize:(j+1)*subBlockSize],
   381  				int64(bitWidths[j]),
   382  				false,
   383  				parquet.Type_INT64,
   384  			)
   385  			result = append(result, bitPacked...)
   386  		}
   387  	}
   388  
   389  	return result
   390  }
   391  
   392  func valuesToDeltaBytes(values interface{}, dataType parquet.Type) []byte {
   393  	switch dataType {
   394  	case parquet.Type_INT32:
   395  		return int32ToDeltaBytes(values.([]int32))
   396  	case parquet.Type_INT64:
   397  		return int64ToDeltaBytes(values.([]int64))
   398  	}
   399  
   400  	return nil
   401  }
   402  
   403  func stringsToDeltaLengthByteArrayBytes(strs []string) []byte {
   404  	lengths := make([]int32, len(strs))
   405  	for i, s := range strs {
   406  		lengths[i] = int32(len(s))
   407  	}
   408  
   409  	result := int32ToDeltaBytes(lengths)
   410  	for _, s := range strs {
   411  		result = append(result, []byte(s)...)
   412  	}
   413  
   414  	return result
   415  }
   416  
   417  func stringsToDeltaByteArrayBytes(strs []string) []byte {
   418  	prefixLengths := make([]int32, len(strs))
   419  	suffixes := make([]string, len(strs))
   420  
   421  	var i, j int
   422  	for i = 1; i < len(strs); i++ {
   423  		for j = 0; j < len(strs[i-1]) && j < len(strs[i]); j++ {
   424  			if strs[i-1][j] != strs[i][j] {
   425  				break
   426  			}
   427  		}
   428  
   429  		prefixLengths[i] = int32(j)
   430  		suffixes[i] = strs[i][j:]
   431  	}
   432  
   433  	result := int32ToDeltaBytes(prefixLengths)
   434  	return append(result, stringsToDeltaLengthByteArrayBytes(suffixes)...)
   435  }
   436  
   437  func encodeValues(values interface{}, dataType parquet.Type, encoding parquet.Encoding, bitWidth int32) []byte {
   438  	switch encoding {
   439  	case parquet.Encoding_RLE:
   440  		return valuesToRLEBitPackedHybridBytes(values, bitWidth, dataType)
   441  	case parquet.Encoding_DELTA_BINARY_PACKED:
   442  		return valuesToDeltaBytes(values, dataType)
   443  	case parquet.Encoding_DELTA_BYTE_ARRAY:
   444  		return stringsToDeltaByteArrayBytes(values.([]string))
   445  	case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
   446  		return stringsToDeltaLengthByteArrayBytes(values.([]string))
   447  	}
   448  
   449  	return valuesToBytes(values, dataType)
   450  }