storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/decode.go (about)

     1  /*
     2   * Minio Cloud Storage, (C) 2018 Minio, Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package parquet
    18  
import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"math"

	"storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
)
    27  
    28  func i64sToi32s(i64s []int64) (i32s []int32) {
    29  	i32s = make([]int32, len(i64s))
    30  	for i := range i64s {
    31  		i32s[i] = int32(i64s[i])
    32  	}
    33  
    34  	return i32s
    35  }
    36  
    37  func readBitPacked(reader *bytes.Reader, header, bitWidth uint64) (result []int64, err error) {
    38  	count := header * 8
    39  
    40  	if count == 0 {
    41  		return result, nil
    42  	}
    43  
    44  	if bitWidth == 0 {
    45  		return make([]int64, count), nil
    46  	}
    47  
    48  	data := make([]byte, header*bitWidth)
    49  	if _, err = reader.Read(data); err != nil {
    50  		return nil, err
    51  	}
    52  
    53  	var val, used, left, b uint64
    54  
    55  	valNeedBits := bitWidth
    56  	i := -1
    57  	for {
    58  		if left <= 0 {
    59  			i++
    60  			if i >= len(data) {
    61  				break
    62  			}
    63  
    64  			b = uint64(data[i])
    65  			left = 8
    66  			used = 0
    67  		}
    68  
    69  		if left >= valNeedBits {
    70  			val |= ((b >> used) & ((1 << valNeedBits) - 1)) << (bitWidth - valNeedBits)
    71  			result = append(result, int64(val))
    72  			val = 0
    73  			left -= valNeedBits
    74  			used += valNeedBits
    75  			valNeedBits = bitWidth
    76  		} else {
    77  			val |= (b >> used) << (bitWidth - valNeedBits)
    78  			valNeedBits -= left
    79  			left = 0
    80  		}
    81  	}
    82  
    83  	return result, nil
    84  }
    85  
    86  func readBools(reader *bytes.Reader, count uint64) (result []bool, err error) {
    87  	i64s, err := readBitPacked(reader, count, 1)
    88  	if err != nil {
    89  		return nil, err
    90  	}
    91  
    92  	var i uint64
    93  	for i = 0; i < count; i++ {
    94  		result = append(result, i64s[i] > 0)
    95  	}
    96  
    97  	return result, nil
    98  }
    99  
   100  func readInt32s(reader *bytes.Reader, count uint64) (result []int32, err error) {
   101  	buf := make([]byte, 4)
   102  
   103  	var i uint64
   104  	for i = 0; i < count; i++ {
   105  		if _, err = reader.Read(buf); err != nil {
   106  			return nil, err
   107  		}
   108  
   109  		result = append(result, int32(bytesToUint32(buf)))
   110  	}
   111  
   112  	return result, nil
   113  }
   114  
   115  func readInt64s(reader *bytes.Reader, count uint64) (result []int64, err error) {
   116  	buf := make([]byte, 8)
   117  
   118  	var i uint64
   119  	for i = 0; i < count; i++ {
   120  		if _, err = reader.Read(buf); err != nil {
   121  			return nil, err
   122  		}
   123  
   124  		result = append(result, int64(bytesToUint64(buf)))
   125  	}
   126  
   127  	return result, nil
   128  }
   129  
   130  func readInt96s(reader *bytes.Reader, count uint64) (result [][]byte, err error) {
   131  	var i uint64
   132  	for i = 0; i < count; i++ {
   133  		buf := make([]byte, 12)
   134  
   135  		if _, err = reader.Read(buf); err != nil {
   136  			return nil, err
   137  		}
   138  
   139  		result = append(result, buf)
   140  	}
   141  
   142  	return result, nil
   143  }
   144  
   145  func readFloats(reader *bytes.Reader, count uint64) (result []float32, err error) {
   146  	buf := make([]byte, 4)
   147  
   148  	var i uint64
   149  	for i = 0; i < count; i++ {
   150  		if _, err = reader.Read(buf); err != nil {
   151  			return nil, err
   152  		}
   153  
   154  		result = append(result, math.Float32frombits(bytesToUint32(buf)))
   155  	}
   156  
   157  	return result, nil
   158  }
   159  
   160  func readDoubles(reader *bytes.Reader, count uint64) (result []float64, err error) {
   161  	buf := make([]byte, 8)
   162  
   163  	var i uint64
   164  	for i = 0; i < count; i++ {
   165  		if _, err = reader.Read(buf); err != nil {
   166  			return nil, err
   167  		}
   168  
   169  		result = append(result, math.Float64frombits(bytesToUint64(buf)))
   170  	}
   171  
   172  	return result, nil
   173  }
   174  
   175  func readByteArrays(reader *bytes.Reader, count uint64) (result [][]byte, err error) {
   176  	buf := make([]byte, 4)
   177  	var length uint32
   178  	var data []byte
   179  
   180  	var i uint64
   181  	for i = 0; i < count; i++ {
   182  		if _, err = reader.Read(buf); err != nil {
   183  			return nil, err
   184  		}
   185  
   186  		length = bytesToUint32(buf)
   187  		data = make([]byte, length)
   188  		if length > 0 {
   189  			if _, err = reader.Read(data); err != nil {
   190  				return nil, err
   191  			}
   192  		}
   193  
   194  		result = append(result, data)
   195  	}
   196  
   197  	return result, nil
   198  }
   199  
   200  func readFixedLenByteArrays(reader *bytes.Reader, count, length uint64) (result [][]byte, err error) {
   201  	var i uint64
   202  	for i = 0; i < count; i++ {
   203  		data := make([]byte, length)
   204  		if _, err = reader.Read(data); err != nil {
   205  			return nil, err
   206  		}
   207  
   208  		result = append(result, data)
   209  	}
   210  
   211  	return result, nil
   212  }
   213  
   214  func readValues(reader *bytes.Reader, dataType parquet.Type, count, length uint64) (interface{}, error) {
   215  	switch dataType {
   216  	case parquet.Type_BOOLEAN:
   217  		return readBools(reader, count)
   218  	case parquet.Type_INT32:
   219  		return readInt32s(reader, count)
   220  	case parquet.Type_INT64:
   221  		return readInt64s(reader, count)
   222  	case parquet.Type_INT96:
   223  		return readInt96s(reader, count)
   224  	case parquet.Type_FLOAT:
   225  		return readFloats(reader, count)
   226  	case parquet.Type_DOUBLE:
   227  		return readDoubles(reader, count)
   228  	case parquet.Type_BYTE_ARRAY:
   229  		return readByteArrays(reader, count)
   230  	case parquet.Type_FIXED_LEN_BYTE_ARRAY:
   231  		return readFixedLenByteArrays(reader, count, length)
   232  	}
   233  
   234  	return nil, fmt.Errorf("unknown parquet type %v", dataType)
   235  }
   236  
   237  func readUnsignedVarInt(reader *bytes.Reader) (v uint64, err error) {
   238  	var b byte
   239  	var shift uint64
   240  
   241  	for {
   242  		if b, err = reader.ReadByte(); err != nil {
   243  			return 0, err
   244  		}
   245  
   246  		if v |= ((uint64(b) & 0x7F) << shift); b&0x80 == 0 {
   247  			break
   248  		}
   249  
   250  		shift += 7
   251  	}
   252  
   253  	return v, nil
   254  }
   255  
   256  func readRLE(reader *bytes.Reader, header, bitWidth uint64) (result []int64, err error) {
   257  	width := (bitWidth + 7) / 8
   258  	data := make([]byte, width)
   259  	if width > 0 {
   260  		if _, err = reader.Read(data); err != nil {
   261  			return nil, err
   262  		}
   263  	}
   264  
   265  	if width < 4 {
   266  		data = append(data, make([]byte, 4-width)...)
   267  	}
   268  
   269  	val := int64(bytesToUint32(data))
   270  	count := header >> 1
   271  	if count > math.MaxInt64/8 {
   272  		// 8 bytes/element.
   273  		return nil, errors.New("parquet: size too large")
   274  	}
   275  	result = make([]int64, count)
   276  	for i := range result {
   277  		result[i] = val
   278  	}
   279  
   280  	return result, nil
   281  }
   282  
   283  func readRLEBitPackedHybrid(reader *bytes.Reader, length, bitWidth uint64) (result []int64, err error) {
   284  	if length <= 0 {
   285  		var i32s []int32
   286  		i32s, err = readInt32s(reader, 1)
   287  		if err != nil {
   288  			return nil, err
   289  		}
   290  		if i32s[0] < 0 {
   291  			return nil, errors.New("parquet: negative RLEBitPackedHybrid length")
   292  		}
   293  		length = uint64(i32s[0])
   294  	}
   295  
   296  	buf := make([]byte, length)
   297  	if _, err = reader.Read(buf); err != nil {
   298  		return nil, err
   299  	}
   300  
   301  	reader = bytes.NewReader(buf)
   302  	for reader.Len() > 0 {
   303  		header, err := readUnsignedVarInt(reader)
   304  		if err != nil {
   305  			return nil, err
   306  		}
   307  
   308  		var i64s []int64
   309  		if header&1 == 0 {
   310  			i64s, err = readRLE(reader, header, bitWidth)
   311  		} else {
   312  			i64s, err = readBitPacked(reader, header>>1, bitWidth)
   313  		}
   314  
   315  		if err != nil {
   316  			return nil, err
   317  		}
   318  
   319  		result = append(result, i64s...)
   320  	}
   321  
   322  	return result, nil
   323  }
   324  
   325  func readDeltaBinaryPackedInt(reader *bytes.Reader) (result []int64, err error) {
   326  	blockSize, err := readUnsignedVarInt(reader)
   327  	if err != nil {
   328  		return nil, err
   329  	}
   330  
   331  	numMiniblocksInBlock, err := readUnsignedVarInt(reader)
   332  	if err != nil {
   333  		return nil, err
   334  	}
   335  
   336  	numValues, err := readUnsignedVarInt(reader)
   337  	if err != nil {
   338  		return nil, err
   339  	}
   340  
   341  	firstValueZigZag, err := readUnsignedVarInt(reader)
   342  	if err != nil {
   343  		return nil, err
   344  	}
   345  
   346  	v := int64(firstValueZigZag>>1) ^ (-int64(firstValueZigZag & 1))
   347  	result = append(result, v)
   348  	if numMiniblocksInBlock == 0 {
   349  		return nil, errors.New("parquet: zero mini blocks in block")
   350  	}
   351  	numValuesInMiniBlock := blockSize / numMiniblocksInBlock
   352  
   353  	bitWidths := make([]uint64, numMiniblocksInBlock)
   354  	for uint64(len(result)) < numValues {
   355  		minDeltaZigZag, err := readUnsignedVarInt(reader)
   356  		if err != nil {
   357  			return nil, err
   358  		}
   359  
   360  		for i := 0; uint64(i) < numMiniblocksInBlock; i++ {
   361  			b, err := reader.ReadByte()
   362  			if err != nil {
   363  				return nil, err
   364  			}
   365  			bitWidths[i] = uint64(b)
   366  		}
   367  
   368  		minDelta := int64(minDeltaZigZag>>1) ^ (-int64(minDeltaZigZag & 1))
   369  		for i := 0; uint64(i) < numMiniblocksInBlock; i++ {
   370  			i64s, err := readBitPacked(reader, numValuesInMiniBlock/8, bitWidths[i])
   371  			if err != nil {
   372  				return nil, err
   373  			}
   374  
   375  			for j := range i64s {
   376  				v += i64s[j] + minDelta
   377  				result = append(result, v)
   378  			}
   379  		}
   380  	}
   381  
   382  	return result[:numValues], nil
   383  }
   384  
   385  func readDeltaLengthByteArrays(reader *bytes.Reader) (result [][]byte, err error) {
   386  	i64s, err := readDeltaBinaryPackedInt(reader)
   387  	if err != nil {
   388  		return nil, err
   389  	}
   390  
   391  	for i := 0; i < len(i64s); i++ {
   392  		arrays, err := readFixedLenByteArrays(reader, 1, uint64(i64s[i]))
   393  		if err != nil {
   394  			return nil, err
   395  		}
   396  
   397  		result = append(result, arrays[0])
   398  	}
   399  
   400  	return result, nil
   401  }
   402  
   403  func readDeltaByteArrays(reader *bytes.Reader) (result [][]byte, err error) {
   404  	i64s, err := readDeltaBinaryPackedInt(reader)
   405  	if err != nil {
   406  		return nil, err
   407  	}
   408  
   409  	suffixes, err := readDeltaLengthByteArrays(reader)
   410  	if err != nil {
   411  		return nil, err
   412  	}
   413  
   414  	result = append(result, suffixes[0])
   415  	for i := 1; i < len(i64s); i++ {
   416  		prefixLength := i64s[i]
   417  		val := append([]byte{}, result[i-1][:prefixLength]...)
   418  		val = append(val, suffixes[i]...)
   419  		result = append(result, val)
   420  	}
   421  
   422  	return result, nil
   423  }
   424  
   425  func readDataPageValues(
   426  	bytesReader *bytes.Reader,
   427  	encoding parquet.Encoding,
   428  	dataType parquet.Type,
   429  	convertedType parquet.ConvertedType,
   430  	count, bitWidth uint64,
   431  ) (result interface{}, resultDataType parquet.Type, err error) {
   432  	switch encoding {
   433  	case parquet.Encoding_PLAIN:
   434  		result, err = readValues(bytesReader, dataType, count, bitWidth)
   435  		return result, dataType, err
   436  
   437  	case parquet.Encoding_PLAIN_DICTIONARY:
   438  		b, err := bytesReader.ReadByte()
   439  		if err != nil {
   440  			return nil, -1, err
   441  		}
   442  
   443  		i64s, err := readRLEBitPackedHybrid(bytesReader, uint64(bytesReader.Len()), uint64(b))
   444  		if err != nil {
   445  			return nil, -1, err
   446  		}
   447  		if len(i64s) < int(count) || count > math.MaxInt64/8 {
   448  			return nil, -1, errors.New("parquet: value out of range")
   449  		}
   450  		return i64s[:count], parquet.Type_INT64, nil
   451  
   452  	case parquet.Encoding_RLE:
   453  		i64s, err := readRLEBitPackedHybrid(bytesReader, 0, bitWidth)
   454  		if err != nil {
   455  			return nil, -1, err
   456  		}
   457  
   458  		if len(i64s) < int(count) || count > math.MaxInt64/8 {
   459  			return nil, -1, errors.New("parquet: value out of range")
   460  		}
   461  		i64s = i64s[:count]
   462  
   463  		if dataType == parquet.Type_INT32 {
   464  			return i64sToi32s(i64s), parquet.Type_INT32, nil
   465  		}
   466  
   467  		return i64s, parquet.Type_INT64, nil
   468  
   469  	case parquet.Encoding_BIT_PACKED:
   470  		return nil, -1, fmt.Errorf("deprecated parquet encoding %v", parquet.Encoding_BIT_PACKED)
   471  
   472  	case parquet.Encoding_DELTA_BINARY_PACKED:
   473  		i64s, err := readDeltaBinaryPackedInt(bytesReader)
   474  		if err != nil {
   475  			return nil, -1, err
   476  		}
   477  
   478  		if len(i64s) < int(count) || count > math.MaxInt64/8 {
   479  			return nil, -1, errors.New("parquet: value out of range")
   480  		}
   481  		i64s = i64s[:count]
   482  
   483  		if dataType == parquet.Type_INT32 {
   484  			return i64sToi32s(i64s), parquet.Type_INT32, nil
   485  		}
   486  
   487  		return i64s, parquet.Type_INT64, nil
   488  
   489  	case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
   490  		byteSlices, err := readDeltaLengthByteArrays(bytesReader)
   491  		if err != nil {
   492  			return nil, -1, err
   493  		}
   494  		if len(byteSlices) < int(count) || count > math.MaxInt64/24 {
   495  			return nil, -1, errors.New("parquet: value out of range")
   496  		}
   497  
   498  		return byteSlices[:count], parquet.Type_FIXED_LEN_BYTE_ARRAY, nil
   499  
   500  	case parquet.Encoding_DELTA_BYTE_ARRAY:
   501  		byteSlices, err := readDeltaByteArrays(bytesReader)
   502  		if err != nil {
   503  			return nil, -1, err
   504  		}
   505  		if len(byteSlices) < int(count) || count > math.MaxInt64/24 {
   506  			return nil, -1, errors.New("parquet: value out of range")
   507  		}
   508  
   509  		return byteSlices[:count], parquet.Type_FIXED_LEN_BYTE_ARRAY, nil
   510  	}
   511  
   512  	return nil, -1, fmt.Errorf("unsupported parquet encoding %v", encoding)
   513  }