github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/encoding/levels_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding_test 18 19 import ( 20 "encoding/binary" 21 "strconv" 22 "testing" 23 24 "github.com/apache/arrow/go/v14/arrow" 25 "github.com/apache/arrow/go/v14/arrow/memory" 26 "github.com/apache/arrow/go/v14/internal/utils" 27 "github.com/apache/arrow/go/v14/parquet" 28 "github.com/apache/arrow/go/v14/parquet/internal/encoding" 29 "github.com/stretchr/testify/assert" 30 ) 31 32 func generateLevels(minRepeat, maxRepeat int, maxLevel int16) []int16 { 33 // for each repetition count up to max repeat 34 ret := make([]int16, 0) 35 for rep := minRepeat; rep <= maxRepeat; rep++ { 36 var ( 37 repCount = 1 << rep 38 val int16 = 0 39 bwidth = 0 40 ) 41 // generate levels for repetition count up to max level 42 for val <= maxLevel { 43 for i := 0; i < repCount; i++ { 44 ret = append(ret, val) 45 } 46 val = int16((2 << bwidth) - 1) 47 bwidth++ 48 } 49 } 50 return ret 51 } 52 53 func encodeLevels(t *testing.T, enc parquet.Encoding, maxLvl int16, numLevels int, input []int16) []byte { 54 var ( 55 encoder encoding.LevelEncoder 56 lvlCount = 0 57 buf = encoding.NewBufferWriter(2*numLevels, memory.DefaultAllocator) 58 ) 59 60 if enc == parquet.Encodings.RLE { 61 buf.SetOffset(arrow.Int32SizeBytes) 62 // leave space to write the rle length value 63 encoder.Init(enc, maxLvl, buf) 64 lvlCount, _ = encoder.Encode(input) 65 buf.SetOffset(0) 66 arrow.Int32Traits.CastFromBytes(buf.Bytes())[0] = utils.ToLEInt32(int32(encoder.Len())) 67 } else { 68 encoder.Init(enc, maxLvl, buf) 69 lvlCount, _ = encoder.Encode(input) 70 } 71 72 assert.Equal(t, numLevels, lvlCount) 73 return buf.Bytes() 74 } 75 76 func verifyDecodingLvls(t *testing.T, enc parquet.Encoding, maxLvl int16, input []int16, buf []byte) { 77 var ( 78 decoder encoding.LevelDecoder 79 lvlCount = 0 80 numLevels = len(input) 81 output = make([]int16, numLevels) 82 decodeCount = 4 83 numInnerLevels = numLevels / decodeCount 84 ) 85 86 // decode levels and test with multiple decode calls 87 _, err := decoder.SetData(enc, maxLvl, numLevels, buf) 88 assert.NoError(t, err) 89 // try multiple decoding on a single setdata call 90 for ct := 0; ct < decodeCount; ct++ { 91 offset := ct * numInnerLevels 92 lvlCount, _ = decoder.Decode(output[:numInnerLevels]) 93 assert.Equal(t, numInnerLevels, lvlCount) 94 assert.Equal(t, input[offset:offset+numInnerLevels], output[:numInnerLevels]) 95 } 96 97 // check the remaining levels 98 var ( 99 levelsCompleted = decodeCount * (numLevels / decodeCount) 100 remaining = numLevels - levelsCompleted 101 ) 102 103 if remaining > 0 { 104 lvlCount, _ = decoder.Decode(output[:remaining]) 105 assert.Equal(t, remaining, lvlCount) 106 assert.Equal(t, input[levelsCompleted:], output[:remaining]) 107 } 108 // test decode zero values 109 lvlCount, _ = decoder.Decode(output[:1]) 110 assert.Zero(t, lvlCount) 111 } 112 113 func verifyDecodingMultipleSetData(t *testing.T, enc parquet.Encoding, max int16, input []int16, buf [][]byte) { 114 var ( 115 decoder encoding.LevelDecoder 116 lvlCount = 0 117 setdataCount = len(buf) 118 numLevels = len(input) / setdataCount 119 output = make([]int16, numLevels) 120 ) 121 122 for ct := 0; ct < setdataCount; ct++ { 123 offset := ct * numLevels 124 assert.Len(t, output, numLevels) 125 _, err := decoder.SetData(enc, max, numLevels, buf[ct]) 126 assert.NoError(t, err) 127 lvlCount, _ = decoder.Decode(output) 128 assert.Equal(t, numLevels, lvlCount) 129 assert.Equal(t, input[offset:offset+numLevels], output) 130 } 131 } 132 133 func TestLevelsDecodeMultipleBitWidth(t *testing.T) { 134 t.Parallel() 135 // Test levels with maximum bit-width from 1 to 8 136 // increase the repetition count for each iteration by a factor of 2 137 var ( 138 minRepeat = 0 139 maxRepeat = 7 // 128 140 maxBitWidth = 8 141 input []int16 142 buf []byte 143 encodings = [2]parquet.Encoding{parquet.Encodings.RLE, parquet.Encodings.BitPacked} 144 ) 145 146 for _, enc := range encodings { 147 t.Run(enc.String(), func(t *testing.T) { 148 // bitpacked requires a sequence of at least 8 149 if enc == parquet.Encodings.BitPacked { 150 minRepeat = 3 151 } 152 // for each max bit width 153 for bitWidth := 1; bitWidth <= maxBitWidth; bitWidth++ { 154 t.Run(strconv.Itoa(bitWidth), func(t *testing.T) { 155 max := int16((1 << bitWidth) - 1) 156 // generate levels 157 input = generateLevels(minRepeat, maxRepeat, max) 158 assert.NotPanics(t, func() { 159 buf = encodeLevels(t, enc, max, len(input), input) 160 }) 161 assert.NotPanics(t, func() { 162 verifyDecodingLvls(t, enc, max, input, buf) 163 }) 164 }) 165 } 166 }) 167 } 168 } 169 170 func TestLevelsDecodeMultipleSetData(t *testing.T) { 171 t.Parallel() 172 173 var ( 174 minRepeat = 3 175 maxRepeat = 7 176 bitWidth = 8 177 maxLevel = int16((1 << bitWidth) - 1) 178 encodings = [2]parquet.Encoding{parquet.Encodings.RLE, parquet.Encodings.BitPacked} 179 ) 180 181 input := generateLevels(minRepeat, maxRepeat, maxLevel) 182 183 var ( 184 numLevels = len(input) 185 setdataFactor = 8 186 splitLevelSize = numLevels / setdataFactor 187 buf = make([][]byte, setdataFactor) 188 ) 189 190 for _, enc := range encodings { 191 t.Run(enc.String(), func(t *testing.T) { 192 for rf := 0; rf < setdataFactor; rf++ { 193 offset := rf * splitLevelSize 194 assert.NotPanics(t, func() { 195 buf[rf] = encodeLevels(t, enc, maxLevel, splitLevelSize, input[offset:offset+splitLevelSize]) 196 }) 197 } 198 assert.NotPanics(t, func() { 199 verifyDecodingMultipleSetData(t, enc, maxLevel, input, buf) 200 }) 201 }) 202 } 203 } 204 205 func TestMinimumBufferSize(t *testing.T) { 206 t.Parallel() 207 208 const numToEncode = 1024 209 levels := make([]int16, numToEncode) 210 211 for idx := range levels { 212 if idx%9 == 0 { 213 levels[idx] = 0 214 } else { 215 levels[idx] = 1 216 } 217 } 218 219 output := encoding.NewBufferWriter(0, memory.DefaultAllocator) 220 221 var encoder encoding.LevelEncoder 222 encoder.Init(parquet.Encodings.RLE, 1, output) 223 count, _ := encoder.Encode(levels) 224 assert.Equal(t, numToEncode, count) 225 } 226 227 func TestMinimumBufferSize2(t *testing.T) { 228 t.Parallel() 229 230 // test the worst case for bit_width=2 consisting of 231 // LiteralRun(size=8) 232 // RepeatedRun(size=8) 233 // LiteralRun(size=8) 234 // ... 235 const numToEncode = 1024 236 levels := make([]int16, numToEncode) 237 238 for idx := range levels { 239 // This forces a literal run of 00000001 240 // followed by eight 1s 241 if (idx % 16) < 7 { 242 levels[idx] = 0 243 } else { 244 levels[idx] = 1 245 } 246 } 247 248 for bitWidth := int16(1); bitWidth <= 8; bitWidth++ { 249 output := encoding.NewBufferWriter(0, memory.DefaultAllocator) 250 251 var encoder encoding.LevelEncoder 252 encoder.Init(parquet.Encodings.RLE, bitWidth, output) 253 count, _ := encoder.Encode(levels) 254 assert.Equal(t, numToEncode, count) 255 } 256 } 257 258 func TestEncodeDecodeLevels(t *testing.T) { 259 t.Parallel() 260 const numToEncode = 2048 261 levels := make([]int16, numToEncode) 262 numones := 0 263 for idx := range levels { 264 if (idx % 16) < 7 { 265 levels[idx] = 0 266 } else { 267 levels[idx] = 1 268 numones++ 269 } 270 } 271 272 output := encoding.NewBufferWriter(0, memory.DefaultAllocator) 273 274 var encoder encoding.LevelEncoder 275 encoder.Init(parquet.Encodings.RLE, 1, output) 276 count, _ := encoder.Encode(levels) 277 assert.Equal(t, numToEncode, count) 278 encoder.Flush() 279 280 buf := output.Bytes() 281 var prefix [4]byte 282 binary.LittleEndian.PutUint32(prefix[:], uint32(len(buf))) 283 284 var decoder encoding.LevelDecoder 285 _, err := decoder.SetData(parquet.Encodings.RLE, 1, numToEncode, append(prefix[:], buf...)) 286 assert.NoError(t, err) 287 288 var levelOut [numToEncode]int16 289 total, vals := decoder.Decode(levelOut[:]) 290 assert.EqualValues(t, numToEncode, total) 291 assert.EqualValues(t, numones, vals) 292 assert.Equal(t, levels, levelOut[:]) 293 }