github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/encoding/encoding_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding_test 18 19 import ( 20 "fmt" 21 "reflect" 22 "testing" 23 "unsafe" 24 25 "github.com/apache/arrow/go/v7/arrow" 26 "github.com/apache/arrow/go/v7/arrow/bitutil" 27 "github.com/apache/arrow/go/v7/arrow/memory" 28 "github.com/apache/arrow/go/v7/parquet" 29 "github.com/apache/arrow/go/v7/parquet/internal/encoding" 30 "github.com/apache/arrow/go/v7/parquet/internal/testutils" 31 "github.com/apache/arrow/go/v7/parquet/schema" 32 "github.com/stretchr/testify/assert" 33 "github.com/stretchr/testify/suite" 34 ) 35 36 type nodeFactory func(string, parquet.Repetition, int32) *schema.PrimitiveNode 37 38 func createNodeFactory(t reflect.Type) nodeFactory { 39 switch t { 40 case reflect.TypeOf(true): 41 return schema.NewBooleanNode 42 case reflect.TypeOf(int32(0)): 43 return schema.NewInt32Node 44 case reflect.TypeOf(int64(0)): 45 return schema.NewInt64Node 46 case reflect.TypeOf(parquet.Int96{}): 47 return schema.NewInt96Node 48 case reflect.TypeOf(float32(0)): 49 return schema.NewFloat32Node 50 case reflect.TypeOf(float64(0)): 51 return schema.NewFloat64Node 52 case reflect.TypeOf(parquet.ByteArray{}): 53 return schema.NewByteArrayNode 54 case reflect.TypeOf(parquet.FixedLenByteArray{}): 55 return func(name string, rep parquet.Repetition, field int32) *schema.PrimitiveNode { 56 return schema.NewFixedLenByteArrayNode(name, rep, 12, field) 57 } 58 } 59 return nil 60 } 61 62 func initdata(t reflect.Type, drawbuf, decodebuf []byte, nvals, repeats int, heap *memory.Buffer) (interface{}, interface{}) { 63 switch t { 64 case reflect.TypeOf(true): 65 draws := *(*[]bool)(unsafe.Pointer(&drawbuf)) 66 decode := *(*[]bool)(unsafe.Pointer(&decodebuf)) 67 testutils.InitValues(draws[:nvals], heap) 68 69 for j := 1; j < repeats; j++ { 70 for k := 0; k < nvals; k++ { 71 draws[nvals*j+k] = draws[k] 72 } 73 } 74 75 return draws[:nvals*repeats], decode[:nvals*repeats] 76 case reflect.TypeOf(int32(0)): 77 draws := arrow.Int32Traits.CastFromBytes(drawbuf) 78 decode := arrow.Int32Traits.CastFromBytes(decodebuf) 79 testutils.InitValues(draws[:nvals], heap) 80 81 for j := 1; j < repeats; j++ { 82 for k := 0; k < nvals; k++ { 83 draws[nvals*j+k] = draws[k] 84 } 85 } 86 87 return draws[:nvals*repeats], decode[:nvals*repeats] 88 case reflect.TypeOf(int64(0)): 89 draws := arrow.Int64Traits.CastFromBytes(drawbuf) 90 decode := arrow.Int64Traits.CastFromBytes(decodebuf) 91 testutils.InitValues(draws[:nvals], heap) 92 93 for j := 1; j < repeats; j++ { 94 for k := 0; k < nvals; k++ { 95 draws[nvals*j+k] = draws[k] 96 } 97 } 98 99 return draws[:nvals*repeats], decode[:nvals*repeats] 100 case reflect.TypeOf(parquet.Int96{}): 101 draws := parquet.Int96Traits.CastFromBytes(drawbuf) 102 decode := parquet.Int96Traits.CastFromBytes(decodebuf) 103 testutils.InitValues(draws[:nvals], heap) 104 105 for j := 1; j < repeats; j++ { 106 for k := 0; k < nvals; k++ { 107 draws[nvals*j+k] = draws[k] 108 } 109 } 110 111 return draws[:nvals*repeats], decode[:nvals*repeats] 112 case reflect.TypeOf(float32(0)): 113 draws := arrow.Float32Traits.CastFromBytes(drawbuf) 114 decode := arrow.Float32Traits.CastFromBytes(decodebuf) 115 testutils.InitValues(draws[:nvals], heap) 116 117 for j := 1; j < repeats; j++ { 118 for k := 0; k < nvals; k++ { 119 draws[nvals*j+k] = draws[k] 120 } 121 } 122 123 return draws[:nvals*repeats], decode[:nvals*repeats] 124 case reflect.TypeOf(float64(0)): 125 draws := arrow.Float64Traits.CastFromBytes(drawbuf) 126 decode := arrow.Float64Traits.CastFromBytes(decodebuf) 127 testutils.InitValues(draws[:nvals], heap) 128 129 for j := 1; j < repeats; j++ { 130 for k := 0; k < nvals; k++ { 131 draws[nvals*j+k] = draws[k] 132 } 133 } 134 135 return draws[:nvals*repeats], decode[:nvals*repeats] 136 case reflect.TypeOf(parquet.ByteArray{}): 137 draws := make([]parquet.ByteArray, nvals*repeats) 138 decode := make([]parquet.ByteArray, nvals*repeats) 139 testutils.InitValues(draws[:nvals], heap) 140 141 for j := 1; j < repeats; j++ { 142 for k := 0; k < nvals; k++ { 143 draws[nvals*j+k] = draws[k] 144 } 145 } 146 147 return draws[:nvals*repeats], decode[:nvals*repeats] 148 case reflect.TypeOf(parquet.FixedLenByteArray{}): 149 draws := make([]parquet.FixedLenByteArray, nvals*repeats) 150 decode := make([]parquet.FixedLenByteArray, nvals*repeats) 151 testutils.InitValues(draws[:nvals], heap) 152 153 for j := 1; j < repeats; j++ { 154 for k := 0; k < nvals; k++ { 155 draws[nvals*j+k] = draws[k] 156 } 157 } 158 159 return draws[:nvals*repeats], decode[:nvals*repeats] 160 } 161 return nil, nil 162 } 163 164 func encode(enc encoding.TypedEncoder, vals interface{}) { 165 switch v := vals.(type) { 166 case []bool: 167 enc.(encoding.BooleanEncoder).Put(v) 168 case []int32: 169 enc.(encoding.Int32Encoder).Put(v) 170 case []int64: 171 enc.(encoding.Int64Encoder).Put(v) 172 case []parquet.Int96: 173 enc.(encoding.Int96Encoder).Put(v) 174 case []float32: 175 enc.(encoding.Float32Encoder).Put(v) 176 case []float64: 177 enc.(encoding.Float64Encoder).Put(v) 178 case []parquet.ByteArray: 179 enc.(encoding.ByteArrayEncoder).Put(v) 180 case []parquet.FixedLenByteArray: 181 enc.(encoding.FixedLenByteArrayEncoder).Put(v) 182 } 183 } 184 185 func encodeSpaced(enc encoding.TypedEncoder, vals interface{}, validBits []byte, validBitsOffset int64) { 186 switch v := vals.(type) { 187 case []bool: 188 enc.(encoding.BooleanEncoder).PutSpaced(v, validBits, validBitsOffset) 189 case []int32: 190 enc.(encoding.Int32Encoder).PutSpaced(v, validBits, validBitsOffset) 191 case []int64: 192 enc.(encoding.Int64Encoder).PutSpaced(v, validBits, validBitsOffset) 193 case []parquet.Int96: 194 enc.(encoding.Int96Encoder).PutSpaced(v, validBits, validBitsOffset) 195 case []float32: 196 enc.(encoding.Float32Encoder).PutSpaced(v, validBits, validBitsOffset) 197 case []float64: 198 enc.(encoding.Float64Encoder).PutSpaced(v, validBits, validBitsOffset) 199 case []parquet.ByteArray: 200 enc.(encoding.ByteArrayEncoder).PutSpaced(v, validBits, validBitsOffset) 201 case []parquet.FixedLenByteArray: 202 enc.(encoding.FixedLenByteArrayEncoder).PutSpaced(v, validBits, validBitsOffset) 203 } 204 } 205 206 func decode(dec encoding.TypedDecoder, out interface{}) (int, error) { 207 switch v := out.(type) { 208 case []bool: 209 return dec.(encoding.BooleanDecoder).Decode(v) 210 case []int32: 211 return dec.(encoding.Int32Decoder).Decode(v) 212 case []int64: 213 return dec.(encoding.Int64Decoder).Decode(v) 214 case []parquet.Int96: 215 return dec.(encoding.Int96Decoder).Decode(v) 216 case []float32: 217 return dec.(encoding.Float32Decoder).Decode(v) 218 case []float64: 219 return dec.(encoding.Float64Decoder).Decode(v) 220 case []parquet.ByteArray: 221 return dec.(encoding.ByteArrayDecoder).Decode(v) 222 case []parquet.FixedLenByteArray: 223 return dec.(encoding.FixedLenByteArrayDecoder).Decode(v) 224 } 225 return 0, nil 226 } 227 228 func decodeSpaced(dec encoding.TypedDecoder, out interface{}, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { 229 switch v := out.(type) { 230 case []bool: 231 return dec.(encoding.BooleanDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 232 case []int32: 233 return dec.(encoding.Int32Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 234 case []int64: 235 return dec.(encoding.Int64Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 236 case []parquet.Int96: 237 return dec.(encoding.Int96Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 238 case []float32: 239 return dec.(encoding.Float32Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 240 case []float64: 241 return dec.(encoding.Float64Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 242 case []parquet.ByteArray: 243 return dec.(encoding.ByteArrayDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 244 case []parquet.FixedLenByteArray: 245 return dec.(encoding.FixedLenByteArrayDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 246 } 247 return 0, nil 248 } 249 250 type BaseEncodingTestSuite struct { 251 suite.Suite 252 253 descr *schema.Column 254 typeLen int 255 mem memory.Allocator 256 typ reflect.Type 257 258 nvalues int 259 heap *memory.Buffer 260 inputBytes *memory.Buffer 261 outputBytes *memory.Buffer 262 nodeFactory nodeFactory 263 264 draws interface{} 265 decodeBuf interface{} 266 } 267 268 func (b *BaseEncodingTestSuite) SetupSuite() { 269 b.mem = memory.DefaultAllocator 270 b.inputBytes = memory.NewResizableBuffer(b.mem) 271 b.outputBytes = memory.NewResizableBuffer(b.mem) 272 b.heap = memory.NewResizableBuffer(b.mem) 273 b.nodeFactory = createNodeFactory(b.typ) 274 } 275 276 func (b *BaseEncodingTestSuite) TearDownSuite() { 277 b.inputBytes.Release() 278 b.outputBytes.Release() 279 b.heap.Release() 280 } 281 282 func (b *BaseEncodingTestSuite) SetupTest() { 283 b.descr = schema.NewColumn(b.nodeFactory("name", parquet.Repetitions.Optional, -1), 0, 0) 284 b.typeLen = int(b.descr.TypeLength()) 285 } 286 287 func (b *BaseEncodingTestSuite) initData(nvalues, repeats int) { 288 b.nvalues = nvalues * repeats 289 b.inputBytes.ResizeNoShrink(b.nvalues * int(b.typ.Size())) 290 b.outputBytes.ResizeNoShrink(b.nvalues * int(b.typ.Size())) 291 memory.Set(b.inputBytes.Buf(), 0) 292 memory.Set(b.outputBytes.Buf(), 0) 293 294 b.draws, b.decodeBuf = initdata(b.typ, b.inputBytes.Buf(), b.outputBytes.Buf(), nvalues, repeats, b.heap) 295 } 296 297 func (b *BaseEncodingTestSuite) encodeTestData(e parquet.Encoding) (encoding.Buffer, error) { 298 enc := encoding.NewEncoder(testutils.TypeToParquetType(b.typ), e, false, b.descr, memory.DefaultAllocator) 299 b.Equal(e, enc.Encoding()) 300 b.Equal(b.descr.PhysicalType(), enc.Type()) 301 encode(enc, reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface()) 302 return enc.FlushValues() 303 } 304 305 func (b *BaseEncodingTestSuite) decodeTestData(e parquet.Encoding, buf []byte) { 306 dec := encoding.NewDecoder(testutils.TypeToParquetType(b.typ), e, b.descr, b.mem) 307 b.Equal(e, dec.Encoding()) 308 b.Equal(b.descr.PhysicalType(), dec.Type()) 309 310 dec.SetData(b.nvalues, buf) 311 decoded, _ := decode(dec, b.decodeBuf) 312 b.Equal(b.nvalues, decoded) 313 b.Equal(reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface(), reflect.ValueOf(b.decodeBuf).Slice(0, b.nvalues).Interface()) 314 } 315 316 func (b *BaseEncodingTestSuite) encodeTestDataSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) (encoding.Buffer, error) { 317 enc := encoding.NewEncoder(testutils.TypeToParquetType(b.typ), e, false, b.descr, memory.DefaultAllocator) 318 encodeSpaced(enc, reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface(), validBits, validBitsOffset) 319 return enc.FlushValues() 320 } 321 322 func (b *BaseEncodingTestSuite) decodeTestDataSpaced(e parquet.Encoding, nullCount int, buf []byte, validBits []byte, validBitsOffset int64) { 323 dec := encoding.NewDecoder(testutils.TypeToParquetType(b.typ), e, b.descr, b.mem) 324 dec.SetData(b.nvalues-nullCount, buf) 325 decoded, _ := decodeSpaced(dec, b.decodeBuf, nullCount, validBits, validBitsOffset) 326 b.Equal(b.nvalues, decoded) 327 328 drawval := reflect.ValueOf(b.draws) 329 decodeval := reflect.ValueOf(b.decodeBuf) 330 for j := 0; j < b.nvalues; j++ { 331 if bitutil.BitIsSet(validBits, int(validBitsOffset)+j) { 332 b.Equal(drawval.Index(j).Interface(), decodeval.Index(j).Interface()) 333 } 334 } 335 } 336 337 func (b *BaseEncodingTestSuite) checkRoundTrip(e parquet.Encoding) { 338 buf, _ := b.encodeTestData(e) 339 defer buf.Release() 340 b.decodeTestData(e, buf.Bytes()) 341 } 342 343 func (b *BaseEncodingTestSuite) checkRoundTripSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) { 344 buf, _ := b.encodeTestDataSpaced(e, validBits, validBitsOffset) 345 defer buf.Release() 346 347 nullCount := 0 348 for i := 0; i < b.nvalues; i++ { 349 if bitutil.BitIsNotSet(validBits, int(validBitsOffset)+i) { 350 nullCount++ 351 } 352 } 353 b.decodeTestDataSpaced(e, nullCount, buf.Bytes(), validBits, validBitsOffset) 354 } 355 356 func (b *BaseEncodingTestSuite) TestBasicRoundTrip() { 357 b.initData(10000, 1) 358 b.checkRoundTrip(parquet.Encodings.Plain) 359 } 360 361 func (b *BaseEncodingTestSuite) TestDeltaEncodingRoundTrip() { 362 b.initData(10000, 1) 363 364 switch b.typ { 365 case reflect.TypeOf(int32(0)), reflect.TypeOf(int64(0)): 366 b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked) 367 default: 368 b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked) }) 369 } 370 } 371 372 func (b *BaseEncodingTestSuite) TestDeltaLengthByteArrayRoundTrip() { 373 b.initData(10000, 1) 374 375 switch b.typ { 376 case reflect.TypeOf(parquet.ByteArray{}): 377 b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) 378 default: 379 b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) }) 380 } 381 } 382 383 func (b *BaseEncodingTestSuite) TestDeltaByteArrayRoundTrip() { 384 b.initData(10000, 1) 385 386 switch b.typ { 387 case reflect.TypeOf(parquet.ByteArray{}): 388 b.checkRoundTrip(parquet.Encodings.DeltaByteArray) 389 default: 390 b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) }) 391 } 392 } 393 394 func (b *BaseEncodingTestSuite) TestSpacedRoundTrip() { 395 exec := func(vals, repeats int, validBitsOffset int64, nullProb float64) { 396 b.Run(fmt.Sprintf("%d vals %d repeats %d offset %0.3f null", vals, repeats, validBitsOffset, 1-nullProb), func() { 397 b.initData(vals, repeats) 398 399 size := int64(b.nvalues) + validBitsOffset 400 r := testutils.NewRandomArrayGenerator(1923) 401 arr := r.Uint8(size, 0, 100, 1-nullProb) 402 validBits := arr.NullBitmapBytes() 403 if validBits != nil { 404 b.checkRoundTripSpaced(parquet.Encodings.Plain, validBits, validBitsOffset) 405 switch b.typ { 406 case reflect.TypeOf(int32(0)), reflect.TypeOf(int64(0)): 407 b.checkRoundTripSpaced(parquet.Encodings.DeltaBinaryPacked, validBits, validBitsOffset) 408 case reflect.TypeOf(parquet.ByteArray{}): 409 b.checkRoundTripSpaced(parquet.Encodings.DeltaLengthByteArray, validBits, validBitsOffset) 410 b.checkRoundTripSpaced(parquet.Encodings.DeltaByteArray, validBits, validBitsOffset) 411 } 412 } 413 }) 414 } 415 416 const ( 417 avx512Size = 64 418 simdSize = avx512Size 419 multiSimdSize = simdSize * 33 420 ) 421 422 for _, nullProb := range []float64{0.001, 0.1, 0.5, 0.9, 0.999} { 423 // Test with both size and offset up to 3 simd block 424 for i := 1; i < simdSize*3; i++ { 425 exec(i, 1, 0, nullProb) 426 exec(i, 1, int64(i+1), nullProb) 427 } 428 // large block and offset 429 exec(multiSimdSize, 1, 0, nullProb) 430 exec(multiSimdSize+33, 1, 0, nullProb) 431 exec(multiSimdSize, 1, 33, nullProb) 432 exec(multiSimdSize+33, 1, 33, nullProb) 433 } 434 } 435 436 func TestEncoding(t *testing.T) { 437 tests := []struct { 438 name string 439 typ reflect.Type 440 }{ 441 {"Bool", reflect.TypeOf(true)}, 442 {"Int32", reflect.TypeOf(int32(0))}, 443 {"Int64", reflect.TypeOf(int64(0))}, 444 {"Float32", reflect.TypeOf(float32(0))}, 445 {"Float64", reflect.TypeOf(float64(0))}, 446 {"Int96", reflect.TypeOf(parquet.Int96{})}, 447 {"ByteArray", reflect.TypeOf(parquet.ByteArray{})}, 448 {"FixedLenByteArray", reflect.TypeOf(parquet.FixedLenByteArray{})}, 449 } 450 451 for _, tt := range tests { 452 t.Run(tt.name, func(t *testing.T) { 453 suite.Run(t, &BaseEncodingTestSuite{typ: tt.typ}) 454 }) 455 } 456 } 457 458 type DictionaryEncodingTestSuite struct { 459 BaseEncodingTestSuite 460 } 461 462 func (d *DictionaryEncodingTestSuite) encodeTestDataDict(e parquet.Encoding) (dictBuffer, indices encoding.Buffer, numEntries int) { 463 enc := encoding.NewEncoder(testutils.TypeToParquetType(d.typ), e, true, d.descr, memory.DefaultAllocator).(encoding.DictEncoder) 464 465 d.Equal(parquet.Encodings.PlainDict, enc.Encoding()) 466 d.Equal(d.descr.PhysicalType(), enc.Type()) 467 encode(enc, reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface()) 468 dictBuffer = memory.NewResizableBuffer(d.mem) 469 dictBuffer.Resize(enc.DictEncodedSize()) 470 enc.WriteDict(dictBuffer.Bytes()) 471 indices, _ = enc.FlushValues() 472 numEntries = enc.NumEntries() 473 return 474 } 475 476 func (d *DictionaryEncodingTestSuite) encodeTestDataDictSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) (dictBuffer, indices encoding.Buffer, numEntries int) { 477 enc := encoding.NewEncoder(testutils.TypeToParquetType(d.typ), e, true, d.descr, memory.DefaultAllocator).(encoding.DictEncoder) 478 d.Equal(d.descr.PhysicalType(), enc.Type()) 479 480 encodeSpaced(enc, reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), validBits, validBitsOffset) 481 dictBuffer = memory.NewResizableBuffer(d.mem) 482 dictBuffer.Resize(enc.DictEncodedSize()) 483 enc.WriteDict(dictBuffer.Bytes()) 484 indices, _ = enc.FlushValues() 485 numEntries = enc.NumEntries() 486 return 487 } 488 489 func (d *DictionaryEncodingTestSuite) checkRoundTrip() { 490 dictBuffer, indices, numEntries := d.encodeTestDataDict(parquet.Encodings.Plain) 491 defer dictBuffer.Release() 492 defer indices.Release() 493 validBits := make([]byte, int(bitutil.BytesForBits(int64(d.nvalues)))+1) 494 memory.Set(validBits, 255) 495 496 spacedBuffer, indicesSpaced, _ := d.encodeTestDataDictSpaced(parquet.Encodings.Plain, validBits, 0) 497 defer spacedBuffer.Release() 498 defer indicesSpaced.Release() 499 d.Equal(indices.Bytes(), indicesSpaced.Bytes()) 500 501 dictDecoder := encoding.NewDecoder(testutils.TypeToParquetType(d.typ), parquet.Encodings.Plain, d.descr, d.mem) 502 d.Equal(d.descr.PhysicalType(), dictDecoder.Type()) 503 dictDecoder.SetData(numEntries, dictBuffer.Bytes()) 504 decoder := encoding.NewDictDecoder(testutils.TypeToParquetType(d.typ), d.descr, d.mem) 505 decoder.SetDict(dictDecoder) 506 decoder.SetData(d.nvalues, indices.Bytes()) 507 508 decoded, _ := decode(decoder, d.decodeBuf) 509 d.Equal(d.nvalues, decoded) 510 d.Equal(reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), reflect.ValueOf(d.decodeBuf).Slice(0, d.nvalues).Interface()) 511 512 decoder.SetData(d.nvalues, indices.Bytes()) 513 decoded, _ = decodeSpaced(decoder, d.decodeBuf, 0, validBits, 0) 514 d.Equal(d.nvalues, decoded) 515 d.Equal(reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), reflect.ValueOf(d.decodeBuf).Slice(0, d.nvalues).Interface()) 516 } 517 518 func (d *DictionaryEncodingTestSuite) TestBasicRoundTrip() { 519 d.initData(2500, 2) 520 d.checkRoundTrip() 521 } 522 523 func TestDictEncoding(t *testing.T) { 524 tests := []struct { 525 name string 526 typ reflect.Type 527 }{ 528 {"Int32", reflect.TypeOf(int32(0))}, 529 {"Int64", reflect.TypeOf(int64(0))}, 530 {"Float32", reflect.TypeOf(float32(0))}, 531 {"Float64", reflect.TypeOf(float64(0))}, 532 {"ByteArray", reflect.TypeOf(parquet.ByteArray{})}, 533 {"FixedLenByteArray", reflect.TypeOf(parquet.FixedLenByteArray{})}, 534 } 535 536 for _, tt := range tests { 537 t.Run(tt.name, func(t *testing.T) { 538 suite.Run(t, &DictionaryEncodingTestSuite{BaseEncodingTestSuite{typ: tt.typ}}) 539 }) 540 } 541 } 542 543 func TestWriteDeltaBitPackedInt32(t *testing.T) { 544 column := schema.NewColumn(schema.NewInt32Node("int32", parquet.Repetitions.Required, -1), 0, 0) 545 546 tests := []struct { 547 name string 548 toencode []int32 549 expected []byte 550 }{ 551 {"simple 12345", []int32{1, 2, 3, 4, 5}, []byte{128, 1, 4, 5, 2, 2, 0, 0, 0, 0}}, 552 {"odd vals", []int32{7, 5, 3, 1, 2, 3, 4, 5}, []byte{128, 1, 4, 8, 14, 3, 2, 0, 0, 0, 192, 63, 0, 0, 0, 0, 0, 0}}, 553 } 554 555 for _, tt := range tests { 556 t.Run(tt.name, func(t *testing.T) { 557 enc := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) 558 559 enc.(encoding.Int32Encoder).Put(tt.toencode) 560 buf, _ := enc.FlushValues() 561 defer buf.Release() 562 563 assert.Equal(t, tt.expected, buf.Bytes()) 564 565 dec := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) 566 567 dec.(encoding.Int32Decoder).SetData(len(tt.toencode), tt.expected) 568 out := make([]int32, len(tt.toencode)) 569 dec.(encoding.Int32Decoder).Decode(out) 570 assert.Equal(t, tt.toencode, out) 571 }) 572 } 573 574 t.Run("test progressive decoding", func(t *testing.T) { 575 values := make([]int32, 1000) 576 testutils.FillRandomInt32(0, values) 577 578 enc := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) 579 enc.(encoding.Int32Encoder).Put(values) 580 buf, _ := enc.FlushValues() 581 defer buf.Release() 582 583 dec := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) 584 dec.(encoding.Int32Decoder).SetData(len(values), buf.Bytes()) 585 586 valueBuf := make([]int32, 100) 587 for i, j := 0, len(valueBuf); j <= len(values); i, j = i+len(valueBuf), j+len(valueBuf) { 588 dec.(encoding.Int32Decoder).Decode(valueBuf) 589 assert.Equalf(t, values[i:j], valueBuf, "indexes %d:%d", i, j) 590 } 591 }) 592 } 593 594 func TestWriteDeltaBitPackedInt64(t *testing.T) { 595 column := schema.NewColumn(schema.NewInt64Node("int64", parquet.Repetitions.Required, -1), 0, 0) 596 597 tests := []struct { 598 name string 599 toencode []int64 600 expected []byte 601 }{ 602 {"simple 12345", []int64{1, 2, 3, 4, 5}, []byte{128, 1, 4, 5, 2, 2, 0, 0, 0, 0}}, 603 {"odd vals", []int64{7, 5, 3, 1, 2, 3, 4, 5}, []byte{128, 1, 4, 8, 14, 3, 2, 0, 0, 0, 192, 63, 0, 0, 0, 0, 0, 0}}, 604 } 605 606 for _, tt := range tests { 607 t.Run(tt.name, func(t *testing.T) { 608 enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) 609 610 enc.(encoding.Int64Encoder).Put(tt.toencode) 611 buf, _ := enc.FlushValues() 612 defer buf.Release() 613 614 assert.Equal(t, tt.expected, buf.Bytes()) 615 616 dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) 617 618 dec.(encoding.Int64Decoder).SetData(len(tt.toencode), tt.expected) 619 out := make([]int64, len(tt.toencode)) 620 dec.(encoding.Int64Decoder).Decode(out) 621 assert.Equal(t, tt.toencode, out) 622 }) 623 } 624 625 t.Run("test progressive decoding", func(t *testing.T) { 626 values := make([]int64, 1000) 627 testutils.FillRandomInt64(0, values) 628 629 enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) 630 enc.(encoding.Int64Encoder).Put(values) 631 buf, _ := enc.FlushValues() 632 defer buf.Release() 633 634 dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) 635 dec.(encoding.Int64Decoder).SetData(len(values), buf.Bytes()) 636 637 valueBuf := make([]int64, 100) 638 for i, j := 0, len(valueBuf); j <= len(values); i, j = i+len(valueBuf), j+len(valueBuf) { 639 decoded, _ := dec.(encoding.Int64Decoder).Decode(valueBuf) 640 assert.Equal(t, len(valueBuf), decoded) 641 assert.Equalf(t, values[i:j], valueBuf, "indexes %d:%d", i, j) 642 } 643 }) 644 } 645 646 func TestDeltaLengthByteArrayEncoding(t *testing.T) { 647 column := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0) 648 649 test := []parquet.ByteArray{[]byte("Hello"), []byte("World"), []byte("Foobar"), []byte("ABCDEF")} 650 expected := []byte{128, 1, 4, 4, 10, 0, 1, 0, 0, 0, 2, 0, 0, 0, 72, 101, 108, 108, 111, 87, 111, 114, 108, 100, 70, 111, 111, 98, 97, 114, 65, 66, 67, 68, 69, 70} 651 652 enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.DeltaLengthByteArray, false, column, memory.DefaultAllocator) 653 enc.(encoding.ByteArrayEncoder).Put(test) 654 buf, _ := enc.FlushValues() 655 defer buf.Release() 656 657 assert.Equal(t, expected, buf.Bytes()) 658 659 dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.DeltaLengthByteArray, column, nil) 660 dec.SetData(len(test), expected) 661 out := make([]parquet.ByteArray, len(test)) 662 decoded, _ := dec.(encoding.ByteArrayDecoder).Decode(out) 663 assert.Equal(t, len(test), decoded) 664 assert.Equal(t, test, out) 665 } 666 667 func TestDeltaByteArrayEncoding(t *testing.T) { 668 test := []parquet.ByteArray{[]byte("Hello"), []byte("World"), []byte("Foobar"), []byte("ABCDEF")} 669 expected := []byte{128, 1, 4, 4, 0, 0, 0, 0, 0, 0, 128, 1, 4, 4, 10, 0, 1, 0, 0, 0, 2, 0, 0, 0, 72, 101, 108, 108, 111, 87, 111, 114, 108, 100, 70, 111, 111, 98, 97, 114, 65, 66, 67, 68, 69, 70} 670 671 enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.DeltaByteArray, false, nil, nil) 672 enc.(encoding.ByteArrayEncoder).Put(test) 673 buf, _ := enc.FlushValues() 674 defer buf.Release() 675 676 assert.Equal(t, expected, buf.Bytes()) 677 678 dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.DeltaByteArray, nil, nil) 679 dec.SetData(len(test), expected) 680 out := make([]parquet.ByteArray, len(test)) 681 decoded, _ := dec.(encoding.ByteArrayDecoder).Decode(out) 682 assert.Equal(t, len(test), decoded) 683 assert.Equal(t, test, out) 684 }